├── tests
    ├── extractors
    │   ├── __init__.py
    │   ├── metas.py
    │   ├── opengraph.py
    │   ├── authors.py
    │   ├── links.py
    │   ├── tweets.py
    │   ├── title.py
    │   ├── publishdate.py
    │   ├── videos.py
    │   └── tags.py
    ├── data
    │   ├── extractors
    │   │   ├── links
    │   │   │   ├── test_links.json
    │   │   │   └── test_links.html
    │   │   ├── tweets
    │   │   │   ├── test_tweet.json
    │   │   │   └── test_tweet.html
    │   │   ├── title
    │   │   │   ├── test_title_empty.json
    │   │   │   ├── test_title_opengraph.json
    │   │   │   ├── test_title_empty.html
    │   │   │   └── test_title_opengraph.html
    │   │   ├── publishdate
    │   │   │   ├── test_publish_date.json
    │   │   │   ├── test_publish_date_schema.json
    │   │   │   ├── test_publish_date_article.json
    │   │   │   ├── test_publish_date_rnews.json
    │   │   │   ├── test_publish_date.html
    │   │   │   ├── test_publish_date_rnews.html
    │   │   │   ├── test_publish_date_article.html
    │   │   │   └── test_publish_date_schema.html
    │   │   ├── images
    │   │   │   ├── test_basic_image
    │   │   │   │   ├── 50850547cc7310bc53e30e802c6318f1
    │   │   │   │   └── test_basic_image.json
    │   │   │   ├── test_known_image_empty_src
    │   │   │   │   ├── test_known_image_empty_src.json
    │   │   │   │   └── test_known_image_empty_src.html
    │   │   │   ├── test_opengraph_tag
    │   │   │   │   ├── test_opengraph_tag.json
    │   │   │   │   └── test_opengraph_tag.html
    │   │   │   ├── test_known_image_css_id
    │   │   │   │   ├── test_known_image_css_id.json
    │   │   │   │   └── test_known_image_css_id.html
    │   │   │   ├── test_known_image_css_class
    │   │   │   │   ├── test_known_image_css_class.json
    │   │   │   │   └── test_known_image_css_class.html
    │   │   │   ├── test_known_image_css_parent_id
    │   │   │   │   ├── test_known_image_css_parent_id.json
    │   │   │   │   └── test_known_image_css_parent_id.html
    │   │   │   ├── test_known_image_name_parent
    │   │   │   │   ├── test_known_image_name_parent.json
    │   │   │   │   └── test_known_image_name_parent.html
    │   │   │   └── test_known_image_css_parent_class
    │   │   │   │   ├── test_known_image_css_parent_class.json
    │   │   │   │   └── test_known_image_css_parent_class.html
    │   │   ├── content
    │   │   │   ├── test_articlebody_tag.json
    │   │   │   ├── test_articlebody_itemprop.json
    │   │   │   ├── test_articlebody_attribute.json
    │   │   │   ├── test_issue129.json
    │   │   │   ├── test_okaymarketing.json
    │   │   │   ├── test_usatoday_issue_74.json
    │   │   │   ├── test_issue115.json
    │   │   │   ├── test_mashable_issue_74.json
    │   │   │   ├── test_politico.json
    │   │   │   ├── test_issue32.json
    │   │   │   ├── test_businessinsider3.json
    │   │   │   ├── test_espn.json
    │   │   │   ├── test_elmondo1.json
    │   │   │   ├── test_liberation.json
    │   │   │   ├── test_issue28.json
    │   │   │   ├── test_techcrunch1.json
    │   │   │   ├── test_businessWeek1.json
    │   │   │   ├── test_cnn1.json
    │   │   │   ├── test_businessWeek3.json
    │   │   │   ├── test_yahoo.json
    │   │   │   ├── test_time.json
    │   │   │   ├── test_foxNews.json
    │   │   │   ├── test_businessWeek2.json
    │   │   │   ├── test_cnbc1.json
    │   │   │   ├── test_cbslocal.json
    │   │   │   ├── test_huffingtonPost2.json
    │   │   │   ├── test_allnewlyrics1.json
    │   │   │   ├── test_issue4.json
    │   │   │   ├── test_cnet.json
    │   │   │   ├── test_aolNews.json
    │   │   │   ├── test_articlebody_tag.html
    │   │   │   ├── test_issue25.json
    │   │   │   ├── test_articlebody_attribute.html
    │   │   │   ├── test_articlebody_itemprop.html
    │   │   │   ├── test_cnn_arabic.json
    │   │   │   ├── test_time2.json
    │   │   │   ├── test_marketplace.json
    │   │   │   ├── test_engadget.json
    │   │   │   ├── test_get_canonical_url.json
    │   │   │   ├── test_lefigaro.json
    │   │   │   ├── test_bbc_chinese.json
    │   │   │   ├── test_testHuffingtonPost.json
    │   │   │   ├── test_msn1.json
    │   │   │   ├── test_donga_korean.json
    │   │   │   ├── test_issue24.html
    │   │   │   ├── test_issue24.json
    │   │   │   └── test_elpais.json
    │   │   ├── authors
    │   │   │   ├── test_author_schema.json
    │   │   │   └── test_author_schema.html
    │   │   ├── tags
    │   │   │   ├── test_tags_abcau.json
    │   │   │   ├── test_tags_kexp.json
    │   │   │   ├── test_tags_deadline.json
    │   │   │   ├── test_tags_wnyc.json
    │   │   │   └── test_tags_cnet.json
    │   │   ├── opengraph
    │   │   │   ├── test_opengraph.json
    │   │   │   └── test_opengraph.html
    │   │   └── videos
    │   │   │   ├── test_iframe.json
    │   │   │   ├── test_embed.json
    │   │   │   ├── test_object.json
    │   │   │   ├── test_iframe.html
    │   │   │   ├── test_embed.html
    │   │   │   └── test_object.html
    │   └── parser
    │   │   └── test1.html
    ├── __init__.py
    ├── article.py
    └── configuration.py
├── requirements.txt
├── MANIFEST.in
├── THANKS
├── .gitignore
├── .travis.yml
├── goose
    ├── resources
    │   ├── images
    │   │   └── known-image-css.txt
    │   └── text
    │   │   ├── stopwords-nl.txt
    │   │   ├── stopwords-ko.txt
    │   │   ├── stopwords-fi.txt
    │   │   ├── stopwords-da.txt
    │   │   ├── stopwords-no.txt
    │   │   ├── stopwords-nb.txt
    │   │   ├── stopwords-zh.txt
    │   │   ├── stopwords-pt.txt
    │   │   ├── stopwords-ar.txt
    │   │   ├── stopwords-pl.txt
    │   │   ├── stopwords-fr.txt
    │   │   ├── stopwords-es.txt
    │   │   ├── stopwords-hu.txt
    │   │   ├── stopwords-it.txt
    │   │   └── stopwords-ru.txt
    ├── version.py
    ├── extractors
    │   ├── __init__.py
    │   ├── links.py
    │   ├── opengraph.py
    │   ├── tweets.py
    │   ├── authors.py
    │   ├── tags.py
    │   ├── publishdate.py
    │   ├── title.py
    │   ├── metas.py
    │   └── videos.py
    ├── video.py
    ├── network.py
    ├── image.py
    ├── __init__.py
    ├── configuration.py
    └── utils
    │   ├── __init__.py
    │   └── images.py
└── setup.py


/tests/extractors/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Pillow
2 | lxml
3 | cssselect
4 | jieba
5 | beautifulsoup
6 | nltk
7 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include goose/resources/images *
2 | recursive-include goose/resources/text *


--------------------------------------------------------------------------------
/tests/data/extractors/links/test_links.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://exemple.com/links/", 
3 |     "expected": {
4 |         "links": 2
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/data/extractors/tweets/test_tweet.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://exemple.com/tweet/", 
3 |     "expected": {
4 |         "tweets": 2
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/THANKS:
--------------------------------------------------------------------------------
1 | Thanks to all who have contributed to python-goose.
2 | You can find the contributors list here:
3 | https://github.com/grangier/python-goose/graphs/contributors


--------------------------------------------------------------------------------
/tests/data/extractors/title/test_title_empty.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://exemple.com/test_title_empty.html",
3 |     "expected": {
4 |         "title": ""
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | goose.egg-info/
 3 | build/
 4 | dist/
 5 | .DS_Store*
 6 | ._.DS_Store*
 7 | env/
 8 | *~
 9 | .idea
10 | ._*
11 | *.egg
12 | venv/
13 | goose_extractor.egg-info/
14 | 


--------------------------------------------------------------------------------
/tests/data/extractors/title/test_title_opengraph.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://exemple.com/test_opengraphcontent",
3 |     "expected": {
4 |         "title": "Good article title"
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/data/extractors/publishdate/test_publish_date.json:
--------------------------------------------------------------------------------
1 | {
2 |   "url": "http://example.com/example",
3 |     "expected": {
4 |         "publish_date": "2014-06-30T16:54:02+00:00"
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/data/extractors/publishdate/test_publish_date_schema.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://example.com/example",
3 |     "expected": {
4 |         "publish_date": "2014-10-09T12:06:16"
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/data/extractors/publishdate/test_publish_date_article.json:
--------------------------------------------------------------------------------
1 | {
2 |   "url": "http://example.com/example",
3 |     "expected": {
4 |         "publish_date": "2012-01-11T15:55:01+00:00"
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/data/extractors/publishdate/test_publish_date_rnews.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://example.com/example",
3 |     "expected": {
4 |         "publish_date": "2010-02-22T11:53:04+00:00"
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grangier/python-goose/HEAD/tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1


--------------------------------------------------------------------------------
/tests/data/extractors/publishdate/test_publish_date.html:
--------------------------------------------------------------------------------
1 | <html>
2 |   <head>
3 |     <meta name='OriginalPublicationDate' content='2014-06-30T16:54:02+00:00'>
4 |   </head>
5 |   <body>
6 |   </body>
7 | </html>
8 | 


--------------------------------------------------------------------------------
/tests/data/extractors/publishdate/test_publish_date_rnews.html:
--------------------------------------------------------------------------------
1 | <html>
2 |   <head>
3 |     <meta property='rnews:datePublished' content='2010-02-22T11:53:04+00:00'>
4 |   </head>
5 |   <body>
6 |   </body>
7 | </html>
8 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_articlebody_tag.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://exemple.com/test_opengraphcontent",
3 |     "expected": {
4 |         "cleaned_text": "Search-and-rescue teams were mobilized "
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/data/extractors/publishdate/test_publish_date_article.html:
--------------------------------------------------------------------------------
1 | <html>
2 |   <head>
3 |     <meta property='article:published_time' content='2012-01-11T15:55:01+00:00'>
4 |   </head>
5 |   <body>
6 |   </body>
7 | </html>
8 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_articlebody_itemprop.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://exemple.com/test_opengraphcontent",
3 |     "expected": {
4 |         "cleaned_text": "Search-and-rescue teams were mobilized "
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | python:
 4 |     - 2.6
 5 |     - 2.7
 6 | 
 7 | install:
 8 |     - pip install -r requirements.txt --use-mirrors
 9 |     - python setup.py install
10 | 
11 | script: python setup.py test
12 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_articlebody_attribute.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://exemple.com/test_opengraphcontent",
3 |     "expected": {
4 |         "cleaned_text": "Search-and-rescue teams were mobilized "
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_issue129.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://lostinjit.blogspot.fr/2011/10/pypy-and-road-towards-scipy.html", 
3 |     "expected": {
4 |         "cleaned_text": "Recent PyPys effort to bring NumPy and the associated fundraiser"
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/goose/resources/images/known-image-css.txt:
--------------------------------------------------------------------------------
1 | latimes.com^thumbnail
2 | cnn.com^storytext|cnn_strycntntlft
3 | foxnews.com^entry-content
4 | msn.com^articleText
5 | go.com^mediaimage
6 | lefigaro.fr^photo center
7 | cadres.apec.fr^noFieldsTable
8 | emploi.lesechos.fr^offerHeader
9 | linkfinance.fr^offerHeader


--------------------------------------------------------------------------------
/tests/data/extractors/authors/test_author_schema.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://exemple.com/tweet/", 
 3 |     "expected": {
 4 |         "authors": [
 5 |             "KEVIN SACK", 
 6 |             "ADAM NOSSITER", 
 7 |             "PAM BELLUCK", 
 8 |             "SHERI FINK"
 9 |         ]
10 |     }
11 | }
12 | 


--------------------------------------------------------------------------------
/tests/data/extractors/tags/test_tags_abcau.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.abc.net.au/news/2013-04-22/swimming-greats-say-cuts-a-shame/4644544", 
 3 |     "expected": {
 4 |         "tags": [
 5 |             "olympics-summer", 
 6 |             "australia", 
 7 |             "swimming"
 8 |         ]
 9 |     }
10 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_okaymarketing.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://www.businessinsider.com/larry-page-the-untold-story-2014-4",
3 |     "expected": {
4 |         "cleaned_text": "If you are operating a local business there is something you can do right now to gain an advantage over your competition."
5 |     }
6 | }


--------------------------------------------------------------------------------
/tests/data/extractors/tags/test_tags_kexp.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://blogs.kusp.org/filmgang/2013/02/08/stand-up-guys/", 
 3 |     "expected": {
 4 |         "tags": [
 5 |             "kusp film review", 
 6 |             "Stand Up Guys", 
 7 |             "film", 
 8 |             "Dennis Morton"
 9 |         ]
10 |     }
11 | }


--------------------------------------------------------------------------------
/tests/data/extractors/tags/test_tags_deadline.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.deadline.com/2013/06/deadline-big-media-with-david-lieberman-episode-38/", 
 3 |     "expected": {
 4 |         "tags": [
 5 |             "Deadline Big Media", 
 6 |             "TiVo", 
 7 |             "Amazon Prime", 
 8 |             "Steve Ballmer"
 9 |         ]
10 |     }
11 | }


--------------------------------------------------------------------------------
/tests/data/extractors/tags/test_tags_wnyc.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.wnyc.org/shows/heresthething/2013/may/27/", 
 3 |     "expected": {
 4 |         "tags": [
 5 |             "Life", 
 6 |             "alec baldwin", 
 7 |             "other desert cities", 
 8 |             "News", 
 9 |             "Music", 
10 |             "stacy keach"
11 |         ]
12 |     }
13 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_usatoday_issue_74.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://www.usatoday.com/story/tech/columnist/talkingtech/2014/01/25/namm-2014---ik-multimedias-rings-to-make-music/4863193/", 
3 |     "expected": {
4 |         "cleaned_text": "ANAHEIM, Calif. — Musicians often show off lots of ring bling —but rarely have rings been thought of for making music — until now."
5 |     }
6 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_issue115.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://blogs.wsj.com/accelerators/2014/06/03/jessica-livingston-why-startups-need-to-focus-on-sales-not-marketing/",
3 |     "expected": {
4 |         "cleaned_text": "JESSICA LIVINGSTON: The most important thing an early-stage startup should know about marketing is rather counterintuitive: that you probably shouldn’t be doing anything you’d use the term"
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-nl.txt:
--------------------------------------------------------------------------------
 1 | aan
 2 | af
 3 | al
 4 | als
 5 | bij
 6 | dan
 7 | dat
 8 | die
 9 | dit
10 | een
11 | en
12 | er
13 | had
14 | heb
15 | hem
16 | het
17 | hij
18 | hoe
19 | hun
20 | ik
21 | in
22 | is
23 | je
24 | kan
25 | me
26 | men
27 | met
28 | mij
29 | nog
30 | nu
31 | of
32 | ons
33 | ook
34 | te
35 | tot
36 | uit
37 | van
38 | was
39 | wat
40 | we
41 | wel
42 | wij
43 | zal
44 | ze
45 | zei
46 | zij
47 | zo
48 | zou
49 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_mashable_issue_74.json:
--------------------------------------------------------------------------------
1 | {
2 |     "url": "http://mashable.com/2014/01/26/square-cofounder-jim-mckelvey/", 
3 |     "expected": {
4 |         "cleaned_text": "Some 2,000 miles away from Square's massive new headquarters in San Francisco, Jim McKelvey is standing in work boots and a thick dock coat trying to revive a city.\n\nMcKelvey founded the mobile payments company in 2009 with Jack Dorsey, who had previously helped launch Twitter."
5 |     }
6 | }


--------------------------------------------------------------------------------
/tests/data/extractors/opengraph/test_opengraph.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://exemple.com/test_opengraphcontent",
 3 |     "expected": {
 4 |         "opengraph": {
 5 |             "url": "http://www.somenews.com/2012/09/19/nyregion/some-news-article.html?pagewanted=all", 
 6 |             "image": "http://graphics8.somenews.com/images/2012/09/19/region/some-news-image.jpg", 
 7 |             "type": "article", 
 8 |             "description": "Some News Happened in New York", 
 9 |             "title": "Some News Article Story"
10 |         }
11 |     }
12 | }
13 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-ko.txt:
--------------------------------------------------------------------------------
 1 | 을
 2 | 의
 3 | 에
 4 | 이
 5 | 를
 6 | 으로
 7 | 은
 8 | 는
 9 | 가
10 | 로
11 | 하고
12 | 과
13 | 에서
14 | 도
15 | 와
16 | 이다
17 | 고
18 | 부터
19 | 까지
20 | 께
21 | 에는
22 | 이라고
23 | 만
24 | 라고
25 | 보다
26 | 에도
27 | 다
28 | 토록
29 | 에게
30 | 나
31 | 대로
32 | 에서는
33 | 이나
34 | 이며
35 | 요
36 | 든
37 | 으로써
38 | 같이
39 | 로는
40 | 밖에
41 | 과의
42 | 며
43 | 로부터
44 | 처럼
45 | 아
46 | 라
47 | 여
48 | 으로는
49 | 이고
50 | 에서의
51 | 이라는
52 | 만에
53 | 으로부터
54 | 에서도
55 | 와의
56 | 엔
57 | 만을
58 | 부터는
59 | 만의
60 | 야
61 | 까지의
62 | 과는
63 | 치고
64 | 과를
65 | 으로의
66 | 까지는
67 | 보다는
68 | 만이
69 | 에만
70 | 로의


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_basic_image/test_basic_image.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://blogs.kusp.org/filmgang/2013/02/08/stand-up-guys/", 
 3 |     "expected": {
 4 |         "top_image": {
 5 |             "extraction_type": "bigimage", 
 6 |             "src": "http://md0.libe.com/photo/465395/?modified_at=1351411813&ratio_x=03&ratio_y=02&width=476", 
 7 |             "confidence_score": 100, 
 8 |             "bytes": 0, 
 9 |             "height": 317, 
10 |             "width": 476, 
11 |             "top_image_node": null
12 |         }
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://go.com/bla/bla",
 3 |     "expected": {
 4 |         "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u",
 5 |         "top_image": {
 6 |             "extraction_type": "NA",
 7 |             "src": "",
 8 |             "confidence_score": 0.0,
 9 |             "bytes": 0,
10 |             "height": 0,
11 |             "width": 0,
12 |             "top_image_node": null
13 |         }
14 |     }
15 | }


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://go.com/bla/bla", 
 3 |     "expected": {
 4 |         "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u",
 5 |         "top_image": {
 6 |             "extraction_type": "opengraph", 
 7 |             "src": "http://go.com/images/465395/", 
 8 |             "confidence_score": 100, 
 9 |             "bytes": 0, 
10 |             "height": 0, 
11 |             "width": 0, 
12 |             "top_image_node": null
13 |         }
14 |     }
15 | }


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://go.com/bla/bla", 
 3 |     "expected": {
 4 |         "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u",
 5 |         "top_image": {
 6 |             "extraction_type": "known", 
 7 |             "src": "http://go.com/images/465395/", 
 8 |             "confidence_score": 90, 
 9 |             "bytes": 0, 
10 |             "height": 0, 
11 |             "width": 0, 
12 |             "top_image_node": null
13 |         }
14 |     }
15 | }


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://go.com/bla/bla", 
 3 |     "expected": {
 4 |         "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u",
 5 |         "top_image": {
 6 |             "extraction_type": "known", 
 7 |             "src": "http://go.com/images/465395/", 
 8 |             "confidence_score": 90, 
 9 |             "bytes": 0, 
10 |             "height": 0, 
11 |             "width": 0, 
12 |             "top_image_node": null
13 |         }
14 |     }
15 | }


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://go.com/bla/bla", 
 3 |     "expected": {
 4 |         "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u",
 5 |         "top_image": {
 6 |             "extraction_type": "known", 
 7 |             "src": "http://go.com/images/465395/", 
 8 |             "confidence_score": 90, 
 9 |             "bytes": 0, 
10 |             "height": 0, 
11 |             "width": 0, 
12 |             "top_image_node": null
13 |         }
14 |     }
15 | }


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://go.com/bla/bla", 
 3 |     "expected": {
 4 |         "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u",
 5 |         "top_image": {
 6 |             "extraction_type": "known", 
 7 |             "src": "http://go.com/images/465395/", 
 8 |             "confidence_score": 90, 
 9 |             "bytes": 0, 
10 |             "height": 0, 
11 |             "width": 0, 
12 |             "top_image_node": null
13 |         }
14 |     }
15 | }


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://go.com/bla/bla", 
 3 |     "expected": {
 4 |         "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u",
 5 |         "top_image": {
 6 |             "extraction_type": "known", 
 7 |             "src": "http://go.com/images/465395/", 
 8 |             "confidence_score": 90, 
 9 |             "bytes": 0, 
10 |             "height": 0, 
11 |             "width": 0, 
12 |             "top_image_node": null
13 |         }
14 |     }
15 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_politico.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.politico.com/news/stories/1010/43352.html", 
 3 |     "expected": {
 4 |         "meta_description": "Demographic changes are likely to alter the route Obama took to victory in 2008.", 
 5 |         "domain": "www.politico.com", 
 6 |         "final_url": "http://www.politico.com/news/stories/1010/43352.html", 
 7 |         "meta_keywords": "2012, Maggie Haberman and Shira Toeplitz", 
 8 |         "cleaned_text": "If the newest Census Bureau estimates stay close to form", 
 9 |         "meta_favicon": "http://www.politico.com/favicon.ico", 
10 |         "meta_lang": "en"
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/tags/test_tags_cnet.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.cnet.com/8301-13952_1-57596170-81/the-404-1310-where-its-love-at-first-swipe-podcast/", 
 3 |     "expected": {
 4 |         "tags": [
 5 |             "purgatory", 
 6 |             "USDATE", 
 7 |             "Pope", 
 8 |             "online dating", 
 9 |             "leftovers", 
10 |             "app", 
11 |             "Yahoo", 
12 |             "OKCupid", 
13 |             "romance", 
14 |             "Pontifex", 
15 |             "Tinder", 
16 |             "Leftover Swap", 
17 |             "Match.com", 
18 |             "Twitter", 
19 |             "Marc Maron"
20 |         ]
21 |     }
22 | }


--------------------------------------------------------------------------------
/tests/data/parser/test1.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |   <body>
 3 |     <div>
 4 |       <p>
 5 |         xxxx <a>aaaaa</a> xxxxxxxx
 6 |         <span> span span </span>
 7 |       </p>
 8 |     </div>
 9 |     <div>
10 |       <p>
11 |         xxxx <a>aaaaa</a> xxxxxxxx
12 |         <span> span span </span>
13 |       </p>
14 |     </div>
15 |     <div class="foo">
16 |       <p>
17 |         xxxx <a>aaaaa</a> xxxxxxxx
18 |         <span> span span </span>
19 |       </p>
20 |     </div>
21 |     <div>
22 |       <p>
23 |         xxxx <a>aaaaa</a> xxxxxxxx
24 |         <span> span span </span>
25 |       </p>
26 |       <p class="foo bar">test</p>
27 |     </div>
28 |   </body>
29 | </html>
30 |     


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_issue32.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.tulsaworld.com/site/articlepath.aspx?articleid=20111118_61_A16_Opposi344152&rss_lnk=7", 
 3 |     "expected": {
 4 |         "meta_description": "", 
 5 |         "domain": "www.tulsaworld.com", 
 6 |         "final_url": "http://www.tulsaworld.com/site/articlepath.aspx?articleid=20111118_61_A16_Opposi344152&rss_lnk=7", 
 7 |         "meta_keywords": "COURT RULE INFORMATION RECORDS DISTRICT OKLAHOMA PERSONAL PROPOSAL PROPOSED REASONS", 
 8 |         "cleaned_text": "Opposition to a proposal to remove certain personal data", 
 9 |         "meta_favicon": "/favicon.ico", 
10 |         "meta_lang": null
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_businessinsider3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.businessinsider.com/larry-page-the-untold-story-2014-4",
 3 |     "expected": {
 4 |         "meta_description": "One day in July 2001, Larry Page decided to...",
 5 |         "domain": "www.businessinsider.com", 
 6 |         "final_url": "http://www.businessinsider.com/larry-page-the-untold-story-2014-4",
 7 |         "meta_keywords": "Google, Larry Page, Longform, Nicholas Carlson,",
 8 |         "cleaned_text": "One day in July 2001, Larry Page decided to fire Google",
 9 |         "meta_favicon": "http://static5.businessinsider.com/assets/images/faviconBI.ico",
10 |         "meta_lang": "en"
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_espn.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://sports.espn.go.com/espn/commentary/news/story?id=5461430", 
 3 |     "expected": {
 4 |         "meta_description": "Are Florida coach Urban Meyer and Alabama coach Nick Saban closing practice because of agents or because they like to control every aspect of their programs?", 
 5 |         "domain": "sports.espn.go.com", 
 6 |         "final_url": "http://sports.espn.go.com/espn/commentary/news/story?id=5461430", 
 7 |         "meta_keywords": "", 
 8 |         "cleaned_text": "If you believe what college football coaches have said about sports", 
 9 |         "meta_favicon": "", 
10 |         "meta_lang": null
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_elmondo1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.elmundo.es/elmundo/2012/10/28/espana/1351388909.html",
 3 |     "target_language": "es",
 4 |     "expected": {
 5 |         "meta_description": "Detenida en Francia Izaskun Lesaka Izaskun Lesaka pas\u00f3 a la c\u00fapula tras la detenci\u00f3n de Ata. Su pareja y lugarteniente, Joseba Iturbe, tambi\u00e9n ha sido detenido.", 
 6 |         "domain": "www.elmundo.es", 
 7 |         "final_url": "http://www.elmundo.es/elmundo/2012/10/28/espana/1351388909.html", 
 8 |         "meta_keywords": "Detenida, Francia, Izaskun, Lesaka, Espa\u00f1a", 
 9 |         "cleaned_text": "Importante golpe a la banda terrorista ETA en Francia.", 
10 |         "meta_favicon": "", 
11 |         "meta_lang": null
12 |     }
13 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_liberation.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.liberation.fr/politiques/2012/10/27/ayrault-assume-et-revendique-sa-methode_856451", 
 3 |     "expected": {
 4 |         "meta_description": "Apr\u00e8s une semaine agit\u00e9e, le Premier ministre s'est offert un succ\u00e8s d'estrade \u00e0 bon compte lors du congr\u00e8s du Parti socialiste \u00e0 Toulouse.", 
 5 |         "domain": "www.liberation.fr", 
 6 |         "final_url": "http://www.liberation.fr/politiques/2012/10/27/ayrault-assume-et-revendique-sa-methode_856451", 
 7 |         "meta_keywords": "actualit\u00e9s, news", 
 8 |         "cleaned_text": "A Toulouse, Jean-Marc Ayrault aura fait deux rappels sur", 
 9 |         "meta_favicon": "", 
10 |         "meta_lang": "fr"
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/title/test_title_empty.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <head>
 3 |         <title></title>
 4 |     </head>
 5 |     <body>
 6 |         <div>
 7 |             <p>
 8 |               TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
 9 |             </p>
10 |         </div>
11 |     </body>
12 | </html>
13 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-fi.txt:
--------------------------------------------------------------------------------
 1 | alla
 2 | ansiosta
 3 | ehkä
 4 | ei
 5 | enemmän
 6 | ennen
 7 | etessa
 8 | f
 9 | haikki
10 | he
11 | hitaasti
12 | hoikein
13 | hyvin
14 | hän
15 | ilman
16 | ja
17 | jos
18 | jälkeen
19 | kanssa
20 | kaukana
21 | kenties
22 | keskellä
23 | kesken
24 | koskaan
25 | kuinkan
26 | kukka
27 | kylliksi
28 | kyllä
29 | liian
30 | lla
31 | lla
32 | luona
33 | lähellä
34 | läpi
35 | me
36 | miksi
37 | mikä
38 | milloin
39 | milloinkan
40 | minä
41 | missä
42 | miten
43 | nopeasti
44 | nyt
45 | oikea
46 | oikealla
47 | paljon
48 | siellä
49 | sinä
50 | ssa
51 | sta
52 | suoraan
53 | tai
54 | takana
55 | takia
56 | tarpeeksi
57 | te
58 | tässä
59 | ulkopuolella
60 | vahemmän
61 | vasen
62 | vasenmalla
63 | vastan
64 | vielä
65 | vieressä
66 | vähän
67 | yhdessä
68 | ylös
69 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_issue28.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html", 
 3 |     "expected": {
 4 |         "meta_description": "A 'world's hottest chilli' competition at a curry restaurant left two people\n  in hospital.", 
 5 |         "domain": "www.telegraph.co.uk", 
 6 |         "final_url": "http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html", 
 7 |         "meta_keywords": "Curry-competition, Food and Drink News,Food and Drink", 
 8 |         "cleaned_text": "Emergency services were called to Kismot Restaurant's curry-eating challenge,", 
 9 |         "meta_favicon": "", 
10 |         "meta_lang": "en"
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_techcrunch1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://techcrunch.com/2011/08/13/2005-zuckerberg-didnt-want-to-take-over-the-world/", 
 3 |     "expected": {
 4 |         "meta_description": "", 
 5 |         "domain": "techcrunch.com", 
 6 |         "final_url": "http://techcrunch.com/2011/08/13/2005-zuckerberg-didnt-want-to-take-over-the-world/", 
 7 |         "meta_keywords": "", 
 8 |         "cleaned_text": "The Huffington Post has come across this fascinating five-minute interview", 
 9 |         "tags": [
10 |             "facebook"
11 |         ], 
12 |         "title": "2005 Zuckerberg Didn\u2019t Want To Take Over The World", 
13 |         "meta_favicon": "http://s2.wp.com/wp-content/themes/vip/tctechcrunch2/images/favicon.ico?m=1310283187g", 
14 |         "meta_lang": "en"
15 |     }
16 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_businessWeek1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.businessweek.com/magazine/content/10_34/b4192066630779.htm", 
 3 |     "expected": {
 4 |         "meta_description": "The Web and cable star has achieved fame by targeting nerdy guys, who she says \"control popularity\".", 
 5 |         "domain": "www.businessweek.com", 
 6 |         "final_url": "http://www.businessweek.com/magazine/content/10_34/b4192066630779.htm", 
 7 |         "meta_keywords": "Olivia Munn, Attack of the Show, Jon Stewart, Daily Show, G4", 
 8 |         "cleaned_text": "Six years ago, Olivia Munn arrived in Hollywood with fading ambitions of making it as a sports reporter and set about deploying", 
 9 |         "title": "Olivia Munn: Queen of the Uncool", 
10 |         "meta_favicon": "", 
11 |         "meta_lang": "en"
12 |     }
13 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_cnn1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.cnn.com/2010/POLITICS/08/13/democrats.social.security/index.html", 
 3 |     "expected": {
 4 |         "meta_description": "Democrats pledged Friday to not only keep Social Security in place, but use the historic program against Republicans ahead of the midterm election.", 
 5 |         "domain": "www.cnn.com", 
 6 |         "final_url": "http://www.cnn.com/2010/POLITICS/08/13/democrats.social.security/index.html", 
 7 |         "meta_keywords": "", 
 8 |         "cleaned_text": "Washington (CNN) -- Democrats pledged ", 
 9 |         "title": "Democrats to use Social Security against GOP this fall - CNN.com", 
10 |         "meta_favicon": "http://i.cdn.turner.com/cnn/.element/img/3.0/global/misc/apple-touch-icon.png", 
11 |         "meta_lang": "en"
12 |     }
13 | }
14 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_businessWeek3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.businessweek.com/technology/here-comes-apples-real-tv-09132011.html", 
 3 |     "expected": {
 4 |         "meta_description": "A bold, new Apple TV set would replace today\u2019s cable systems, game consoles, and 3D goggles\u2014and launch a war with cable providers", 
 5 |         "domain": "www.businessweek.com", 
 6 |         "final_url": "http://www.businessweek.com/technology/here-comes-apples-real-tv-09132011.html", 
 7 |         "meta_keywords": "Apple, Apple CEO, Google, Television, Cable & Wireless, Netflix, Steve Jobs, Comcast, cable, cable TV, hulu, Roku", 
 8 |         "cleaned_text": "Get ready, America, because by Christmas 2012 you will have an Apple TV in your living room", 
 9 |         "meta_favicon": "", 
10 |         "meta_lang": null
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_yahoo.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://news.yahoo.com/apple-says-steve-jobs-resigning-ceo-224628633.html", 
 3 |     "expected": {
 4 |         "meta_description": "Read 'Apple says Steve Jobs resigning as CEO' on Yahoo! News. Steve Jobs, the mind behind the iPhone, iPad and other devices that turned Apple Inc. into one of the world's most powerful companies, resigned as CEO on Wednesday, saying he can no longer handle the job but will continue to play a leadership role.", 
 5 |         "domain": "news.yahoo.com", 
 6 |         "final_url": "http://news.yahoo.com/apple-says-steve-jobs-resigning-ceo-224628633.html", 
 7 |         "meta_keywords": "", 
 8 |         "cleaned_text": "SAN FRANCISCO (AP) \u2014 Steve Jobs, the mind behind the iPhone", 
 9 |         "meta_favicon": "", 
10 |         "meta_lang": "en"
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_time.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.time.com/time/health/article/0,8599,2011497,00.html", 
 3 |     "expected": {
 4 |         "meta_description": "Researchers at the University of Georgia  believe that much of the oil from the BP spill is still present underwater in the Gulf of Mexico, where its impact on aquatic life is far from clear", 
 5 |         "domain": "www.time.com", 
 6 |         "final_url": "http://www.time.com/time/health/article/0,8599,2011497,00.html", 
 7 |         "meta_keywords": "bp, oil, spill, gulf, mexico, invisible, dispersed, deepwater horizon, Charles Hopkinson", 
 8 |         "cleaned_text": "This month, the federal government released", 
 9 |         "title": "Oil from Spill Could Still Pose Major Threat", 
10 |         "meta_favicon": "http://img.timeinc.net/time/favicon.ico", 
11 |         "meta_lang": null
12 |     }
13 | }
14 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_foxNews.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.foxnews.com/politics/2010/08/14/russias-nuclear-help-iran-stirs-questions-improved-relations/", 
 3 |     "expected": {
 4 |         "meta_description": "Russia's announcement that it will help Iran get nuclear fuel is raising questions about the better-than- ever relationship between Russia and the U.S. , according to President Obama, after the two former Cold War adversaries recently signed a nuclear reduction treaty.", 
 5 |         "domain": "www.foxnews.com", 
 6 |         "final_url": "http://www.foxnews.com/politics/2010/08/14/russias-nuclear-help-iran-stirs-questions-improved-relations/", 
 7 |         "meta_keywords": "", 
 8 |         "cleaned_text": "Russia's announcement that it will help Iran get nuclear fuel is raising questions", 
 9 |         "meta_favicon": "", 
10 |         "meta_lang": "en"
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_businessWeek2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.businessweek.com/management/five-social-media-lessons-for-business-09202011.html", 
 3 |     "expected": {
 4 |         "meta_description": "A Home Depot executive discusses the retailer's strategy for engaging consumers via Facebook, Twitter, and blogs, relying on store associates for much of the social interaction", 
 5 |         "domain": "www.businessweek.com", 
 6 |         "final_url": "http://www.businessweek.com/management/five-social-media-lessons-for-business-09202011.html", 
 7 |         "meta_keywords": "Facebook, Twitter, social media, Home Depot, retailers, social media lessons", 
 8 |         "cleaned_text": "At Home Depot, we first realized we needed to have a real conversation with", 
 9 |         "title": "Five Social Media Lessons for Business", 
10 |         "meta_favicon": "", 
11 |         "meta_lang": null
12 |     }
13 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_cnbc1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.cnbc.com/id/44613978", 
 3 |     "expected": {
 4 |         "meta_description": "The Fed launched much anticipated \"operation twist\" with a twist\u2014it is also taking direct aim at mortgages. The Fed also warned of significant downside economic risks. Check out how it changed its statement.", 
 5 |         "domain": "www.cnbc.com", 
 6 |         "final_url": "http://www.cnbc.com/id/44613978", 
 7 |         "meta_keywords": "Bonds,Economy,Interest Rates,Economy (Global),Banking,Economic Measures,Debt,Central Banks,Ben Bernanke,Federal Reserve,Currencies,Investment Strategy,Top Blogs", 
 8 |         "cleaned_text": "Some traders found Wednesday's Fed statement to be a bit gloomier than expected.", 
 9 |         "meta_favicon": "http://media.cnbc.com/i/CNBC/CNBC_Images/mobile_images/cnbc_iphone_icon.png", 
10 |         "meta_lang": null
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_cbslocal.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://newyork.cbslocal.com/2012/06/08/bc-morning-show-american-hero-kelly-malloy/", 
 3 |     "expected": {
 4 |         "meta_description": "Boomer &amp; Craig were thrilled to welcome an American Hero into the Allstate Studio, as Kelly Malloy stopped-by and was given the royal treatment she deserved...", 
 5 |         "domain": "newyork.cbslocal.com", 
 6 |         "final_url": "http://newyork.cbslocal.com/2012/06/08/bc-morning-show-american-hero-kelly-malloy/", 
 7 |         "meta_keywords": "vibNews", 
 8 |         "cleaned_text": "Boomer & Craig were thrilled to welcome an American Hero into the Allstate Studio, as Kelly", 
 9 |         "tags": [
10 |             "Boomer & Carton", 
11 |             "Kelly Malloy", 
12 |             "Bobby Dwyer"
13 |         ], 
14 |         "meta_favicon": "http://s2.wp.com/i/favicon.ico?m=1311976027g", 
15 |         "meta_lang": "en"
16 |     }
17 | }


--------------------------------------------------------------------------------
/tests/data/extractors/title/test_title_opengraph.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <head>
 3 |       <meta property="og:site_name" content="TechCrunch"/>
 4 |       <meta property="og:title" content="Good article title | TechCrunch"/>
 5 |       <title>Wrong article title - website</title>
 6 |     </head>
 7 |     <body>
 8 |         <div>
 9 |             <p>
10 |               TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
11 |             </p>
12 |         </div>
13 |     </body>
14 | </html>
15 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_huffingtonPost2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.huffingtonpost.com/2011/10/06/alabama-workers-immigration-law_n_997793.html", 
 3 |     "expected": {
 4 |         "meta_description": "MONTGOMERY, Ala. -- Alabama's strict new immigration law may be backfiring. Intended to force illegal workers out of jobs, it is also driving away many construction workers, roofers and field hands in the country legally who do backbreaking jobs that Americans generally won't.", 
 5 |         "domain": "www.huffingtonpost.com", 
 6 |         "final_url": "http://www.huffingtonpost.com/2011/10/06/alabama-workers-immigration-law_n_997793.html", 
 7 |         "meta_keywords": "alabama, workers, leave, state, as, immigration, law, takes, effect, business", 
 8 |         "cleaned_text": "MONTGOMERY, Ala. -- Alabama's strict new immigration law may be backfiring.", 
 9 |         "meta_favicon": "/favicon.ico", 
10 |         "meta_lang": "en"
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_allnewlyrics1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://allnewlyrics.com/only-one-lyrics-pj-morton-ft-stevie-wonder.html", 
 3 |     "expected": {
 4 |         "meta_description": "All about hot new song with lyrics at AllNewLyrics.Com", 
 5 |         "domain": "allnewlyrics.com", 
 6 |         "final_url": "http://allnewlyrics.com/only-one-lyrics-pj-morton-ft-stevie-wonder.html", 
 7 |         "meta_keywords": "Music, Songs, Lyrics, Letras, Lirik, Tekst, Text, Testo, Paroles, Popular, New", 
 8 |         "cleaned_text": "PJ Morton \u2013 Only One Lyrics (Ft. Stevie Wonder)\n\nI\u2019m pretty sure I don\u2019t need anything else\n\n This is the best feeling I\u2019ve ever felt", 
 9 |         "tags": [
10 |             "PJ Morton", 
11 |             "Stevie Wonder"
12 |         ], 
13 |         "title": "\u201cOnly One\u201d Lyrics : PJ Morton (Ft. Stevie Wonder)", 
14 |         "meta_favicon": "", 
15 |         "meta_lang": "en"
16 |     }
17 | }
18 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_issue4.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.slate.fr/story/64063/tapie-mougeotte-la-provence", 
 3 |     "target_language": "fr",
 4 |     "expected": {
 5 |         "meta_description": "L'ex-pr\u00e9sident de l'OM et l'ancien PDG de TF1 s'int\u00e9resseraient au rachat du quotidien r\u00e9gional. Nous vous proposons, comme au lyc\u00e9e, un exercice pour en d\u00e9gager la signification.", 
 6 |         "domain": "www.slate.fr", 
 7 |         "final_url": "http://www.slate.fr/story/64063/tapie-mougeotte-la-provence", 
 8 |         "meta_keywords": "FRANCE,Bernard Tapie,Etienne Mougeotte,presse,Qatar,Cr\u00e9dit Lyonnaus,La Provence,aides de l'Etat,politique,marseille,m\u00e9dias fran\u00e7ais,", 
 9 |         "cleaned_text": "Exercice: apr\u00e8s avoir attentivement lu cette br\u00e8ve parue dans L'Express, vous expliquerez en quoi elle r\u00e9sume une certaine id\u00e9e de la France.\n\n\u00abBernar", 
10 |         "meta_favicon": "", 
11 |         "meta_lang": "fr"
12 |     }
13 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_cnet.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://news.cnet.com/8301-30686_3-20014053-266.html?tag=topStories1", 
 3 |     "expected": {
 4 |         "meta_description": "The phone company is adding bells and whistles to its Fios TV service, including an app that turns an iPad into a TV and several other new options for taking your video content on the go. Read this blog post by Marguerite Reardon on Signal Strength.", 
 5 |         "domain": "news.cnet.com", 
 6 |         "final_url": "http://news.cnet.com/8301-30686_3-20014053-266.html?tag=topStories1", 
 7 |         "meta_keywords": "Marguerite Reardon, wireless, broadband, telecom", 
 8 |         "cleaned_text": "NEW YORK--Verizon Communications is prepping a new", 
 9 |         "tags": [
10 |             "iPad", 
11 |             "Verizon Communications", 
12 |             "Verizon Fios TV", 
13 |             "Fios", 
14 |             "Apple iPad"
15 |         ], 
16 |         "meta_favicon": "", 
17 |         "meta_lang": null
18 |     }
19 | }


--------------------------------------------------------------------------------
/tests/data/extractors/publishdate/test_publish_date_schema.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>test video</title>
 5 |     </head>
 6 | 
 7 |     <body>
 8 |         <div>
 9 |             <time itemprop="datePublished" datetime="2014-10-09T12:06:16">9 octobre 2014 à 12:06</time>
10 |             <p>
11 |             TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
12 |             </p>
13 |         </div>
14 |     </body>
15 | </html>
16 | 


--------------------------------------------------------------------------------
/tests/data/extractors/videos/test_iframe.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://foo.bar/index.html", 
 3 |     "expected": {
 4 |         "movies": [
 5 |             {
 6 |                 "src": "http://www.dailymotion.com/embed/video/x130bpf", 
 7 |                 "embed_code": "<iframe frameborder=\"0\" width=\"480\" height=\"270\" src=\"http://www.dailymotion.com/embed/video/x130bpf\"/>", 
 8 |                 "height": "270", 
 9 |                 "width": "480", 
10 |                 "provider": "dailymotion", 
11 |                 "embed_type": "iframe"
12 |             },
13 |             {
14 |                 "src": "http://www.dailymotion.com/embed/video/x130bpf", 
15 |                 "embed_code": "<iframe frameborder=\"0\" width=\"480\" height=\"270\" src=\"http://www.dailymotion.com/embed/video/x130bpf\"/>", 
16 |                 "height": "270", 
17 |                 "width": "480", 
18 |                 "provider": "dailymotion", 
19 |                 "embed_type": "iframe"
20 |             }
21 |         ]
22 |     }
23 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_aolNews.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.aolnews.com/nation/article/the-few-the-proud-the-marines-getting-a-makeover/19592478", 
 3 |     "expected": {
 4 |         "meta_description": "(Aug. 13) -- Declaring", 
 5 |         "domain": "www.aolnews.com", 
 6 |         "final_url": "http://www.aolnews.com/nation/article/the-few-the-proud-the-marines-getting-a-makeover/19592478", 
 7 |         "meta_keywords": "news, update, breaking, nation, U.S., elections, world, entertainment, sports, business, weird news, health, science, latest news articles, breaking news, current news, top news", 
 8 |         "cleaned_text": "WASHINGTON (Aug. 13) -- Declaring \"the maritime soul of the Marine Corps\" is", 
 9 |         "tags": [
10 |             "Defense Secretary Robert Gates", 
11 |             "military", 
12 |             "Marines", 
13 |             "armed forces"
14 |         ], 
15 |         "meta_favicon": "http://o.aolcdn.com/art/ch_news/aol_favicon.ico", 
16 |         "meta_lang": "en"
17 |     }
18 | }


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | import unittest
24 | 
25 | if __name__ == '__main__':  # true only when this file is run as a script, not when imported
26 |     unittest.main()  # hand control to unittest's command-line test runner
27 | 


--------------------------------------------------------------------------------
/goose/version.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | version_info = (1, 0, 25)  # release number as a tuple of integer components
25 | __version__ = ".".join(map(str, version_info))  # same components joined with dots: "1.0.25"
26 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-da.txt:
--------------------------------------------------------------------------------
  1 | af
  2 | alle
  3 | andet
  4 | andre
  5 | at
  6 | begge
  7 | da
  8 | de
  9 | den
 10 | denne
 11 | der
 12 | deres
 13 | det
 14 | dette
 15 | dig
 16 | din
 17 | dog
 18 | du
 19 | ej
 20 | eller
 21 | en
 22 | end
 23 | ene
 24 | eneste
 25 | enhver
 26 | et
 27 | fem
 28 | fire
 29 | flere
 30 | fleste
 31 | for
 32 | fordi
 33 | forrige
 34 | fra
 35 | få
 36 | før
 37 | god
 38 | han
 39 | hans
 40 | har
 41 | hendes
 42 | her
 43 | hun
 44 | hvad
 45 | hvem
 46 | hver
 47 | hvilken
 48 | hvis
 49 | hvor
 50 | hvordan
 51 | hvorfor
 52 | hvornår
 53 | i
 54 | ikke
 55 | ind
 56 | ingen
 57 | intet
 58 | jeg
 59 | jeres
 60 | kan
 61 | kom
 62 | kommer
 63 | lav
 64 | lidt
 65 | lille
 66 | man
 67 | mand
 68 | mange
 69 | med
 70 | meget
 71 | men
 72 | mens
 73 | mere
 74 | mig
 75 | ned
 76 | ni
 77 | nogen
 78 | noget
 79 | ny
 80 | nyt
 81 | nær
 82 | næste
 83 | næsten
 84 | og
 85 | op
 86 | otte
 87 | over
 88 | på
 89 | se
 90 | seks
 91 | ses
 92 | som
 93 | stor
 94 | store
 95 | syv
 96 | ti
 97 | til
 98 | to
 99 | tre
100 | ud
101 | var
102 | 


--------------------------------------------------------------------------------
/tests/extractors/metas.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from base import TestExtractionBase
25 | 
26 | 
27 | class TestMetas(TestExtractionBase):  # placeholder subclass of TestExtractionBase; it defines no tests of its own
28 | 
29 |     pass
30 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_articlebody_tag.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |   <body>
 3 |     <div>
 4 |       <p>
 5 |           Not an Actual Content
 6 |           TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
 7 |       </p>
 8 |     </div>
 9 |     <article>
10 |       <p>
11 |         Search-and-rescue teams were mobilized from across Southeast Asia on Sunday after a commercial airliner with 162 people on board lost contact with ground controllers off the coast of Borneo, a search effort that evoked a distressingly familiar mix of grief and mystery nine months after a Malaysia Airlines jetliner disappeared over the Indian Ocean.
12 |       </p>
13 |     </article>
14 |   </body>
15 | </html>
16 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_issue25.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.accountancyage.com/aa/analysis/2111729/institutes-ifrs-bang", 
 3 |     "expected": {
 4 |         "meta_description": "UK institutes are all for rapid IFRS adoption in the US\n,  Institutes,Accounting standards,  ACCA,ICAEW,IFRS", 
 5 |         "domain": "www.accountancyage.com", 
 6 |         "final_url": "http://www.accountancyage.com/aa/analysis/2111729/institutes-ifrs-bang", 
 7 |         "meta_keywords": "", 
 8 |         "cleaned_text": "UK INSTITUTES have thrown their weight behind rapid adoption of international financial reporting standards in the US.", 
 9 |         "tags": [
10 |             "Legal Privilege", 
11 |             "Investigations", 
12 |             "ICAEW", 
13 |             "ACCA", 
14 |             "Audit Reform Special.", 
15 |             "Vantis", 
16 |             "Celebrities", 
17 |             "Football Finance", 
18 |             "Tax Amnesty", 
19 |             "Convergence", 
20 |             "IFRS"
21 |         ], 
22 |         "meta_favicon": "http://www.accountancyage.com/images/AccountancyAge.png", 
23 |         "meta_lang": "en"
24 |     }
25 | }


--------------------------------------------------------------------------------
/tests/data/extractors/videos/test_embed.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://foo.bar/index.html", 
 3 |     "expected": {
 4 |         "movies": [
 5 |             {
 6 |                 "src": "https://www.youtube.com/v/M7lc1UVf-VE?version=3&autoplay=1", 
 7 |                 "embed_code": "<embed src=\"https://www.youtube.com/v/M7lc1UVf-VE?version=3&amp;autoplay=1\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" width=\"640\" height=\"390\"/>", 
 8 |                 "height": "390", 
 9 |                 "width": "640", 
10 |                 "provider": "youtube", 
11 |                 "embed_type": "embed"
12 |             },
13 |             {
14 |                 "src": "https://www.youtube.com/v/M7lc1UVf-VE?version=3&autoplay=1", 
15 |                 "embed_code": "<embed src=\"https://www.youtube.com/v/M7lc1UVf-VE?version=3&amp;autoplay=1\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" width=\"640\" height=\"390\"/>", 
16 |                 "height": "390", 
17 |                 "width": "640", 
18 |                 "provider": "youtube", 
19 |                 "embed_type": "embed"
20 |             }
21 |         ]
22 |     }
23 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_articlebody_attribute.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |   <body>
 3 |     <div>
 4 |       <p>
 5 |           Not an Actual Content
 6 |           TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
 7 |       </p>
 8 |     </div>
 9 |     <div class='post-content'>
10 |       <p>
11 |         Search-and-rescue teams were mobilized from across Southeast Asia on Sunday after a commercial airliner with 162 people on board lost contact with ground controllers off the coast of Borneo, a search effort that evoked a distressingly familiar mix of grief and mystery nine months after a Malaysia Airlines jetliner disappeared over the Indian Ocean.
12 |       </p>
13 |     </div>
14 |   </body>
15 | </html>
16 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_articlebody_itemprop.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |   <body>
 3 |     <div>
 4 |       <p>
 5 |           Not an Actual Content
 6 |           TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
 7 |       </p>
 8 |     </div>
 9 |     <div itemprop='articleBody'>
10 |       <p>
11 |         Search-and-rescue teams were mobilized from across Southeast Asia on Sunday after a commercial airliner with 162 people on board lost contact with ground controllers off the coast of Borneo, a search effort that evoked a distressingly familiar mix of grief and mystery nine months after a Malaysia Airlines jetliner disappeared over the Indian Ocean.
12 |       </p>
13 |     </div>
14 |   </body>
15 | </html>
16 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_cnn_arabic.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html", 
 3 |     "expected": {
 4 |         "meta_description": "", 
 5 |         "domain": "arabic.cnn.com", 
 6 |         "final_url": "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html", 
 7 |         "meta_keywords": "", 
 8 |         "cleaned_text": "\u062f\u0645\u0634\u0642\u060c \u0633\u0648\u0631\u064a\u0627 (CNN) -- \u0623\u0643\u062f\u062a \u062c\u0647\u0627\u062a \u0633\u0648\u0631\u064a\u0629 \u0645\u0639\u0627\u0631\u0636\u0629 \u0623\u0646 \u0641\u0635\u0627\u0626\u0644 \u0645\u0633\u0644\u062d\u0629 \u0645\u0639\u0627\u0631\u0636\u0629 \u0644\u0646\u0638\u0627\u0645 \u0627\u0644\u0631\u0626\u064a\u0633 \u0628\u0634\u0627\u0631 \u0627\u0644\u0623\u0633\u062f \u0648\u0639\u0644\u0649 \u0635\u0644\u0629 \u0628\u0640\"\u0627\u0644\u062c\u064a\u0634 \u0627\u0644\u062d\u0631\" \u062a\u0645\u0643\u0646\u062a \u0645\u0646 \u0627\u0644\u0633\u064a\u0637\u0631\u0629 \u0639\u0644\u0649 \u0645\u0633\u062a\u0648\u062f\u0639\u0627\u062a \u0644\u0644\u0623\u0633\u0644", 
 9 |         "meta_favicon": "", 
10 |         "meta_lang": "ar"
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/article.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | import unittest
25 | from goose.article import Article
26 | 
27 | 
28 | class TestArticle(unittest.TestCase):
29 | 
30 |     def test_instance(self):
31 |         a = Article()
32 |         self.assertEqual(isinstance(a, Article), True)
33 | 


--------------------------------------------------------------------------------
/tests/data/extractors/opengraph/test_opengraph.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <head>
 3 |       <meta property="og:url" content="http://www.somenews.com/2012/09/19/nyregion/some-news-article.html?pagewanted=all"/>
 4 |       <meta property="og:type" content="article"/>
 5 |       <meta property="og:title" content="Some News Article Story"/>
 6 |       <meta property="og:description" content="Some News Happened in New York">
 7 |       <meta property="og:image" content="http://graphics8.somenews.com/images/2012/09/19/region/some-news-image.jpg"/>
 8 |     </head>
 9 |     <body>
10 |         <div>
11 |             <p>
12 |               TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
13 |             </p>
14 |         </div>
15 |     </body>
16 | </html>
17 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-no.txt:
--------------------------------------------------------------------------------
  1 | at
  2 | av
  3 | de
  4 | den
  5 | der
  6 | det
  7 | du
  8 | en
  9 | er
 10 | et
 11 | for
 12 | fra
 13 | før
 14 | med
 15 | og
 16 | om
 17 | over
 18 | på
 19 | som
 20 | til
 21 | ved
 22 | år
 23 | alle
 24 | bare
 25 | ble
 26 | bort
 27 | bra
 28 | da
 29 | deg
 30 | dem
 31 | denne
 32 | dere
 33 | deres
 34 | det
 35 | dette
 36 | din
 37 | disse
 38 | dit
 39 | ditt
 40 | eller
 41 | ene
 42 | enn
 43 | er
 44 | et
 45 | ett
 46 | etter
 47 | for
 48 | fram
 49 | først
 50 | få
 51 | god
 52 | gå
 53 | ha
 54 | han
 55 | hans
 56 | har
 57 | her
 58 | hit
 59 | hun
 60 | hva
 61 | hvem
 62 | hver
 63 | ikke
 64 | inn
 65 | ja
 66 | jeg
 67 | kan
 68 | kom
 69 | kun
 70 | kunne
 71 | lage
 72 | lang
 73 | lik
 74 | like
 75 | man
 76 | mer
 77 | min
 78 | mot
 79 | mye
 80 | må
 81 | måte
 82 | ned
 83 | nei
 84 | noe
 85 | noen
 86 | ny
 87 | nå
 88 | når
 89 | også
 90 | opp
 91 | oss
 92 | seg
 93 | selv
 94 | si
 95 | siden
 96 | sin
 97 | sine
 98 | sist
 99 | skal
100 | skulle
101 | slik
102 | som
103 | så
104 | sånn
105 | tid
106 | til
107 | under
108 | ut
109 | uten
110 | var
111 | ved
112 | vi
113 | vil
114 | vite
115 | vår
116 | å
117 | dei
118 | di
119 | då
120 | eg


--------------------------------------------------------------------------------
/tests/extractors/opengraph.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from base import TestExtractionBase
25 | 
26 | 
27 | class TestOpenGraph(TestExtractionBase):
28 | 
29 |     def test_opengraph(self):
30 |         article = self.getArticle()
31 |         fields = ['opengraph']
32 |         self.runArticleAssertions(article=article, fields=fields)
33 | 


--------------------------------------------------------------------------------
/tests/extractors/authors.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from base import TestExtractionBase
25 | 
26 | 
27 | class TestArticleAuthor(TestExtractionBase):
28 | 
29 |     def test_author_schema(self):
30 |         article = self.getArticle()
31 |         fields = ['authors']
32 |         self.runArticleAssertions(article=article, fields=fields)
33 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-nb.txt:
--------------------------------------------------------------------------------
  1 | alle
  2 | andre
  3 | arbeid
  4 | av
  5 | begge
  6 | bort
  7 | bra
  8 | bruke
  9 | da
 10 | denne
 11 | der
 12 | deres
 13 | det
 14 | din
 15 | disse
 16 | du
 17 | eller
 18 | en
 19 | ene
 20 | eneste
 21 | enhver
 22 | enn
 23 | er
 24 | et
 25 | folk
 26 | for
 27 | fordi
 28 | forsøke
 29 | fra
 30 | få
 31 | før
 32 | først
 33 | gjorde
 34 | gjøre
 35 | god
 36 | gå
 37 | ha
 38 | hadde
 39 | han
 40 | hans
 41 | hennes
 42 | her
 43 | hva
 44 | hvem
 45 | hver
 46 | hvilken
 47 | hvis
 48 | hvor
 49 | hvordan
 50 | hvorfor
 51 | ikke
 52 | inn
 53 | innen
 54 | kan
 55 | kunne
 56 | lage
 57 | lang
 58 | lik
 59 | like
 60 | makt
 61 | mange
 62 | med
 63 | meg
 64 | meget
 65 | men
 66 | mens
 67 | mer
 68 | mest
 69 | min
 70 | mye
 71 | må
 72 | måte
 73 | navn
 74 | nei
 75 | ny
 76 | nå
 77 | når
 78 | og
 79 | også
 80 | om
 81 | opp
 82 | oss
 83 | over
 84 | part
 85 | punkt
 86 | på
 87 | rett
 88 | riktig
 89 | samme
 90 | sant
 91 | si
 92 | siden
 93 | sist
 94 | skulle
 95 | slik
 96 | slutt
 97 | som
 98 | start
 99 | stille
100 | tid
101 | til
102 | tilbake
103 | tilstand
104 | under
105 | ut
106 | uten
107 | var
108 | ved
109 | verdi
110 | vi
111 | vil
112 | ville
113 | vite
114 | vår
115 | være
116 | vært
117 | å
118 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_time2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://newsfeed.time.com/2011/08/24/washington-monument-closes-to-repair-earthquake-induced-crack/", 
 3 |     "expected": {
 4 |         "meta_description": "Despite what the jeers of jaded Californians might suggest, toppled lawn chairs weren't actually the worst of the damage from Tuesday's earthquake that rattled the East Coast.  The Washington Monument developed a crack near its peak from the magnitude-5.", 
 5 |         "domain": "newsfeed.time.com", 
 6 |         "final_url": "http://newsfeed.time.com/2011/08/24/washington-monument-closes-to-repair-earthquake-induced-crack/", 
 7 |         "meta_keywords": "nation, u.s., crack, damage, earthquake, nation, obelisk, virginia earthquake, washington dc, washington monument", 
 8 |         "cleaned_text": "Despite what the jeers of jaded Californians might suggest", 
 9 |         "tags": [
10 |             "obelisk", 
11 |             "virginia earthquake", 
12 |             "damage", 
13 |             "nation", 
14 |             "washington monument", 
15 |             "Crack", 
16 |             "washington dc", 
17 |             "earthquake"
18 |         ], 
19 |         "meta_favicon": "http://1.gravatar.com/blavatar/de038c9fc06774c15706fda5010eb7cb?s=16", 
20 |         "meta_lang": null
21 |     }
22 | }


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_marketplace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.marketplace.org/shows/marketplace-tech-report/marketplace-tech-friday-april-19-2013", 
 3 |     "expected": {
 4 |         "meta_description": "Gun control advocates are looking for new options, including some tech strategies. On Wednesday, seven measures failed in the U.S. Senate, including an amendment that would have expanded background checks. Some gun control advocates are now looking in new directions. So-called 'smart guns' are firearms that only authorized users can fire. But so far, there's not a single one on the market in the U.S.", 
 5 |         "domain": "www.marketplace.org", 
 6 |         "final_url": "http://www.marketplace.org/shows/marketplace-tech-report/marketplace-tech-friday-april-19-2013", 
 7 |         "meta_keywords": "", 
 8 |         "cleaned_text": "Gun control advocates are looking for new options, including some tech strategies. On Wednesday", 
 9 |         "tags": [
10 |             "SOPA", 
11 |             "PIPA", 
12 |             "online privacy", 
13 |             "Tech", 
14 |             "sensors", 
15 |             "guns", 
16 |             "CISPA"
17 |         ], 
18 |         "meta_favicon": "http://www.marketplace.org/sites/default/themes/sitetheme/favicon.ico", 
19 |         "meta_lang": null
20 |     }
21 | }


--------------------------------------------------------------------------------
/tests/extractors/links.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from base import TestExtractionBase
25 | 
26 | 
27 | class TestArticleLinks(TestExtractionBase):
28 | 
29 |     def test_links(self):
30 |         article = self.getArticle()
31 |         number_links = len(article.links)
32 |         expected_number_links = self.data['expected']['links']
33 |         self.assertEqual(number_links, expected_number_links)
34 | 


--------------------------------------------------------------------------------
/tests/extractors/tweets.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from base import TestExtractionBase
25 | 
26 | 
27 | class TestArticleTweet(TestExtractionBase):
28 | 
29 |     def test_tweet(self):
30 |         article = self.getArticle()
31 |         number_tweets = len(article.tweets)
32 |         expected_number_tweets = self.data['expected']['tweets']
33 |         self.assertEqual(number_tweets, expected_number_tweets)
34 | 


--------------------------------------------------------------------------------
/goose/extractors/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | 
25 | class BaseExtractor(object):
26 | 
27 |     def __init__(self, config, article):
28 |         # config
29 |         self.config = config
30 | 
31 |         # parser
32 |         self.parser = self.config.get_parser()
33 | 
34 |         # article
35 |         self.article = article
36 | 
37 |         # stopwords class
38 |         self.stopwords_class = config.stopwords_class
39 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-zh.txt:
--------------------------------------------------------------------------------
  1 | 的
  2 | 一
  3 | 不
  4 | 在
  5 | 人
  6 | 有
  7 | 是
  8 | 为
  9 | 以
 10 | 于
 11 | 上
 12 | 他
 13 | 而
 14 | 后
 15 | 之
 16 | 来
 17 | 及
 18 | 了
 19 | 因
 20 | 下
 21 | 可
 22 | 到
 23 | 由
 24 | 这
 25 | 与
 26 | 也
 27 | 此
 28 | 但
 29 | 并
 30 | 个
 31 | 其
 32 | 已
 33 | 无
 34 | 小
 35 | 我
 36 | 们
 37 | 起
 38 | 最
 39 | 再
 40 | 今
 41 | 去
 42 | 好
 43 | 只
 44 | 又
 45 | 或
 46 | 很
 47 | 亦
 48 | 某
 49 | 把
 50 | 那
 51 | 你
 52 | 乃
 53 | 它
 54 | 吧
 55 | 被
 56 | 比
 57 | 别
 58 | 趁
 59 | 当
 60 | 从
 61 | 到
 62 | 得
 63 | 打
 64 | 凡
 65 | 儿
 66 | 尔
 67 | 该
 68 | 各
 69 | 给
 70 | 跟
 71 | 和
 72 | 何
 73 | 还
 74 | 即
 75 | 几
 76 | 既
 77 | 看
 78 | 据
 79 | 距
 80 | 靠
 81 | 啦
 82 | 了
 83 | 另
 84 | 么
 85 | 每
 86 | 们
 87 | 嘛
 88 | 拿
 89 | 哪
 90 | 那
 91 | 您
 92 | 凭
 93 | 且
 94 | 却
 95 | 让
 96 | 仍
 97 | 啥
 98 | 如
 99 | 若
100 | 使
101 | 谁
102 | 虽
103 | 随
104 | 同
105 | 所
106 | 她
107 | 哇
108 | 嗡
109 | 往
110 | 哪
111 | 些
112 | 向
113 | 沿
114 | 哟
115 | 用
116 | 于
117 | 咱
118 | 则
119 | 怎
120 | 曾
121 | 至
122 | 致
123 | 着
124 | 诸
125 | 自
126 | 為
127 | 於
128 | 後
129 | 這
130 | 與
131 | 並
132 | 個
133 | 無
134 | 們
135 | 當
136 | 從
137 | 兒
138 | 爾
139 | 該
140 | 給
141 | 還
142 | 幾
143 | 麼
144 | 憑
145 | 卻
146 | 讓
147 | 誰
148 | 雖
149 | 喲
150 | 則
151 | 諸
152 | 


--------------------------------------------------------------------------------
/goose/extractors/links.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from goose.extractors import BaseExtractor
25 | 
26 | 
27 | class LinksExtractor(BaseExtractor):
28 | 
29 |     def extract(self):
30 |         links = []
31 |         items = self.parser.getElementsByTag(self.article.top_node, 'a')
32 |         for i in items:
33 |             attr = self.parser.getAttribute(i, 'href')
34 |             if attr:
35 |                 links.append(attr)
36 |         return links
37 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_engadget.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.engadget.com/2010/08/18/verizon-fios-set-top-boxes-getting-a-new-hd-guide-external-stor/", 
 3 |     "expected": {
 4 |         "meta_description": "Streaming and downloading TV content to mobiles is nice, but we enjoy watching TV... on the TV, and Verizon had plenty of that to talk", 
 5 |         "domain": "www.engadget.com", 
 6 |         "final_url": "http://www.engadget.com/2010/08/18/verizon-fios-set-top-boxes-getting-a-new-hd-guide-external-stor/", 
 7 |         "meta_keywords": "", 
 8 |         "cleaned_text": "Streaming and downloading TV content to mobiles is nice", 
 9 |         "tags": [
10 |             "img 1.9", 
11 |             "external storage", 
12 |             "chaptering", 
13 |             "verizon", 
14 |             "esata", 
15 |             "Motorola", 
16 |             "multiroom", 
17 |             "gui", 
18 |             "Set-topBox", 
19 |             "set-top box", 
20 |             "1.9", 
21 |             "fios", 
22 |             "Android coverage by humans", 
23 |             "Img1.9", 
24 |             "moca", 
25 |             "fios tv", 
26 |             "ExternalStorage", 
27 |             "FiosTv", 
28 |             "FiOS TV", 
29 |             "dvr"
30 |         ], 
31 |         "meta_favicon": "http://www.blogsmithmedia.com/www.engadget.com/media/favicon.ico", 
32 |         "meta_lang": null
33 |     }
34 | }


--------------------------------------------------------------------------------
/tests/extractors/title.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from base import TestExtractionBase
25 | 
26 | 
27 | class TestTitle(TestExtractionBase):
28 | 
29 |     def test_title_opengraph(self):
30 |         article = self.getArticle()
31 |         fields = ['title']
32 |         self.runArticleAssertions(article=article, fields=fields)
33 | 
34 |     def test_title_empty(self):
35 |         article = self.getArticle()
36 |         fields = ['title']
37 |         self.runArticleAssertions(article=article, fields=fields)
38 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_get_canonical_url.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.marketplace.org/shows/marketplace-tech-report/marketplace-tech-friday-april-19-2013", 
 3 |     "expected": {
 4 |         "meta_description": "Gun control advocates are looking for new options, including some tech strategies. On Wednesday, seven measures failed in the U.S. Senate, including an amendment that would have expanded background checks. Some gun control advocates are now looking in new directions. So-called 'smart guns' are firearms that only authorized users can fire. But so far, there's not a single one on the market in the U.S.", 
 5 |         "domain": "www.marketplace.org", 
 6 |         "final_url": "http://www.marketplace.org/shows/marketplace-tech-report/marketplace-tech-friday-april-19-2013", 
 7 |         "meta_keywords": "", 
 8 |         "cleaned_text": "Gun control advocates are looking for new options, including some tech strategies. On Wednesday", 
 9 |         "tags": [
10 |             "SOPA", 
11 |             "PIPA", 
12 |             "online privacy", 
13 |             "Tech", 
14 |             "sensors", 
15 |             "guns", 
16 |             "CISPA"
17 |         ], 
18 |         "canonical_link": "http://www.marketplace.org/shows/marketplace-tech-report/marketplace-tech-friday-april-19-2013", 
19 |         "meta_favicon": "http://www.marketplace.org/sites/default/themes/sitetheme/favicon.ico", 
20 |         "meta_lang": null
21 |     }
22 | }


--------------------------------------------------------------------------------
/goose/video.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
class Video(object):
    """\
    Plain container describing one video found in an article.

    Every attribute is created as ``None`` by the constructor:

    embed_type -- type of embed (embed, object, iframe)
    provider   -- video provider name
    width      -- width
    height     -- height
    embed_code -- embed code
    src        -- src
    """

    def __init__(self):
        # what kind of markup embeds the video, and who hosts it
        self.embed_type = None
        self.provider = None

        # dimensions
        self.width = self.height = None

        # the embed markup and the video source
        self.embed_code = None
        self.src = None
49 | 


--------------------------------------------------------------------------------
/tests/configuration.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | import os
24 | import tempfile
25 | import unittest
26 | 
27 | from goose import Goose
28 | 
29 | 
class TestTempDir(unittest.TestCase):
    """\
    Checks the value of ``config.local_storage_path`` on a Goose instance,
    both without configuration and with an explicit override.
    """

    def test_tmp_defaut(self):
        # without any configuration the path is expected to be
        # "<system temporary directory>/goose"
        g = Goose()
        default_local_storage_path = os.path.join(tempfile.gettempdir(), 'goose')
        # assertEqual instead of assertEquals: the latter is a deprecated
        # alias that was removed from unittest in Python 3.12
        self.assertEqual(g.config.local_storage_path, default_local_storage_path)

    def test_tmp_overwritten(self):
        # a path given in the configuration dict must be kept unchanged
        path = '/tmp/bla'
        g = Goose({'local_storage_path': path})
        self.assertEqual(g.config.local_storage_path, path)
41 | 


--------------------------------------------------------------------------------
/tests/data/extractors/links/test_links.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <body>
 3 |         <div>
 4 |             <p>
 5 |               TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. 
 6 |               <a href="https://twitter.com/hashtag/python?src=hash">links</a>
 7 |               Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
 8 |             </p>
 9 |         </div>
10 |         <div itemprop="articleBody">
11 |             <p>
12 |               TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a <a href="https://twitter.com/hashtag/python?src=hash"> team its hard </a> to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish <a href="https://twitter.com/hashtag/python?src=hash">to truly absorb Scala into your</a> existing java environment then you'll soon introduced cross language dependencies.
13 |             </p>
14 |         </div>
15 |     </body>
16 | </html>
17 | 


--------------------------------------------------------------------------------
/goose/extractors/opengraph.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from goose.extractors import BaseExtractor
25 | 
26 | 
class OpenGraphExtractor(BaseExtractor):
    """\
    Collects the ``<meta property="og:..." content="...">`` tags of the
    article document into a dictionary.
    """

    def extract(self):
        """\
        Return a dict mapping each Open Graph property name, stripped of
        its ``og:`` prefix, to the non-empty ``content`` of its meta tag.

        Structured properties keep their complete suffix, e.g.
        ``og:image:width`` is stored under ``image:width``.  Keying them
        by the second segment only (``image``) made them overwrite the
        value of the parent ``og:image`` property.

        Tags with an empty or missing ``content`` are skipped, as is a
        bare ``og:`` property without a name.  When a property occurs
        more than once, the last occurrence wins.
        """
        prefix = "og:"
        opengraph_dict = {}
        node = self.article.doc
        metas = self.parser.getElementsByTag(node, 'meta')
        for meta in metas:
            attr = self.parser.getAttribute(meta, 'property')
            if attr is None or not attr.startswith(prefix):
                continue
            # keep everything after the prefix so that nested properties
            # get a key of their own
            key = attr[len(prefix):]
            value = self.parser.getAttribute(meta, 'content')
            if key and value:
                opengraph_dict[key] = value
        return opengraph_dict
40 | 


--------------------------------------------------------------------------------
/goose/extractors/tweets.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from goose.extractors import BaseExtractor
25 | 
26 | 
class TweetsExtractor(BaseExtractor):
    """\
    Extracts embedded tweets from the article's top node.
    """

    def extract(self):
        """\
        Return a list holding the string form of every ``blockquote``
        element with class ``twitter-tweet`` that the parser finds under
        ``self.article.top_node``.
        """
        parser = self.parser
        blockquotes = parser.getElementsByTag(
            self.article.top_node,
            tag='blockquote',
            attr="class",
            value="twitter-tweet")

        tweets = []
        for blockquote in blockquotes:
            # drop these two attributes before serialising so they do not
            # end up in the returned markup (presumably goose's own
            # scoring annotations -- verify against the content extractor)
            parser.delAttribute(blockquote, 'gravityScore')
            parser.delAttribute(blockquote, 'gravityNodes')
            tweets.append(parser.nodeToString(blockquote))

        return tweets
43 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_lefigaro.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.lefigaro.fr/conjoncture/2013/04/05/20002-20130405ARTFIG00473-montebourg-envisage-des-privatisations-partielles.php", 
 3 |     "target_language": "fr",
 4 |     "expected": {
 5 |         "meta_description": "Selon le ministre du Redressement productif interview\u00e9 par le Wall Street Journal, le gouvernement r\u00e9fl\u00e9chit \u00e0 des changements dans les participations d\u00e9tenues par l'\u00c9tat.", 
 6 |         "domain": "www.lefigaro.fr", 
 7 |         "final_url": "http://www.lefigaro.fr/conjoncture/2013/04/05/20002-20130405ARTFIG00473-montebourg-envisage-des-privatisations-partielles.php", 
 8 |         "meta_keywords": "Actualit\u00e9 \u00e9conomique, entreprises, \u00e9conomie, bourse, emploi, imp\u00f4ts, cac 40, creation d'entreprise, chef d'entreprise, grands patrons, consommation, multinationales, privatisation, d\u00e9localisations, concurrence, monopole, crise, bourse, licenciements, union europ\u00e9enne, etats-unis, chine, pmi, pme, tpe, salaires, relance, pib, pnb, aides sociales, japon, r\u00e9cession, \u00e9conomie verte, fmi, reprise, croissance, news, actu", 
 9 |         "cleaned_text": "«Dans le cadre de l'effort de restructuration budgétaire", 
10 |         "tags": [
11 |             "EDF", 
12 |             "Privatisation", 
13 |             "Arnaud Montebourg", 
14 |             "Pierre Moscovici", 
15 |             "Participations de l'Etat", 
16 |             "Entreprises publiques", 
17 |             "AREVA"
18 |         ], 
19 |         "meta_favicon": "http://www.lefigaro.fr/icones/favicon.ico", 
20 |         "meta_lang": null
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-pt.txt:
--------------------------------------------------------------------------------
  1 | último
  2 | é
  3 | acerca
  4 | agora
  5 | algmas
  6 | alguns
  7 | ali
  8 | ambos
  9 | antes
 10 | apontar
 11 | aquela
 12 | aquelas
 13 | aquele
 14 | aqueles
 15 | aqui
 16 | atrás
 17 | bem
 18 | bom
 19 | cada
 20 | caminho
 21 | cima
 22 | com
 23 | como
 24 | comprido
 25 | conhecido
 26 | corrente
 27 | das
 28 | debaixo
 29 | dentro
 30 | desde
 31 | desligado
 32 | deve
 33 | devem
 34 | deverá
 35 | direita
 36 | diz
 37 | dizer
 38 | dois
 39 | dos
 40 | e
 41 | ela
 42 | ele
 43 | eles
 44 | em
 45 | enquanto
 46 | então
 47 | está
 48 | estão
 49 | estado
 50 | estar
 51 | estará
 52 | este
 53 | estes
 54 | esteve
 55 | estive
 56 | estivemos
 57 | estiveram
 58 | eu
 59 | fará
 60 | faz
 61 | fazer
 62 | fazia
 63 | fez
 64 | fim
 65 | foi
 66 | fora
 67 | horas
 68 | iniciar
 69 | inicio
 70 | ir
 71 | irá
 72 | ista
 73 | iste
 74 | isto
 75 | ligado
 76 | maioria
 77 | maiorias
 78 | mais
 79 | mas
 80 | mesmo
 81 | meu
 82 | muito
 83 | muitos
 84 | nós
 85 | não
 86 | nome
 87 | nosso
 88 | novo
 89 | o
 90 | onde
 91 | os
 92 | ou
 93 | outro
 94 | para
 95 | parte
 96 | pegar
 97 | pelo
 98 | pessoas
 99 | pode
100 | poderá
101 | podia
102 | por
103 | porque
104 | povo
105 | promeiro
106 | quê
107 | qual
108 | qualquer
109 | quando
110 | quem
111 | quieto
112 | são
113 | saber
114 | sem
115 | ser
116 | seu
117 | somente
118 | têm
119 | tal
120 | também
121 | tem
122 | tempo
123 | tenho
124 | tentar
125 | tentaram
126 | tente
127 | tentei
128 | teu
129 | teve
130 | tipo
131 | tive
132 | todos
133 | trabalhar
134 | trabalho
135 | tu
136 | um
137 | uma
138 | umas
139 | uns
140 | usa
141 | usar
142 | valor
143 | veja
144 | ver
145 | verdade
146 | verdadeiro
147 | você
148 | 


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>test</title>
 5 |         <meta property="og:image" content="http://go.com/images/465395/" />
 6 |     </head>
 7 | 
 8 |     <body>
 9 |         <div>
10 |             <h1>title</h1>
11 |             <p>
12 |                 TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
13 |             </p>
14 |             <p>
15 |                 Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
16 |                 For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
17 |             </p>
18 |         </div>
19 |     </body>
20 | </html>


--------------------------------------------------------------------------------
/tests/extractors/publishdate.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from base import TestExtractionBase
25 | 
26 | 
class TestPublishDate(TestExtractionBase):
    """\
    Publish date extraction tests.

    Each test asserts only the ``publish_date`` field of the article
    returned by ``getArticle()``.  Fixture files named after the tests
    live in tests/data/extractors/publishdate (how they are looked up is
    decided by TestExtractionBase).
    """

    def test_publish_date(self):
        self.runArticleAssertions(
            article=self.getArticle(), fields=['publish_date'])

    def test_publish_date_rnews(self):
        self.runArticleAssertions(
            article=self.getArticle(), fields=['publish_date'])

    def test_publish_date_article(self):
        self.runArticleAssertions(
            article=self.getArticle(), fields=['publish_date'])

    def test_publish_date_schema(self):
        self.runArticleAssertions(
            article=self.getArticle(), fields=['publish_date'])
44 | 


--------------------------------------------------------------------------------
/goose/extractors/authors.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from goose.extractors import BaseExtractor
25 | 
26 | 
class AuthorsExtractor(BaseExtractor):
    """\
    Extracts author names from ``itemprop`` annotated markup.
    """

    def extract(self):
        """\
        Return the distinct author names found in the article document.

        For every element with ``itemprop="author"`` the text of its
        first ``itemprop="name"`` descendant is taken; author elements
        without such a descendant are ignored.  Duplicates are removed
        through a set, so the order of the returned list is not the
        document order.
        """
        parser = self.parser
        names = []

        for author_node in parser.getElementsByTag(
                self.article.doc, attr='itemprop', value='author'):
            name_nodes = parser.getElementsByTag(
                author_node, attr='itemprop', value='name')

            # nothing to read a name from
            if len(name_nodes) == 0:
                continue

            names.append(parser.getText(name_nodes[0]))

        return list(set(names))
47 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_bbc_chinese.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml", 
 3 |     "expected": {
 4 |         "meta_description": "\u9999\u6e2f\u884c\u653f\u957f\u5b98\u6881\u632f\u82f1\u5728\u5404\u65b9\u538b\u529b\u4e0b\u5230\u7acb\u6cd5\u4f1a\u63a5\u53d7\u8d28\u8be2\uff0c\u5c31\u5176\u5927\u5b85\u7684\u8fdd\u7ae0\u5efa\u7b51\u95ee\u9898\u9053\u6b49\u3002", 
 5 |         "domain": "www.bbc.co.uk", 
 6 |         "final_url": "http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml", 
 7 |         "meta_keywords": "\u9999\u6e2f, \u6881\u632f\u82f1, \u884c\u653f\u957f\u5b98, \u7279\u9996, \u5c71\u9876, \u8fdd\u7ae0, \u8fdd\u4f8b, \u8fdd\u6cd5, \u975e\u6cd5, \u5efa\u7b51, \u50ed\u5efa, \u7acb\u6cd5\u4f1a", 
 8 |         "cleaned_text": "\u9999\u6e2f\u884c\u653f\u957f\u5b98\u6881\u632f\u82f1\u5728\u5404\u65b9\u538b\u529b\u4e0b\u5c31\u5176\u5927\u5b85\u7684\u8fdd\u7ae0\u5efa\u7b51\uff08\u50ed\u5efa\uff09\u95ee\u9898\u5230\u7acb\u6cd5\u4f1a\u63a5\u53d7\u8d28\u8be2\uff0c\u5e76\u5411\u9999\u6e2f\u6c11\u4f17\u9053\u6b49\u3002\n\n\u6881\u632f\u82f1\u5728\u661f\u671f\u4e8c\uff0812\u670810\u65e5\uff09\u7684\u7b54\u95ee\u5927\u4f1a\u5f00\u59cb\u4e4b\u9645\u5728\u5176\u6f14\u8bf4\u4e2d\u9053\u6b49\uff0c\u4f46\u5f3a\u8c03\u4ed6\u5728\u8fdd\u7ae0\u5efa\u7b51\u95ee\u9898\u4e0a\u6ca1\u6709\u9690\u7792\u7684\u610f\u56fe\u548c\u52a8\u673a\u3002\n\n\u4e00\u4e9b\u4eb2\u5317\u4eac\u9635\u8425\u8bae\u5458\u6b22\u8fce\u6881\u632f\u82f1\u9053\u6b49\uff0c\u4e14\u8ba4\u4e3a\u5e94\u80fd\u83b7\u5f97\u9999\u6e2f\u6c11\u4f17\u63a5\u53d7\uff0c\u4f46\u8fd9\u4e9b\u8bae\u5458\u4e5f\u8d28\u95ee\u6881\u632f\u82f1\u6709", 
 9 |         "meta_favicon": "http://www.bbc.co.uk/favicon.ico", 
10 |         "meta_lang": "zh"
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>test</title>
 5 |     </head>
 6 | 
 7 |     <body>
 8 |         <div class="big_photo">
 9 |             <img src="http://go.com/images/465395/"/>
10 |         </div>
11 |         <div>
12 |             <h1>title</h1>
13 |             <p>
14 |                 TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
15 |             </p>
16 |             <p>
17 |                 Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
18 |                 For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
19 |             </p>
20 |         </div>
21 |     </body>
22 | </html>


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-ar.txt:
--------------------------------------------------------------------------------
  1 | فى
  2 | في
  3 | كل
  4 | لم
  5 | لن
  6 | له
  7 | من
  8 | هو
  9 | هي
 10 | قوة
 11 | كما
 12 | لها
 13 | منذ
 14 | وقد
 15 | ولا
 16 | نفسه
 17 | لقاء
 18 | مقابل
 19 | هناك
 20 | وقال
 21 | وكان
 22 | نهاية
 23 | وقالت
 24 | وكانت
 25 | للامم
 26 | فيه
 27 | كلم
 28 | لكن
 29 | وفي
 30 | وقف
 31 | ولم
 32 | ومن
 33 | وهو
 34 | وهي
 35 | يوم
 36 | فيها
 37 | منها
 38 | مليار
 39 | لوكالة
 40 | يكون
 41 | يمكن
 42 | مليون
 43 | حيث
 44 | اكد
 45 | الا
 46 | اما
 47 | امس
 48 | السابق
 49 | التى
 50 | التي
 51 | اكثر
 52 | ايار
 53 | ايضا
 54 | ثلاثة
 55 | الذاتي
 56 | الاخيرة
 57 | الثاني
 58 | الثانية
 59 | الذى
 60 | الذي
 61 | الان
 62 | امام
 63 | ايام
 64 | خلال
 65 | حوالى
 66 | الذين
 67 | الاول
 68 | الاولى
 69 | بين
 70 | ذلك
 71 | دون
 72 | حول
 73 | حين
 74 | الف
 75 | الى
 76 | انه
 77 | اول
 78 | ضمن
 79 | انها
 80 | جميع
 81 | الماضي
 82 | الوقت
 83 | المقبل
 84 | اليوم
 85 | ـ
 86 | ف
 87 | و
 88 | و6
 89 | قد
 90 | لا
 91 | ما
 92 | مع
 93 | مساء
 94 | هذا
 95 | واحد
 96 | واضاف
 97 | واضافت
 98 | فان
 99 | قبل
100 | قال
101 | كان
102 | لدى
103 | نحو
104 | هذه
105 | وان
106 | واكد
107 | كانت
108 | واوضح
109 | مايو
110 | ب
111 | ا
112 | أ
113 | ،
114 | عشر
115 | عدد
116 | عدة
117 | عشرة
118 | عدم
119 | عام
120 | عاما
121 | عن
122 | عند
123 | عندما
124 | على
125 | عليه
126 | عليها
127 | زيارة
128 | سنة
129 | سنوات
130 | تم
131 | ضد
132 | بعد
133 | بعض
134 | اعادة
135 | اعلنت
136 | بسبب
137 | حتى
138 | اذا
139 | احد
140 | اثر
141 | برس
142 | باسم
143 | غدا
144 | شخصا
145 | صباح
146 | اطار
147 | اربعة
148 | اخرى
149 | بان
150 | اجل
151 | غير
152 | بشكل
153 | حاليا
154 | بن
155 | به
156 | ثم
157 | اف
158 | ان
159 | او
160 | اي
161 | بها
162 | صفر


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_testHuffingtonPost.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.huffingtonpost.com/2010/08/13/federal-reserve-pursuing_n_681540.html", 
 3 |     "expected": {
 4 |         "meta_description": "A top regional Federal Reserve official sharply criticized Friday the Fed's ongoing policy of keeping interest rates near zero -- and at record lows -- as a \"dangerous gamble.\"", 
 5 |         "domain": "www.huffingtonpost.com", 
 6 |         "final_url": "http://www.huffingtonpost.com/2010/08/13/federal-reserve-pursuing_n_681540.html", 
 7 |         "meta_keywords": "federal, reserve's, low, rate, policy, is, a, 'dangerous, gamble,', says, top, central, bank, official, business", 
 8 |         "cleaned_text": "A top regional Federal Reserve official sharply criticized Friday", 
 9 |         "tags": [
10 |             "Financial Crisis", 
11 |             "Financial Reform", 
12 |             "Federal Reserve", 
13 |             "Great Recession", 
14 |             "Fomc", 
15 |             "Thomas Hoenig", 
16 |             "Federal Open Market Committee", 
17 |             "Monetary Policy", 
18 |             "Kansas City Fed", 
19 |             "Financial Regulatory Reform", 
20 |             "The Financial Fix", 
21 |             "Wall Street Reform", 
22 |             "Too Big To Fail", 
23 |             "Federal Reserve Bank Of Kansas City", 
24 |             "Interest Rates", 
25 |             "Financial Regulation"
26 |         ], 
27 |         "description": "A top regional Federal Reserve official sharply criticized Friday the Fed's ongoing policy of keeping interest rates near zero -- and at record lows -- as a \"dangerous gamble.\"", 
28 |         "title": "Federal Reserve's Low Rate Policy Is A 'Dangerous Gamble,' Says Top Central Bank Official", 
29 |         "meta_favicon": "/favicon.ico", 
30 |         "meta_lang": "en"
31 |     }
32 | }


--------------------------------------------------------------------------------
/goose/extractors/tags.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from goose.extractors import BaseExtractor
25 | 
# CSS selector for anchors declared as tags through rel="tag";
# TagsExtractor tries this selector first.
A_REL_TAG_SELECTOR = "a[rel=tag]"
# CSS selector for anchors whose href contains '/tag/', '/tags/', '/topic/'
# or '?keyword='; TagsExtractor uses it when the selector above matches nothing.
A_HREF_TAG_SELECTOR = "a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']"
28 | 
29 | 
class TagsExtractor(BaseExtractor):
    """\
    Extracts the tags of an article from the anchors of its document.
    """

    def extract(self):
        """\
        Return the distinct, non-empty texts of the tag anchors.

        Anchors matching A_REL_TAG_SELECTOR are used when there are any,
        otherwise those matching A_HREF_TAG_SELECTOR.  An empty list is
        returned when the document has no children or no anchor matches.
        Duplicates are removed through a set, so the order of the
        returned list is not the document order.
        """
        doc = self.article.doc

        # node doesn't have children
        if not list(doc):
            return []

        parser = self.parser
        anchors = parser.css_select(doc, A_REL_TAG_SELECTOR)
        if not anchors:
            # fall back to anchors that merely look like tag links
            anchors = parser.css_select(doc, A_HREF_TAG_SELECTOR)
        if not anchors:
            return []

        texts = (parser.getText(anchor) for anchor in anchors)
        found = [text for text in texts if text]

        return list(set(found))
52 | 


--------------------------------------------------------------------------------
/tests/data/extractors/videos/test_object.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://foo.bar/index.html", 
 3 |     "expected": {
 4 |         "movies": [
 5 |             {
 6 |                 "src": "https://www.youtube.com/v/M7lc1UVf-VE?version=3&autoplay=1", 
 7 |                 "embed_code": "<object width=\"640\" height=\"390\"><param name=\"movie\" value=\"https://www.youtube.com/v/M7lc1UVf-VE?version=3&amp;autoplay=1\"/><param name=\"allowScriptAccess\" value=\"always\"/><embed src=\"https://www.youtube.com/v/M7lc1UVf-VE?version=3&amp;autoplay=1\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" width=\"640\" height=\"390\"/></object>", 
 8 |                 "height": "390", 
 9 |                 "width": "640", 
10 |                 "provider": "youtube", 
11 |                 "embed_type": "object"
12 |             },
13 |             {
14 |                 "src": "https://www.youtube.com/v/M7lc1UVf-VE?version=3&autoplay=1", 
15 |                 "embed_code": "<object width=\"640\" height=\"390\"><param name=\"movie\" value=\"https://www.youtube.com/v/M7lc1UVf-VE?version=3&amp;autoplay=1\"/><param name=\"allowScriptAccess\" value=\"always\"/></object>", 
16 |                 "height": "390", 
17 |                 "width": "640", 
18 |                 "provider": "youtube", 
19 |                 "embed_type": "object"
20 |             },
21 |             {
22 |                 "src": "https://www.youtube.com/v/M7lc1UVf-VE?version=3&autoplay=1", 
23 |                 "embed_code": "<embed src=\"https://www.youtube.com/v/M7lc1UVf-VE?version=3&amp;autoplay=1\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" width=\"640\" height=\"390\"/>", 
24 |                 "height": "390", 
25 |                 "width": "640", 
26 |                 "provider": "youtube", 
27 |                 "embed_type": "embed"
28 |             }
29 |         ]
30 |     }
31 | }


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>test</title>
 5 |     </head>
 6 | 
 7 |     <body>
 8 |         <div>
 9 |             <img class="storytext" src="http://bla.com/images/465395/"/>
10 |         </div>
11 |         <div>
12 |             <img class="mediaimage" src=""/>
13 |         </div>
14 |         <div>
15 |             <h1>title</h1>
16 |             <p>
17 |                 TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
18 |             </p>
19 |             <p>
20 |                 Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
21 |                 For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
22 |             </p>
23 |         </div>
24 |     </body>
25 | </html>
26 | 


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>test</title>
 5 |     </head>
 6 | 
 7 |     <body>
 8 |         <div>
 9 |             <img id="storytext" src="http://bla.com/images/465395/"/>
10 |         </div>
11 |         <div>
12 |             <img id="mediaimage" src="http://go.com/images/465395/"/>
13 |         </div>
14 |         <div>
15 |             <h1>title</h1>
16 |             <p>
17 |                 TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
18 |             </p>
19 |             <p>
20 |                 Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
21 |                 For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
22 |             </p>
23 |         </div>
24 |     </body>
25 | </html>


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>test</title>
 5 |     </head>
 6 | 
 7 |     <body>
 8 |         <div>
 9 |             <img class="storytext" src="http://bla.com/images/465395/"/>
10 |         </div>
11 |         <div>
12 |             <img class="mediaimage" src="http://go.com/images/465395/"/>
13 |         </div>
14 |         <div>
15 |             <h1>title</h1>
16 |             <p>
17 |                 TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
18 |             </p>
19 |             <p>
20 |                 Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
21 |                 For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
22 |             </p>
23 |         </div>
24 |     </body>
25 | </html>


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>test</title>
 5 |     </head>
 6 | 
 7 |     <body>
 8 |         <div id="storytext">
 9 |             <img src="http://bla.com/images/465395/"/>
10 |         </div>
11 |         <div id="mediaimage">
12 |             <img src="http://go.com/images/465395/"/>
13 |         </div>
14 |         <div>
15 |             <h1>title</h1>
16 |             <p>
17 |                 TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
18 |             </p>
19 |             <p>
20 |                 Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
21 |                 For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
22 |             </p>
23 |         </div>
24 |     </body>
25 | </html>


--------------------------------------------------------------------------------
/tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>test</title>
 5 |     </head>
 6 | 
 7 |     <body>
 8 |         <div class="storytext">
 9 |             <img src="http://bla.com/images/465395/"/>
10 |         </div>
11 |         <div class="mediaimage">
12 |             <img src="http://go.com/images/465395/"/>
13 |         </div>
14 |         <div>
15 |             <h1>title</h1>
16 |             <p>
17 |                 TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
18 |             </p>
19 |             <p>
20 |                 Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
21 |                 For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
22 |             </p>
23 |         </div>
24 |     </body>
25 | </html>


--------------------------------------------------------------------------------
/goose/extractors/publishdate.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from goose.extractors import BaseExtractor
25 | 
# Known places where a page may declare its publish date, in lookup order.
# Each entry names:
#   attribute -> the attribute used to locate the element
#   value     -> the value that attribute must have
#   content   -> the attribute of the matched element that holds the date
KNOWN_PUBLISH_DATE_TAGS = [
    dict(attribute='property', value='rnews:datePublished',
         content='content'),
    dict(attribute='property', value='article:published_time',
         content='content'),
    dict(attribute='name', value='OriginalPublicationDate',
         content='content'),
    dict(attribute='itemprop', value='datePublished',
         content='datetime'),
]
32 | 
33 | 
class PublishDateExtractor(BaseExtractor):
    """\
    Extractor that looks up the publish date of an article using the
    entries of KNOWN_PUBLISH_DATE_TAGS.
    """

    def extract(self):
        """\
        Walk KNOWN_PUBLISH_DATE_TAGS in order and query the article
        document for elements whose 'attribute' equals 'value'.

        For the first entry that yields at least one element, return
        the 'content' attribute of the first matching element (as given
        by the parser). Return None when no entry matches.
        """
        for candidate in KNOWN_PUBLISH_DATE_TAGS:
            matches = self.parser.getElementsByTag(
                self.article.doc,
                attr=candidate['attribute'],
                value=candidate['value'])
            if not matches:
                # nothing found for this entry, try the next one
                continue
            # only the first matching element is considered
            return self.parser.getAttribute(matches[0], candidate['content'])
        return None
47 | 


--------------------------------------------------------------------------------
/goose/network.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | import urllib2
24 | 
25 | 
class HtmlFetcher(object):
    """\
    Small urllib2 based fetcher.

    Reads ``browser_user_agent`` and ``http_timeout`` from the given
    config object and keeps the last request / response on the instance
    so the final url can be queried after a fetch.
    """

    def __init__(self, config):
        """\
        :param config: object exposing ``browser_user_agent`` (sent as
            the User-agent header) and ``http_timeout`` (passed to
            urllib2.urlopen as timeout)
        """
        self.config = config
        # set header
        self.headers = {'User-agent': self.config.browser_user_agent}
        # define request / result up front so that get_url() can be
        # called before get_html() without raising AttributeError
        self.request = None
        self.result = None

    def get_url(self):
        """\
        Return the url reported by the last response (``geturl()``),
        or None when there is no response: nothing fetched yet, or the
        last fetch failed.
        """
        # if we have a result
        # get the final_url
        if self.result is not None:
            return self.result.geturl()
        return None

    def get_html(self, url):
        """\
        Fetch ``url`` and return the raw response body.

        Any exception raised while opening the url is swallowed on
        purpose (best effort fetch); in that case the stored result is
        reset to None and None is returned.
        """
        # utf-8 encode unicode url
        if isinstance(url, unicode):
            url = url.encode('utf-8')

        # set request
        self.request = urllib2.Request(
                        url,
                        headers=self.headers)
        # do request
        try:
            self.result = urllib2.urlopen(
                            self.request,
                            timeout=self.config.http_timeout)
        except Exception:
            self.result = None

        # read the result content
        if self.result is not None:
            return self.result.read()
        return None
61 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_msn1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://lifestyle.msn.com/your-life/your-money-today/article.aspx?cp-documentid=31244150", 
 3 |     "expected": {
 4 |         "meta_description": "An anonymous grocery-store manager shares the secrets to lowering your food bill. For more ideas on <a href=\"http://www.realsimple.com/work-life/money/spending/how-to-stop-spending-money-00100000068194/index.html?xid=msn-rs-deck-link-111811\">how to stop spending</a>, check out Real Simple.", 
 5 |         "domain": "lifestyle.msn.com", 
 6 |         "final_url": "http://lifestyle.msn.com/your-life/your-money-today/article.aspx?cp-documentid=31244150", 
 7 |         "meta_keywords": "groceries for less, budget, save money, penny pinching, groceries, food bill", 
 8 |         "cleaned_text": "\"Head to the supermarket an hour before closing time. Some stores mark down prepared foods and bakery items then because they can't sell them the following day. You could get a rotisserie chicken or freshly baked cookies for 50 percent off, or nab two sushi meals for the price of one. If you're planning to host a party or some other gathering, it's worth your time to ask the deli or bakery manager for a 5 to 10 percent discount off your catering order. Also, keep an eye out for online coupons: Some grocery stores accept coupons printed out from sites like TheGroceryGame.com, ShopAtHome.com, and CouponMom.com, even though they rarely publicize the fact. (Find out your store's policy at the customer-service counter.) It also pays to check the market's own website. You could find weekly deals there that it doesn't advertise anywhere else, including its in-store flyers.\n\n\"And even though it's convenient to do all your shopping in one place, avoid going to a grocery store for kitchen supplies, like measuring cups and cookie sheets, or seasonal items, like holiday decorations and gift bags. These products will have inflated prices. Buy them at a big-box chain, like Target or Walmart, instead.\"\n\nMore from Bing and MSN Lifestyle Site Search: Get additional content on saving on your grocery bill", 
 9 |         "meta_favicon": "http://blu.stc.s-msn.com/br/gbl/lg/1/favicon.ico", 
10 |         "meta_lang": "en"
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/extractors/videos.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | from base import TestExtractionBase
24 | 
25 | 
class ImageExtractionTests(TestExtractionBase):
    """\
    Tests for the 'movies' field of an article
    (embed, iframe and object test cases).
    """
    def assert_movies(self, field, expected_value, result_value):
        """\
        Compare result_value (a list of objects) against expected_value
        (a sequence of dicts): both must have the same length and, for
        each position, every key of the expected dict must equal the
        attribute of the same name on the result object.
        """
        # the result has to be a list ...
        self.assertTrue(isinstance(result_value, list))
        # ... holding as many videos as expected
        self.assertEqual(len(expected_value), len(result_value))

        # compare every expected key with the matching attribute
        for index, movie in enumerate(result_value):
            for attr_name, wanted in expected_value[index].items():
                self.assertEqual(getattr(movie, attr_name), wanted)

    def test_embed(self):
        self.runArticleAssertions(
            article=self.getArticle(), fields=['movies'])

    def test_iframe(self):
        self.runArticleAssertions(
            article=self.getArticle(), fields=['movies'])

    def test_object(self):
        self.runArticleAssertions(
            article=self.getArticle(), fields=['movies'])


--------------------------------------------------------------------------------
/tests/data/extractors/authors/test_author_schema.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <body>
 3 |         <p class="byline-dateline">
 4 |             <span class="byline" itemprop="author creator" itemscope="" itemtype="http://schema.org/Person" itemid="http://topics.nytimes.com/top/reference/timestopics/people/s/kevin_sack/index.html">By <a href="http://topics.nytimes.com/top/reference/timestopics/people/s/kevin_sack/index.html" rel="author" title="More Articles by KEVIN SACK"><span class="byline-author" data-byline-name="KEVIN SACK" itemprop="name" data-twitter-handle="ksacknyt">KEVIN SACK</span></a>, </span><span class="byline" itemprop="author creator" itemscope="" itemtype="http://schema.org/Person" itemid="http://topics.nytimes.com/top/reference/timestopics/people/f/sheri_fink/index.html"><a href="http://topics.nytimes.com/top/reference/timestopics/people/f/sheri_fink/index.html" rel="author" title="More Articles by SHERI FINK"><span class="byline-author" data-byline-name="SHERI FINK" itemprop="name">SHERI FINK</span></a>, </span><span class="byline" itemprop="author creator" itemscope="" itemtype="http://schema.org/Person" itemid="http://topics.nytimes.com/top/reference/timestopics/people/b/pam_belluck/index.html"><a href="http://topics.nytimes.com/top/reference/timestopics/people/b/pam_belluck/index.html" rel="author" title="More Articles by PAM BELLUCK"><span class="byline-author" data-byline-name="PAM BELLUCK" itemprop="name">PAM BELLUCK</span></a> and </span><span class="byline" itemprop="author creator" itemscope="" itemtype="http://schema.org/Person" itemid="http://topics.nytimes.com/top/reference/timestopics/people/n/adam_nossiter/index.html"><a href="http://topics.nytimes.com/top/reference/timestopics/people/n/adam_nossiter/index.html" rel="author" title="More Articles by ADAM NOSSITER"><span class="byline-author" data-byline-name="ADAM NOSSITER" itemprop="name">ADAM NOSSITER</span></a></span>
 5 |         </p>
 6 |         <div itemprop="articleBody">
 7 |             <p>
 8 |               TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
 9 |             </p>
10 |         </div>
11 |     </body>
12 | </html>
13 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | import os
25 | from setuptools import setup, find_packages
26 | from imp import load_source
27 | 
# load goose/version.py by file path under the module name "version";
# its __version__ attribute is passed to setup() below
_version_file = os.path.join("goose", "version.py")
version = load_source("version", _version_file)
29 | 
# Trove classifiers passed to setup(); each entry must appear only once
# (the generic 'Programming Language :: Python' entry used to be listed twice)
CLASSIFIERS = [
    'Development Status :: 4 - Beta',
    'Environment :: Other Environment',
    'Intended Audience :: Developers',
    'License :: OSI Approved :: Apache Software License',
    'Operating System :: MacOS :: MacOS X',
    'Operating System :: POSIX',
    'Operating System :: Microsoft :: Windows',
    'Programming Language :: Python',
    'Programming Language :: Python :: 2',
    'Programming Language :: Python :: 2.6',
    'Programming Language :: Python :: 2.7',
    'Topic :: Internet',
    'Topic :: Utilities',
    'Topic :: Software Development :: Libraries :: Python Modules']
46 | 
description = "Html Content / Article Extractor, web scrapping"

# long description: content of the README.rst sitting next to this file;
# on any error fall back to the short description
try:
    with open(os.path.join(os.path.dirname(__file__), 'README.rst')) as readme:
        long_description = readme.read()
except Exception:
    long_description = description

setup(
    name='goose-extractor',
    version=version.__version__,
    description=description,
    long_description=long_description,
    keywords='scrapping, extractor, web scrapping',
    classifiers=CLASSIFIERS,
    author='Xavier Grangier',
    author_email='grangier@gmail.com',
    url='https://github.com/grangier/python-goose',
    license='Apache',
    packages=find_packages(),
    include_package_data=True,
    zip_safe=False,
    install_requires=[
        'Pillow',
        'lxml',
        'cssselect',
        'jieba',
        'beautifulsoup',
        'nltk',
    ],
    test_suite="tests",
)
72 | 


--------------------------------------------------------------------------------
/tests/extractors/tags.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | from base import TestExtractionBase
25 | 
26 | 
class TestArticleTags(TestExtractionBase):
    """\
    Tests for the 'tags' field of an article.
    """

    def assert_tags(self, field, expected_value, result_value):
        """\
        Check that result_value and expected_value hold the same tags,
        regardless of order: both must have the same number of items
        (after expected_value has been turned into a set) and every
        tag of result_value must be part of expected_value.
        """
        # turn expected_value into a set so that the order of the tags
        # does not matter and membership checks are cheap
        expected_value = set(expected_value)

        # check if both have the same number of items
        # (note the trailing space: the two literals are concatenated)
        msg = (u"expected tags set and result tags set "
                u"don't have the same number of items")
        self.assertEqual(len(result_value), len(expected_value), msg=msg)

        # check if each tag in result_value is in expected_value
        for tag in result_value:
            self.assertTrue(tag in expected_value)

    def test_tags_kexp(self):
        article = self.getArticle()
        fields = ['tags']
        self.runArticleAssertions(article=article, fields=fields)

    def test_tags_deadline(self):
        article = self.getArticle()
        fields = ['tags']
        self.runArticleAssertions(article=article, fields=fields)

    def test_tags_wnyc(self):
        article = self.getArticle()
        fields = ['tags']
        self.runArticleAssertions(article=article, fields=fields)

    def test_tags_cnet(self):
        article = self.getArticle()
        fields = ['tags']
        self.runArticleAssertions(article=article, fields=fields)

    def test_tags_abcau(self):
        """
        Test ABC Australia page with "topics" tags
        """
        article = self.getArticle()
        fields = ['tags']
        self.runArticleAssertions(article=article, fields=fields)
73 | 


--------------------------------------------------------------------------------
/tests/data/extractors/tweets/test_tweet.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <body>
 3 |         <div>
 4 |             <p>
 5 |               TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. 
 6 |               <blockquote class="twitter-tweet" lang="en"><p>Python-Goose: useful <a href="https://twitter.com/hashtag/python?src=hash">#python</a> library for extracting body text + metadata from a news article or article-type page: <a href="https://t.co/OGKBvxxunu">https://t.co/OGKBvxxunu</a></p>&mdash; Derek Greene (@derekgreene) <a href="https://twitter.com/derekgreene/status/527783221539643393">October 30, 2014</a></blockquote>
 7 |               <script async src="//platform.twitter.com/widgets.js" charset="utf-8"></script>
 8 |               Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
 9 |             </p>
10 |         </div>
11 |         <div itemprop="articleBody">
12 |             <p>
13 |               TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
14 |               <blockquote class="twitter-tweet" lang="en"><p>Python-Goose: useful <a href="https://twitter.com/hashtag/python?src=hash">#python</a> library for extracting body text + metadata from a news article or article-type page: <a href="https://t.co/OGKBvxxunu">https://t.co/OGKBvxxunu</a></p>&mdash; Derek Greene (@derekgreene) <a href="https://twitter.com/derekgreene/status/527783221539643393">October 30, 2014</a></blockquote>
15 |               <script async src="//platform.twitter.com/widgets.js" charset="utf-8"></script>
16 |               <blockquote class="twitter-tweet" lang="en"><p>Python-Goose: useful <a href="https://twitter.com/hashtag/python?src=hash">#python</a> library for extracting body text + metadata from a news article or article-type page: <a href="https://t.co/OGKBvxxunu">https://t.co/OGKBvxxunu</a></p>&mdash; Derek Greene (@derekgreene) <a href="https://twitter.com/derekgreene/status/527783221539643393">October 30, 2014</a></blockquote>
17 |             <script async src="//platform.twitter.com/widgets.js" charset="utf-8"></script>
18 |             </p>
19 |         </div>
20 |     </body>
21 | </html>
22 | 


--------------------------------------------------------------------------------
/goose/image.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | 
class Image(object):
    """Holds the data describing one extracted image.

    Every attribute starts out with a neutral placeholder value (``None``,
    an empty string, zero or ``"NA"``).
    """

    def __init__(self):
        # holds the Element node of the image we think is top dog
        self.top_image_node = None

        # holds the src of the image
        self.src = ""

        # how confident are we in this image extraction?
        # generally, the more images there are, the less confident we are
        self.confidence_score = 0.0

        # height of the image in pixels
        self.height = 0

        # width of the image in pixels
        self.width = 0

        # what kind of image extraction was used for this?
        # bestGuess, linkTag, openGraph tags?
        self.extraction_type = "NA"

        # stores how many bytes this image is.
        # A plain 0 is used rather than long(0): the ``long`` builtin does
        # not exist on Python 3, and Python 2 ints promote to long on demand.
        self.bytes = 0

    def get_src(self):
        """Return the ``src`` value stored on this image."""
        return self.src
53 | 
54 | 
class ImageDetails(object):
    """Dimensions and mime type of an image, with accessor methods.

    The values are stored as plain public attributes (``width``,
    ``height``, ``mime_type``); the ``get_*`` / ``set_*`` methods simply
    read and write those attributes.
    """

    def __init__(self):
        # both dimensions start at zero
        self.width = self.height = 0

        # the mime_type of the image JPEG / PNG; unset until assigned
        self.mime_type = None

    # -- width ---------------------------------------------------------

    def get_width(self):
        """Return the stored width."""
        return self.width

    def set_width(self, width):
        """Store ``width`` as the image width."""
        self.width = width

    # -- height --------------------------------------------------------

    def get_height(self):
        """Return the stored height."""
        return self.height

    def set_height(self, height):
        """Store ``height`` as the image height."""
        self.height = height

    # -- mime type -----------------------------------------------------

    def get_mime_type(self):
        """Return the stored mime type (``None`` until one is set)."""
        return self.mime_type

    def set_mime_type(self, mime_type):
        """Store ``mime_type`` as the image mime type."""
        self.mime_type = mime_type
85 | 
86 | 
class LocallyStoredImage(object):
    """Plain record describing an image and its locally stored copy.

    All constructor arguments are optional and are stored unchanged as
    attributes of the same name.
    """

    # NOTE: the ``bytes`` default is a plain 0 rather than long(0). Default
    # values are evaluated when the ``def`` runs, so ``long(0)`` made the
    # whole module fail with NameError on Python 3, where ``long`` is gone.
    # The parameter name shadows the builtin but is kept for compatibility
    # with callers passing it by keyword.
    def __init__(self, src='', local_filename='', link_hash='', bytes=0,
                 file_extension='', height=0, width=0):
        self.src = src
        self.local_filename = local_filename
        self.link_hash = link_hash
        self.bytes = bytes
        self.file_extension = file_extension
        self.height = height
        self.width = width
98 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-pl.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | aby
  3 | ach
  4 | acz
  5 | aczkolwiek
  6 | aj
  7 | albo
  8 | ale
  9 | ależ
 10 | ani
 11 | aż
 12 | bardziej
 13 | bardzo
 14 | bo
 15 | bowiem
 16 | by
 17 | byli
 18 | bynajmniej
 19 | być
 20 | był
 21 | była
 22 | było
 23 | były
 24 | będzie
 25 | będą
 26 | cali
 27 | cała
 28 | cały
 29 | ci
 30 | cię
 31 | ciebie
 32 | co
 33 | cokolwiek
 34 | coś
 35 | czasami
 36 | czasem
 37 | czemu
 38 | czy
 39 | czyli
 40 | daleko
 41 | dla
 42 | dlaczego
 43 | dlatego
 44 | do
 45 | dobrze
 46 | dokąd
 47 | dość
 48 | dużo
 49 | dwa
 50 | dwaj
 51 | dwie
 52 | dwoje
 53 | dziś
 54 | dzisiaj
 55 | gdy
 56 | gdyby
 57 | gdyż
 58 | gdzie
 59 | gdziekolwiek
 60 | gdzieś
 61 | i
 62 | ich
 63 | ile
 64 | im
 65 | inna
 66 | inne
 67 | inny
 68 | innych
 69 | iż
 70 | ja
 71 | ją
 72 | jak
 73 | jakaś
 74 | jakby
 75 | jaki
 76 | jakichś
 77 | jakie
 78 | jakiś
 79 | jakiż
 80 | jakkolwiek
 81 | jako
 82 | jakoś
 83 | je
 84 | jeden
 85 | jedna
 86 | jedno
 87 | jednak
 88 | jednakże
 89 | jego
 90 | jej
 91 | jemu
 92 | jest
 93 | jestem
 94 | jeszcze
 95 | jeśli
 96 | jeżeli
 97 | już
 98 | ją
 99 | każdy
100 | kiedy
101 | kilka
102 | kimś
103 | kto
104 | ktokolwiek
105 | ktoś
106 | która
107 | które
108 | którego
109 | której
110 | który
111 | których
112 | którym
113 | którzy
114 | ku
115 | lat
116 | lecz
117 | lub
118 | ma
119 | mają
120 | mało
121 | mam
122 | mi
123 | mimo
124 | między
125 | mną
126 | mnie
127 | mogą
128 | moi
129 | moim
130 | moja
131 | moje
132 | może
133 | możliwe
134 | można
135 | mój
136 | mu
137 | musi
138 | my
139 | na
140 | nad
141 | nam
142 | nami
143 | nas
144 | nasi
145 | nasz
146 | nasza
147 | nasze
148 | naszego
149 | naszych
150 | natomiast
151 | natychmiast
152 | nawet
153 | nią
154 | nic
155 | nich
156 | nie
157 | niech
158 | niego
159 | niej
160 | niemu
161 | nigdy
162 | nim
163 | nimi
164 | niż
165 | no
166 | o
167 | obok
168 | od
169 | około
170 | on
171 | ona
172 | one
173 | oni
174 | ono
175 | oraz
176 | oto
177 | owszem
178 | pan
179 | pana
180 | pani
181 | po
182 | pod
183 | podczas
184 | pomimo
185 | ponad
186 | ponieważ
187 | powinien
188 | powinna
189 | powinni
190 | powinno
191 | poza
192 | prawie
193 | przecież
194 | przed
195 | przede
196 | przedtem
197 | przez
198 | przy
199 | roku
200 | również
201 | sam
202 | sama
203 | są
204 | się
205 | skąd
206 | sobie
207 | sobą
208 | sposób
209 | swoje
210 | ta
211 | tak
212 | taka
213 | taki
214 | takie
215 | także
216 | tam
217 | te
218 | tego
219 | tej
220 | temu
221 | ten
222 | teraz
223 | też
224 | to
225 | tobą
226 | tobie
227 | toteż
228 | trzeba
229 | tu
230 | tutaj
231 | twoi
232 | twoim
233 | twoja
234 | twoje
235 | twym
236 | twój
237 | ty
238 | tych
239 | tylko
240 | tym
241 | u
242 | w
243 | wam
244 | wami
245 | was
246 | wasz
247 | wasza
248 | wasze
249 | we
250 | według
251 | wiele
252 | wielu
253 | więc
254 | więcej
255 | wszyscy
256 | wszystkich
257 | wszystkie
258 | wszystkim
259 | wszystko
260 | wtedy
261 | wy
262 | właśnie
263 | z
264 | za
265 | zapewne
266 | zawsze
267 | ze
268 | zł
269 | znowu
270 | znów
271 | został
272 | żaden
273 | żadna
274 | żadne
275 | żadnych
276 | że
277 | żeby


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-fr.txt:
--------------------------------------------------------------------------------
  1 | # Licensed to the Apache Software Foundation (ASF) under one or more
  2 | # contributor license agreements.  See the NOTICE file distributed with
  3 | # this work for additional information regarding copyright ownership.
  4 | # The ASF licenses this file to You under the Apache License, Version 2.0
  5 | # (the "License"); you may not use this file except in compliance with
  6 | # the License.  You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | #-----------------------------------------------------------------------
 17 | # a couple of test stopwords to test that the words are really being
 18 | # configured from this file:
 19 | stopworda
 20 | stopwordb
 21 | 
 22 | #Standard english stop words taken from Lucene's StopAnalyzer
 23 | a
 24 | an
 25 | and
 26 | are
 27 | as
 28 | at
 29 | be
 30 | but
 31 | by
 32 | for
 33 | if
 34 | in
 35 | into
 36 | is
 37 | it
 38 | no
 39 | not
 40 | of
 41 | on
 42 | or
 43 | s
 44 | such
 45 | t
 46 | that
 47 | the
 48 | their
 49 | then
 50 | there
 51 | these
 52 | they
 53 | this
 54 | to
 55 | was
 56 | will
 57 | with
 58 | au
 59 | aux
 60 | avec
 61 | ce
 62 | ces
 63 | dans
 64 | de
 65 | des
 66 | du
 67 | elle
 68 | en
 69 | et
 70 | eux
 71 | il
 72 | je
 73 | la
 74 | le
 75 | leur
 76 | lui
 77 | ma
 78 | mais
 79 | me
 80 | même
 81 | mes
 82 | moi
 83 | mon
 84 | ne
 85 | nos
 86 | notre
 87 | nous
 88 | on
 89 | ou
 90 | par
 91 | pas
 92 | pour
 93 | qu
 94 | que
 95 | qui
 96 | sa
 97 | se
 98 | ses
 99 | son
100 | sur
101 | ta
102 | te
103 | tes
104 | toi
105 | ton
106 | tu
107 | un
108 | une
109 | vos
110 | votre
111 | vous
112 | c
113 | d
114 | j
115 | l
116 | à
117 | m
118 | n
119 | s
120 | t
121 | y
122 | été
123 | étée
124 | étées
125 | étés
126 | étant
127 | suis
128 | es
129 | est
130 | sommes
131 | êtes
132 | sont
133 | serai
134 | seras
135 | sera
136 | serons
137 | serez
138 | seront
139 | serais
140 | serait
141 | serions
142 | seriez
143 | seraient
144 | étais
145 | était
146 | étions
147 | étiez
148 | étaient
149 | fus
150 | fut
151 | fûmes
152 | fûtes
153 | furent
154 | sois
155 | soit
156 | soyons
157 | soyez
158 | soient
159 | fusse
160 | fusses
161 | fût
162 | fussions
163 | fussiez
164 | fussent
165 | ayant
166 | eu
167 | eue
168 | eues
169 | eus
170 | ai
171 | as
172 | avons
173 | avez
174 | ont
175 | aurai
176 | auras
177 | aura
178 | aurons
179 | aurez
180 | auront
181 | aurais
182 | aurait
183 | aurions
184 | auriez
185 | auraient
186 | avais
187 | avait
188 | avions
189 | aviez
190 | avaient
191 | eut
192 | eûmes
193 | eûtes
194 | eurent
195 | aie
196 | aies
197 | ait
198 | ayons
199 | ayez
200 | aient
201 | eusse
202 | eusses
203 | eût
204 | eussions
205 | eussiez
206 | eussent
207 | ceci
208 | celà
209 | cet
210 | cette
211 | ici
212 | ils
213 | les
214 | leurs
215 | quel
216 | quels
217 | quelle
218 | quelles
219 | sans
220 | soi
221 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_donga_korean.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://news.donga.com/3/all/20131023/58406128/1",
 3 |     "target_language": "ko",
 4 |     "expected": {
 5 |         "meta_description": "경기도 용인에 자리 잡은 민간 시험인증 전문기업 ㈜디지털이엠씨(www.digitalemc.com). 14년째 세계 각국의 통신·안전·전파 규격 시험과 인증 한 우물만 파고 있는 이 회사 박채규 대표가 만나기로 한 주인공이다. 그는 전기전자·무선통신·자동차 전장품 분야에서 국제적으로 인정받는 전문가다.    “시험인증 분야는 새로운 기술에 대한 준비와 교육 없이는 아무것도 이루어 낼 수 없습니다. 따라서 임직원의 교육과 자질 향상을 위해 많은 시간과 비용을 투자하고 있지요.”    ㈜디지털이엠씨의 하루 일과는 항상 바쁘게 돌아간다. 이 회사에선 원어민이 진행하는 사내 영어강좌가 1년 내내 이어진다. 파트와 팀별로 전문기술교육, 세미나 등도 수시로 이뤄진다. 해외 인증기관과의 교류가 무엇보다 중요한 업무이기에 해당 국가의 규격, 시험, 인증에 대한 교육 강도는 상상을 초월한다. 회사의 막내에서부터 CEO까지 막힘없는 ‘소통경영’이 이루어지는 곳이 ㈜디지털이엠씨다. 이 회사 이동훈 이사는 “소통을 수용하는 열", 
 6 |         "domain": "news.donga.com", 
 7 |         "final_url": "http://news.donga.com/3/all/20131023/58406128/1", 
 8 |         "meta_keywords": "뉴스, 기사, 속보, 정치, 경제, 사회, 국제, 문화, 사설, 컬럼, 동아일보, 동아닷컴, news, donga.com", 
 9 |         "cleaned_text": "경기도 용인에 자리 잡은 민간 시험인증 전문기업 ㈜디지털이엠씨(www.digitalemc.com). 14년째 세계 각국의 통신·안전·전파 규격 시험과 인증 한 우물만 파고 있는 이 회사 박채규 대표가 만나기로 한 주인공이다. 그는 전기전자·무선통신·자동차 전장품 분야에서 국제적으로 인정받는 전문가다.“시험인증 분야는 새로운 기술에 대한 준비와 교육 없이는 아무것도 이루어 낼 수 없습니다. 따라서 임직원의 교육과 자질 향상을 위해 많은 시간과 비용을 투자하고 있지요.”㈜디지털이엠씨의 하루 일과는 항상 바쁘게 돌아간다. 이 회사에선 원어민이 진행하는 사내 영어강좌가 1년 내내 이어진다. 파트와 팀별로 전문기술교육, 세미나 등도 수시로 이뤄진다. 해외 인증기관과의 교류가 무엇보다 중요한 업무이기에 해당 국가의 규격, 시험, 인증에 대한 교육 강도는 상상을 초월한다. 회사의 막내에서부터 CEO까지 막힘없는 ‘소통경영’이 이루어지는 곳이 ㈜디지털이엠씨다. 이 회사 이동훈 이사는 “소통을 수용하는 열린 기업문화, 창의적인 열정을 끌어내는 젊고 합리적인 리더십이 직원들이 몰입할 수 있는 일터를 만든 비결”이라고 귀띔했다.1999년 설립된 ㈜디지털이엠씨는 철저한 업무 분장을 통한 리스크관리 체계를 갖추고 있다. 기술시험 및 인증연구소와 영업팀, 관리팀, 해외지사(베이징·타슈켄트), 합작법인(브라질)으로 조직됐다.분야별 시험시설 장비 기술전문인력 등은 세계 어느 곳과 경쟁해도 손색 없을 정도로 높은 수준을 자랑한다. 삼성, LG를 포함해 1300여 개 고객사를 대상으로 전 세계 150여 개 국가의 규격 인증 서비스를 제공하고 있다.창립 15주년을 준비하는 ㈜디지털이엠씨의 각오는 남다르다. 우선 5월 동종업계가 엄두도 못내는 막대한 투자를 단행해 의료기기 전문 인증센터를 개관했다. 유럽연합(EU) 미국 중국 브라질 등 전 세계 180여 개 국가의 표준 시험과 인증시스템을 구축한 것은 기존 인증서비스 용역의 통념을 깨는 사건이었다.서비스도 특화했다. 중소형 의료기기는 물론이고, 초대형 X선 기기까지 한 곳에서 시험하고 인증을 받을 수 있는 특별함을 갖췄다. 주목되는 것은 이 회사의 의료기기 전문 인증센터가 16일 식품의약품안전처로부터 사설 기관 최초로 시험 기관으로 지정받았다는 점이다.이 일을 계기로 ㈜디지털이엠씨는 더 다양한 스펙트럼의 인증 서비스를 제공하겠다고 선언했다. 이 회사는 이달 말 이란 국가인증(CRA) 공인시험소 및 인증서 발급기관으로 확정된다. 해당 업무가 개시되면 수출입 기업의 고충 해결은 물론이고 일본, 중국 등의 인접국가로부터도 상당한 시험, 인증 수주가 예상되고 있다.㈜디지털이엠씨가 주목하는 ‘이머징마켓(신흥시장)’ 이란은 원유생산량 등 보유자원이 막대해 잠재 구매력이 매우 큰 시장으로 평가되고 있지만, 그동안 서방세계와의 정치적 문제로 인해 경제활동에 많은 제약을 받아 왔다. 하지만 최근 변화의 바람이 불고 있어 향후 이란 시장을 노크하는 수출기업들이 늘어날 것으로 보인다.“다국적 시험인증평가 업체들은 산업혁명이 일어난 이후 무려 200여 년의 역사를 가지고 있습니다. 매출 규모도 엄청나지요. 어려운 싸움이 되겠지만 지금까지 국내에서 탄탄한 기본기를 쌓은 만큼 해외시장 개척도 어렵지 않을 것으로 보입니다.”㈜디지털이엠씨 박채규 대표는 이제는 좁은 국내를 넘어 글로벌 인증기업과 승부해야 할 때라고 강조한다. 한 해 매출 수 조원을 올리며 전 세계에서 보폭을 확대해가는 다국적 기업에 맞서 회사의 면역력을 키워나가겠다는 의지다.그는 시험·인증 업무에 남다른 열정을 표시한다. “세계로 뻗어가는 국내 기업들이 많은 국가들의 법령과 규제에 낙오되지 않고 선제적으로 대응할 수 있도록 지원에 나가는 시험인증 사업을 한다는 것 자체가 자부심과 책임감을 갖게 해준다”는 것이다.회사가 매년 20∼30%의 안정적인 성장세를 기록하고 있지만, 그는 시험·인증 산업에서 더욱 시장 경쟁력을 확보하기 위한 본격적인 행보를 내디뎠다. 
최근 국내 첫 식약처 지정 사설 의료기기 전문 인증센터를 설립한 데 이어, 이란 등 특수국가의 공인시험 자격도 획득했다. 2년 후엔 해당 분야 최초로 상장 계획도 가지고 있다. 그는 회사의 발전이 곧 국가의 발전이라는 생각으로 장밋빛 미래를 설계 중이다.“우리가 가지고 있는 목표는 확고합니다. 젊은 조직의 열정과 변화를 두려워하지 않는 도전의식을 가지고 세계 초일류 시험인증 기관으로 도약할 겁니다. 회사가 글로벌 기업으로 발전한다는 청사진에 한 번도 의문을 가진 적이 없습니다.”시험인증 분야에서 100년 기업을 일구겠다는 그는 ㈜디지털이엠씨가 앞으로 세계 시험인증 시장의 강자로 군림할 날이 꼭 올 것이라는 확신에 차 있다.최윤호 기자 uknow@donga.com", 
10 |         "meta_favicon": "", 
11 |         "meta_lang": ""
12 |     }
13 | }
14 | 


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_issue24.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>Paragraph Order Test</title>
 5 |     </head>
 6 | 
 7 |     <body>
 8 |         <div>
 9 |             TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
10 | 
11 |  Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
12 |  For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
13 |             <p>Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically.
14 |  The classpath for compilation is taken from the Eclipse project .classpath file. You may take the source directory from there as well if you wish (exercise to the reader).
15 |  The params are not passed to the fsc in the command line since in my project's case the line is too long for the OS to handle. The alternative is to put it into a file and let fsc handle it for you.</p>
16 | 
17 |             TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code.
18 |  In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day.
19 |             <p>Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle.</p>
20 |         </div>
21 |     </body>
22 | </html>


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_issue24.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://danielspicar.github.com/goose-bug.html", 
 3 |     "expected": {
 4 |         "meta_description": "", 
 5 |         "domain": "danielspicar.github.com", 
 6 |         "final_url": "http://danielspicar.github.com/goose-bug.html", 
 7 |         "meta_keywords": "", 
 8 |         "cleaned_text": "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity. For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.\n\nParagraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically. The classpath for compilation is taken from the Eclipse project .classpath file. You may take the source directory from there as well if you wish (exercise to the reader). The params are not passed to the fsc in the command line since in my project's case the line is too long for the OS to handle. The alternative is to put it into a file and let fsc handle it for you.\n\nTextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. 
We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code. In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day.\n\nParagraph 2 - In the next few weeks we are adding a new rule from the \"not critical\" list every few days. The goal is to have all the rules we think are important without the common \"its to noisy, lets ignore it\" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle.", 
 9 |         "meta_favicon": "", 
10 |         "meta_lang": null
11 |     }
12 | }


--------------------------------------------------------------------------------
/tests/data/extractors/videos/test_iframe.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>test video</title>
 5 |     </head>
 6 | 
 7 |     <body>
 8 |         <div>
 9 |             <p>
10 |             TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
11 |             </p>
12 |             <p>
13 |                 <iframe frameborder="0" width="480" height="270" src="http://www.dailymotion.com/embed/video/x130bpf"></iframe>
14 |             </p>
15 |             <p>
16 |             Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
17 |             For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
18 |             </p>
19 |             <p>
20 |                 Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically.
21 |             </p>
22 |             <p>
23 |                 <iframe frameborder="0" width="480" height="270" src="http://www.dailymotion.com/embed/video/x130bpf"></iframe>
24 |             </p>
25 |             <p>
26 |             TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code.
27 |             In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day.
28 |             </p>
29 |             <p>
30 |                 Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle.
31 |             </p>
32 |         </div>
33 |     </body>
34 | </html>


--------------------------------------------------------------------------------
/tests/data/extractors/content/test_elpais.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "url": "http://www.sociedad.elpais.com/sociedad/2012/10/27/actualidad/1351332873_157836.html",
 3 |     "target_language": "es",
 4 |     "expected": {
 5 |         "meta_description": "Los recortes elevan la demora para operarse un 125% en a\u00f1o y medio. El n\u00famero de pacientes que esperan m\u00e1s de 180 d\u00edas para entrar en quir\u00f3fano crece un 178%", 
 6 |         "domain": "www.sociedad.elpais.com", 
 7 |         "final_url": "http://www.sociedad.elpais.com/sociedad/2012/10/27/actualidad/1351332873_157836.html", 
 8 |         "meta_keywords": "lista, espera, agravar, recorte, elevar, demora, operar, 125 %, a\u00f1o, medio, n\u00famero, paciente, aguardar, 180, d\u00eda, entrar, quir\u00f3fano, crecer, 178 %", 
 9 |         "cleaned_text": "Los recortes pasan factura a los pacientes.", 
10 |         "tags": [
11 |             "Asistencia sanitaria", 
12 |             "Igualdad", 
13 |             "Copa Davis", 
14 |             "Copa del Rey de F\u00fatbol", 
15 |             "Motociclismo", 
16 |             "Sistema sanitario", 
17 |             "Defensor del Lector", 
18 |             "Cine", 
19 |             "Vacunaci\u00f3n", 
20 |             "Giro de Italia", 
21 |             "Comunicaci\u00f3n", 
22 |             "Elecciones EE UU 2012", 
23 |             "Sanidad", 
24 |             "\u00daLTIMA HORA", 
25 |             "Columnas", 
26 |             "Pol\u00edtica social", 
27 |             "Medicina", 
28 |             "Ciencia", 
29 |             "Wimbledon", 
30 |             "Educaci\u00f3n", 
31 |             "US Open", 
32 |             "Videos Champions", 
33 |             "Administraci\u00f3n auton\u00f3mica", 
34 |             "Oscars", 
35 |             "Farmacias", 
36 |             "Salud", 
37 |             "M\u00e1s temas \u00bb", 
38 |             "Especialidades m\u00e9dicas", 
39 |             "F\u00fatbol", 
40 |             "Europa Convulsa", 
41 |             "Roland Garros", 
42 |             "Golf", 
43 |             "Gastronom\u00eda", 
44 |             "Copa del Rey Basket", 
45 |             "Tour de Francia", 
46 |             "Vuelta Espa\u00f1a", 
47 |             "C\u00e1ritas", 
48 |             "Otros Deportes", 
49 |             "F\u00f3rmula 1", 
50 |             "Champions League", 
51 |             "Ciclismo", 
52 |             "27 OCT 2012 - 12:14 CET", 
53 |             "Listas espera", 
54 |             "Editoriales", 
55 |             "RTVE", 
56 |             "Custodia hijos", 
57 |             "Juegos Ol\u00edmpicos", 
58 |             "Titulares \u00bb", 
59 |             "Vi\u00f1etas", 
60 |             "Sanidad p\u00fablica", 
61 |             "Atenci\u00f3n al paciente", 
62 |             "Tribunas", 
63 |             "Moda", 
64 |             "M\u00fasica", 
65 |             "Tenis", 
66 |             "Medio Ambiente", 
67 |             "Recortes sociales", 
68 |             "Teatro/Danza", 
69 |             "Baloncesto", 
70 |             "Comunidades aut\u00f3nomas", 
71 |             "Selecci\u00f3n Espa\u00f1ola", 
72 |             "Open Australia", 
73 |             "El Espa\u00f1ol", 
74 |             "Consumo", 
75 |             "Enfermedades raras", 
76 |             "Huelga General", 
77 |             "Declaracion Renta", 
78 |             "Elecciones Francia 2012", 
79 |             "Sociedad", 
80 |             "Elecciones Generales", 
81 |             "Libros", 
82 |             "Coches con Estilo", 
83 |             "El final de ETA", 
84 |             "Administraci\u00f3n p\u00fablica", 
85 |             "Centrales nucleares", 
86 |             "\u00cdndice", 
87 |             "Pacientes"
88 |         ], 
89 |         "meta_favicon": "http://ep01.epimg.net/favicon.png", 
90 |         "meta_lang": "es"
91 |     }
92 | }


--------------------------------------------------------------------------------
/tests/data/extractors/videos/test_embed.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>test video</title>
 5 |     </head>
 6 | 
 7 |     <body>
 8 |         <div>
 9 |             <p>
10 |             TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
11 |             </p>
12 |             <p>
13 |                 <embed src="https://www.youtube.com/v/M7lc1UVf-VE?version=3&autoplay=1"
14 |                             type="application/x-shockwave-flash"
15 |                             allowscriptaccess="always"
16 |                             width="640" height="390"></embed>
17 |             </p>
18 |             <p>
19 |             Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
20 |             For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
21 |             </p>
22 |             <p>
23 |                 Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically.
24 |             </p>
25 |             <p>
26 |                 <embed src="https://www.youtube.com/v/M7lc1UVf-VE?version=3&autoplay=1"
27 |                             type="application/x-shockwave-flash"
28 |                             allowscriptaccess="always"
29 |                             width="640" height="390"></embed>
30 |             </p>
31 |             <p>
32 |             TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code.
33 |             In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day.
34 |             </p>
35 |             <p>
36 |                 Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle.
37 |             </p>
38 |         </div>
39 |     </body>
40 | </html>


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-es.txt:
--------------------------------------------------------------------------------
  1 | de
  2 | la
  3 | que
  4 | el
  5 | en
  6 | y
  7 | a
  8 | los
  9 | del
 10 | se
 11 | las
 12 | por
 13 | un
 14 | para
 15 | con
 16 | no
 17 | una
 18 | su
 19 | al
 20 | lo
 21 | como
 22 | más
 23 | pero
 24 | sus
 25 | le
 26 | ya
 27 | o
 28 | este
 29 | sí
 30 | porque
 31 | esta
 32 | entre
 33 | cuando
 34 | muy
 35 | sin
 36 | sobre
 37 | también
 38 | me
 39 | hasta
 40 | hay
 41 | donde
 42 | quien
 43 | desde
 44 | todo
 45 | nos
 46 | durante
 47 | todos
 48 | uno
 49 | les
 50 | ni
 51 | contra
 52 | otros
 53 | ese
 54 | eso
 55 | ante
 56 | ellos
 57 | e
 58 | esto
 59 | mí
 60 | antes
 61 | algunos
 62 | qué
 63 | unos
 64 | yo
 65 | otro
 66 | otras
 67 | otra
 68 | él
 69 | tanto
 70 | esa
 71 | estos
 72 | mucho
 73 | quienes
 74 | nada
 75 | muchos
 76 | cual
 77 | poco
 78 | ella
 79 | estar
 80 | estas
 81 | algunas
 82 | algo
 83 | nosotros
 84 | mi
 85 | mis
 86 | tú
 87 | te
 88 | ti
 89 | tu
 90 | tus
 91 | ellas
 92 | nosotras
 93 | vosotros
 94 | vosotras
 95 | os
 96 | mío
 97 | mía
 98 | míos
 99 | mías
100 | tuyo
101 | tuya
102 | tuyos
103 | tuyas
104 | suyo
105 | suya
106 | suyos
107 | suyas
108 | nuestro
109 | nuestra
110 | nuestros
111 | nuestras
112 | vuestro
113 | vuestra
114 | vuestros
115 | vuestras
116 | esos
117 | esas
118 | estoy
119 | estás
120 | está
121 | estamos
122 | estáis
123 | están
124 | esté
125 | estés
126 | estemos
127 | estéis
128 | estén
129 | estaré
130 | estarás
131 | estará
132 | estaremos
133 | estaréis
134 | estarán
135 | estaría
136 | estarías
137 | estaríamos
138 | estaríais
139 | estarían
140 | estaba
141 | estabas
142 | estábamos
143 | estabais
144 | estaban
145 | estuve
146 | estuviste
147 | estuvo
148 | estuvimos
149 | estuvisteis
150 | estuvieron
151 | estuviera
152 | estuvieras
153 | estuviéramos
154 | estuvierais
155 | estuvieran
156 | estuviese
157 | estuvieses
158 | estuviésemos
159 | estuvieseis
160 | estuviesen
161 | estando
162 | estado
163 | estada
164 | estados
165 | estadas
166 | estad
167 | he
168 | has
169 | ha
170 | hemos
171 | habéis
172 | han
173 | haya
174 | hayas
175 | hayamos
176 | hayáis
177 | hayan
178 | habré
179 | habrás
180 | habrá
181 | habremos
182 | habréis
183 | habrán
184 | habría
185 | habrías
186 | habríamos
187 | habríais
188 | habrían
189 | había
190 | habías
191 | habíamos
192 | habíais
193 | habían
194 | hube
195 | hubiste
196 | hubo
197 | hubimos
198 | hubisteis
199 | hubieron
200 | hubiera
201 | hubieras
202 | hubiéramos
203 | hubierais
204 | hubieran
205 | hubiese
206 | hubieses
207 | hubiésemos
208 | hubieseis
209 | hubiesen
210 | habiendo
211 | habido
212 | habida
213 | habidos
214 | habidas
215 | 
216 | # forms of ser, to be (not including the infinitive):
217 | soy
218 | eres
219 | es
220 | somos
221 | sois
222 | son
223 | sea
224 | seas
225 | seamos
226 | seáis
227 | sean
228 | seré
229 | serás
230 | será
231 | seremos
232 | seréis
233 | serán
234 | sería
235 | serías
236 | seríamos
237 | seríais
238 | serían
239 | era
240 | eras
241 | éramos
242 | erais
243 | eran
244 | fui
245 | fuiste
246 | fue
247 | fuimos
248 | fuisteis
249 | fueron
250 | fuera
251 | fueras
252 | fuéramos
253 | fuerais
254 | fueran
255 | fuese
256 | fueses
257 | fuésemos
258 | fueseis
259 | fuesen
260 | siendo
261 | sido
262 | tengo
263 | tienes
264 | tiene
265 | tenemos
266 | tenéis
267 | tienen
268 | tenga
269 | tengas
270 | tengamos
271 | tengáis
272 | tengan
273 | tendré
274 | tendrás
275 | tendrá
276 | tendremos
277 | tendréis
278 | tendrán
279 | tendría
280 | tendrías
281 | tendríamos
282 | tendríais
283 | tendrían
284 | tenía
285 | tenías
286 | teníamos
287 | teníais
288 | tenían
289 | tuve
290 | tuviste
291 | tuvo
292 | tuvimos
293 | tuvisteis
294 | tuvieron
295 | tuviera
296 | tuvieras
297 | tuviéramos
298 | tuvierais
299 | tuvieran
300 | tuviese
301 | tuvieses
302 | tuviésemos
303 | tuvieseis
304 | tuviesen
305 | teniendo
306 | tenido
307 | tenida
308 | tenidos
309 | tenidas
310 | tened
311 | 


--------------------------------------------------------------------------------
/goose/__init__.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """\
  3 | This is a python port of "Goose" originally licensed to Gravity.com
  4 | under one or more contributor license agreements.  See the NOTICE file
  5 | distributed with this work for additional information
  6 | regarding copyright ownership.
  7 | 
  8 | Python port was written by Xavier Grangier for Recrutae
  9 | 
 10 | Gravity.com licenses this file
 11 | to you under the Apache License, Version 2.0 (the "License");
 12 | you may not use this file except in compliance
 13 | with the License.  You may obtain a copy of the License at
 14 | 
 15 | http://www.apache.org/licenses/LICENSE-2.0
 16 | 
 17 | Unless required by applicable law or agreed to in writing, software
 18 | distributed under the License is distributed on an "AS IS" BASIS,
 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 20 | See the License for the specific language governing permissions and
 21 | limitations under the License.
 22 | """
 23 | import os
 24 | import platform
 25 | from tempfile import mkstemp
 26 | 
 27 | from goose.version import version_info, __version__
 28 | from goose.configuration import Configuration
 29 | from goose.crawler import CrawlCandidate
 30 | from goose.crawler import Crawler
 31 | 
 32 | 
 33 | class Goose(object):
 34 |     """\
 35 | 
 36 |     """
 37 |     def __init__(self, config=None):
 38 |         self.config = config or Configuration()
 39 |         self.extend_config()
 40 |         self.initialize()
 41 | 
 42 |     def extend_config(self):
 43 |         if isinstance(self.config, dict):
 44 |             config = Configuration()
 45 |             for k, v in self.config.items():
 46 |                 if hasattr(config, k):
 47 |                     setattr(config, k, v)
 48 |             self.config = config
 49 | 
 50 |     def extract(self, url=None, raw_html=None):
 51 |         """\
 52 |         Main method to extract an article object from a URL,
 53 |         pass in a url and get back a Article
 54 |         """
 55 |         cc = CrawlCandidate(self.config, url, raw_html)
 56 |         return self.crawl(cc)
 57 | 
 58 |     def shutdown_network(self):
 59 |         pass
 60 | 
 61 |     def crawl(self, crawl_candiate):
 62 |         parsers = list(self.config.available_parsers)
 63 |         parsers.remove(self.config.parser_class)
 64 |         try:
 65 |             crawler = Crawler(self.config)
 66 |             article = crawler.crawl(crawl_candiate)
 67 |         except (UnicodeDecodeError, ValueError):
 68 |             self.config.parser_class = parsers[0]
 69 |             return self.crawl(crawl_candiate)
 70 |         return article
 71 | 
 72 |     def initialize(self):
 73 |         # we don't need to go further if image extractor or
 74 |         # local_storage is not set
 75 |         if not self.config.local_storage_path or \
 76 |            not self.config.enable_image_fetching:
 77 |             return
 78 |         # test if config.local_storage_path
 79 |         # is a directory
 80 |         if not os.path.isdir(self.config.local_storage_path):
 81 |             os.makedirs(self.config.local_storage_path)
 82 | 
 83 |         if not os.path.isdir(self.config.local_storage_path):
 84 |             raise Exception(self.config.local_storage_path +
 85 |                 " directory does not seem to exist, "
 86 |                 "you need to set this for image processing downloads"
 87 |             )
 88 | 
 89 |         # test to write a dummy file to the directory
 90 |         # to check is directory is writtable
 91 |         level, path = mkstemp(dir=self.config.local_storage_path)
 92 |         try:
 93 |             f = os.fdopen(level, "w")
 94 |             f.close()
 95 |             os.remove(path)
 96 |         except IOError:
 97 |             raise Exception(self.config.local_storage_path +
 98 |                 " directory is not writeble, "
 99 |                 "you need to set this for image processing downloads"
100 |             )
101 | 


--------------------------------------------------------------------------------
/goose/extractors/title.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """\
  3 | This is a python port of "Goose" originally licensed to Gravity.com
  4 | under one or more contributor license agreements.  See the NOTICE file
  5 | distributed with this work for additional information
  6 | regarding copyright ownership.
  7 | 
  8 | Python port was written by Xavier Grangier for Recrutae
  9 | 
 10 | Gravity.com licenses this file
 11 | to you under the Apache License, Version 2.0 (the "License");
 12 | you may not use this file except in compliance
 13 | with the License.  You may obtain a copy of the License at
 14 | 
 15 | http://www.apache.org/licenses/LICENSE-2.0
 16 | 
 17 | Unless required by applicable law or agreed to in writing, software
 18 | distributed under the License is distributed on an "AS IS" BASIS,
 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 20 | See the License for the specific language governing permissions and
 21 | limitations under the License.
 22 | """
 23 | import re
 24 | 
 25 | from goose.extractors import BaseExtractor
 26 | 
 27 | 
 28 | TITLE_SPLITTERS = [u"|", u"-", u"»", u":"]
 29 | 
 30 | 
 31 | class TitleExtractor(BaseExtractor):
 32 | 
 33 |     def clean_title(self, title):
 34 |         """Clean title with the use of og:site_name
 35 |         in this case try to get rid of site name
 36 |         and use TITLE_SPLITTERS to reformat title
 37 |         """
 38 |         # check if we have the site name in opengraph data
 39 |         if "site_name" in self.article.opengraph.keys():
 40 |             site_name = self.article.opengraph['site_name']
 41 |             # remove the site name from title
 42 |             title = title.replace(site_name, '').strip()
 43 | 
 44 |         # try to remove the domain from url
 45 |         if self.article.domain:
 46 |             pattern = re.compile(self.article.domain, re.IGNORECASE)
 47 |             title = pattern.sub("", title).strip()
 48 | 
 49 |         # split the title in words
 50 |         # TechCrunch | my wonderfull article
 51 |         # my wonderfull article | TechCrunch
 52 |         title_words = title.split()
 53 | 
 54 |         # check for an empty title
 55 |         # so that we don't get an IndexError below
 56 |         if len(title_words) == 0:
 57 |             return u""
 58 | 
 59 |         # check if first letter is in TITLE_SPLITTERS
 60 |         # if so remove it
 61 |         if title_words[0] in TITLE_SPLITTERS:
 62 |             title_words.pop(0)
 63 | 
 64 |         # check if last letter is in TITLE_SPLITTERS
 65 |         # if so remove it
 66 |         if title_words[-1] in TITLE_SPLITTERS:
 67 |             title_words.pop(-1)
 68 | 
 69 |         # rebuild the title
 70 |         title = u" ".join(title_words).strip()
 71 | 
 72 |         return title
 73 | 
 74 |     def get_title(self):
 75 |         """\
 76 |         Fetch the article title and analyze it
 77 |         """
 78 |         title = ''
 79 | 
 80 |         # rely on opengraph in case we have the data
 81 |         if "title" in self.article.opengraph.keys():
 82 |             title = self.article.opengraph['title']
 83 |             return self.clean_title(title)
 84 | 
 85 |         # try to fetch the meta headline
 86 |         meta_headline = self.parser.getElementsByTag(
 87 |                             self.article.doc,
 88 |                             tag="meta",
 89 |                             attr="name",
 90 |                             value="headline")
 91 |         if meta_headline is not None and len(meta_headline) > 0:
 92 |             title = self.parser.getAttribute(meta_headline[0], 'content')
 93 |             return self.clean_title(title)
 94 | 
 95 |         # otherwise use the title meta
 96 |         title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
 97 |         if title_element is not None and len(title_element) > 0:
 98 |             title = self.parser.getText(title_element[0])
 99 |             return self.clean_title(title)
100 | 
101 |         return title
102 | 
103 |     def extract(self):
104 |         return self.get_title()
105 | 


--------------------------------------------------------------------------------
/tests/data/extractors/videos/test_object.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 |     <head>
 4 |         <title>test video</title>
 5 |     </head>
 6 | 
 7 |     <body>
 8 |         <div>
 9 |             <p>
10 |             TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
11 |             </p>
12 |             <p>
13 |                 <object width="640" height="390">
14 |                       <param name="movie" value="https://www.youtube.com/v/M7lc1UVf-VE?version=3&autoplay=1"></param>
15 |                       <param name="allowScriptAccess" value="always"></param>
16 |                       <embed src="https://www.youtube.com/v/M7lc1UVf-VE?version=3&autoplay=1"
17 |                             type="application/x-shockwave-flash"
18 |                             allowscriptaccess="always"
19 |                             width="640" height="390"></embed>
20 |                 </object>
21 |             </p>
22 |             <p>
23 |             Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
24 |             For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
25 |             </p>
26 |             <p>
27 |                 Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically.
28 |             </p>
29 |             <p>
30 |                 <object width="640" height="390">
31 |                       <param name="movie" value="https://www.youtube.com/v/M7lc1UVf-VE?version=3&autoplay=1"></param>
32 |                       <param name="allowScriptAccess" value="always"></param>
33 |                 </object>
34 |             </p>
35 |             <p>
36 |                 <embed src="https://www.youtube.com/v/M7lc1UVf-VE?version=3&autoplay=1"
37 |                             type="application/x-shockwave-flash"
38 |                             allowscriptaccess="always"
39 |                             width="640" height="390"></embed>
40 |             </p>
41 |             <p>
42 |             TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code.
43 |             In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day.
44 |             </p>
45 |             <p>
46 |                 Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle.
47 |             </p>
48 |         </div>
49 |     </body>
50 | </html>


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-hu.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | á
  3 | ahogy
  4 | ahol
  5 | aki
  6 | akik
  7 | akkor
  8 | alatt
  9 | által
 10 | általában
 11 | amely
 12 | amelyek
 13 | amelyekben
 14 | amelyeket
 15 | amelyet
 16 | amelynek
 17 | ami
 18 | amit
 19 | amolyan
 20 | amp
 21 | amíg
 22 | amikor
 23 | át
 24 | abban
 25 | ahhoz
 26 | annak
 27 | arra
 28 | arról
 29 | az
 30 | azok
 31 | azon
 32 | azt
 33 | azzal
 34 | azért
 35 | aztán
 36 | azután
 37 | azonban
 38 | b
 39 | bár
 40 | be
 41 | belül
 42 | benne
 43 | c
 44 | cikk
 45 | cikkek
 46 | cikkeket
 47 | csak
 48 | d
 49 | de
 50 | e
 51 | é
 52 | eddig
 53 | egész
 54 | egy
 55 | egyes
 56 | egyetlen
 57 | egyéb
 58 | egyik
 59 | egyre
 60 | ekkor
 61 | el
 62 | elég
 63 | ellen
 64 | elő
 65 | először
 66 | előtt
 67 | első
 68 | én
 69 | éppen
 70 | ebben
 71 | ehhez
 72 | emilyen
 73 | ennek
 74 | erre
 75 | ez
 76 | ezt
 77 | ezek
 78 | ezen
 79 | ezzel
 80 | ezért
 81 | és
 82 | f
 83 | fel
 84 | felé
 85 | g
 86 | h
 87 | hanem
 88 | hiszen
 89 | hogy
 90 | hogyan
 91 | i
 92 | í
 93 | igen
 94 | így
 95 | illetve
 96 | ill.
 97 | ill
 98 | ilyen
 99 | ilyenkor
100 | is
101 | ison
102 | ismét
103 | itt
104 | j
105 | jó
106 | jól
107 | jobban
108 | k
109 | kell
110 | kellett
111 | keresztül
112 | keressünk
113 | ki
114 | kívül
115 | között
116 | közül
117 | l
118 | legalább
119 | lehet
120 | lehetett
121 | legyen
122 | lenne
123 | lenni
124 | lesz
125 | lett
126 | m
127 | maga
128 | magát
129 | majd
130 | majd
131 | már
132 | más
133 | másik
134 | meg
135 | még
136 | mellett
137 | mert
138 | mely
139 | melyek
140 | mi
141 | mit
142 | míg
143 | miért
144 | milyen
145 | mikor
146 | minden
147 | mindent
148 | mindenki
149 | mindig
150 | mint
151 | mintha
152 | mivel
153 | most
154 | n
155 | nagy
156 | nagyobb
157 | nagyon
158 | ne
159 | néha
160 | nekem
161 | neki
162 | nem
163 | néhány
164 | nélkül
165 | nincs
166 | o
167 | ó
168 | olyan
169 | ott
170 | össze
171 | ö
172 | ő
173 | ők
174 | őket
175 | p
176 | pedig
177 | persze
178 | q
179 | r
180 | rá
181 | s
182 | saját
183 | sem
184 | semmi
185 | sok
186 | sokat
187 | sokkal
188 | sz
189 | számára
190 | szemben
191 | szerint
192 | szinte
193 | t
194 | talán
195 | tehát
196 | teljes
197 | tovább
198 | továbbá
199 | több
200 | u
201 | ú
202 | úgy
203 | ugyanis
204 | új
205 | újabb
206 | újra
207 | után
208 | utána
209 | utolsó
210 | ü
211 | ű
212 | v
213 | vagy
214 | vagyis
215 | valaki
216 | valamely
217 | valami
218 | valamint
219 | való
220 | vagyok
221 | van
222 | vannak
223 | volt
224 | voltam
225 | voltak
226 | voltunk
227 | vissza
228 | vele
229 | viszont
230 | volna
231 | számolnak
232 | szólnak
233 | szól
234 | w
235 | x
236 | y
237 | z
238 | zs
239 | a
240 | ahogy
241 | ahol
242 | aki
243 | akkor
244 | alatt
245 | általában
246 | által
247 | amely
248 | amíg
249 | amikor
250 | ami
251 | amolyan
252 | arra
253 | át
254 | az
255 | azért
256 | azonban
257 | azon
258 | aztán
259 | azt
260 | azután
261 | azzal
262 | bár
263 | be
264 | belül
265 | benne
266 | cikk
267 | csak
268 | de
269 | eddig
270 | egész
271 | egy
272 | egyéb
273 | egyes
274 | egyetlen
275 | egyik
276 | egyre
277 | ekkor
278 | el
279 | elég
280 | ellen
281 | elő
282 | először
283 | előtt
284 | első
285 | emilyen
286 | én
287 | éppen
288 | erre
289 | és
290 | e
291 | ez
292 | ezen
293 | ezért
294 | ezzel
295 | fel
296 | felé
297 | hanem
298 | hiszen
299 | hogy
300 | hogyan
301 | igen
302 | így
303 | ill.
304 | illetve
305 | ill
306 | ilyen
307 | ilyenkor
308 | ismét
309 | ison
310 | itt
311 | jó
312 | jobban
313 | jól
314 | kell
315 | keres
316 | keresztül
317 | ki
318 | kívül
319 | között
320 | közül
321 | legalább
322 | legyen
323 | lehet
324 | lenni
325 | lett
326 | maga
327 | maga
328 | majd
329 | már
330 | más
331 | másik
332 | még
333 | meg
334 | mellett
335 | mely
336 | mert
337 | miért
338 | míg
339 | mikor
340 | milyen
341 | minden
342 | mindenki
343 | mindig
344 | mi
345 | mint
346 | mintha
347 | mivel
348 | most
349 | nagy
350 | nagyobb
351 | nagyon
352 | ne
353 | néha
354 | néhány
355 | neki
356 | nélkül
357 | nem
358 | nincs
359 | ők
360 | olyan
361 | ő
362 | össze
363 | ott
364 | pedig
365 | persze
366 | rá
367 | saját
368 | s
369 | sem
370 | semmi
371 | sokkal
372 | sok
373 | számára
374 | számol
375 | szemben
376 | szerint
377 | szinte
378 | szól
379 | talán
380 | tehát
381 | teljes
382 | továbbá
383 | tovább
384 | úgy
385 | ugyanis
386 | új
387 | újabb
388 | újra
389 | utána
390 | után
391 | utolsó
392 | vagy
393 | vagyis
394 | valaki
395 | valamely
396 | valami
397 | valamint
398 | való
399 | van
400 | vissza
401 | viszont
402 | volt
403 | 
404 | 


--------------------------------------------------------------------------------
/goose/configuration.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """\
  3 | This is a python port of "Goose" originally licensed to Gravity.com
  4 | under one or more contributor license agreements.  See the NOTICE file
  5 | distributed with this work for additional information
  6 | regarding copyright ownership.
  7 | 
  8 | Python port was written by Xavier Grangier for Recrutae
  9 | 
 10 | Gravity.com licenses this file
 11 | to you under the Apache License, Version 2.0 (the "License");
 12 | you may not use this file except in compliance
 13 | with the License.  You may obtain a copy of the License at
 14 | 
 15 | http://www.apache.org/licenses/LICENSE-2.0
 16 | 
 17 | Unless required by applicable law or agreed to in writing, software
 18 | distributed under the License is distributed on an "AS IS" BASIS,
 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 20 | See the License for the specific language governing permissions and
 21 | limitations under the License.
 22 | """
 23 | import os
 24 | import tempfile
 25 | from goose.text import StopWords
 26 | from goose.parsers import Parser
 27 | from goose.parsers import ParserSoup
 28 | from goose.version import __version__
 29 | 
 30 | HTTP_DEFAULT_TIMEOUT = 30
 31 | 
 32 | AVAILABLE_PARSERS = {
 33 |     'lxml': Parser,
 34 |     'soup': ParserSoup,
 35 | }
 36 | 
 37 | 
 38 | class Configuration(object):
 39 | 
 40 |     def __init__(self):
 41 |         # What's the minimum bytes for an image we'd accept is,
 42 |         # alot of times we want to filter out the author's little images
 43 |         # in the beginning of the article
 44 |         self.images_min_bytes = 4500
 45 | 
 46 |         # set this guy to false if you don't care about getting images,
 47 |         # otherwise you can either use the default
 48 |         # image extractor to implement the ImageExtractor
 49 |         # interface to build your own
 50 |         self.enable_image_fetching = True
 51 | 
 52 |         # set this valriable to False if you want to force
 53 |         # the article language. OtherWise it will attempt to
 54 |         # find meta language and use the correct stopwords dictionary
 55 |         self.use_meta_language = True
 56 | 
 57 |         # default language
 58 |         # it will be use as fallback
 59 |         # if use_meta_language is set to false, targetlanguage will
 60 |         # be use
 61 |         self.target_language = 'en'
 62 | 
 63 |         # defautl stopwrods class
 64 |         self.stopwords_class = StopWords
 65 | 
 66 |         # path to your imagemagick convert executable,
 67 |         # on the mac using mac ports this is the default listed
 68 |         self.imagemagick_convert_path = "/opt/local/bin/convert"
 69 | 
 70 |         # path to your imagemagick identify executable
 71 |         self.imagemagick_identify_path = "/opt/local/bin/identify"
 72 | 
 73 |         # used as the user agent that
 74 |         # is sent with your web requests to extract an article
 75 |         # self.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2)"\
 76 |         #                         " AppleWebKit/534.52.7 (KHTML, like Gecko) "\
 77 |         #                         "Version/5.1.2 Safari/534.52.7"
 78 |         self.browser_user_agent = 'Goose/%s' % __version__
 79 | 
 80 |         # debug mode
 81 |         # enable this to have additional debugging information
 82 |         # sent to stdout
 83 |         self.debug = False
 84 | 
 85 |         # TODO
 86 |         self.extract_publishdate = None
 87 | 
 88 |         # TODO
 89 |         self.additional_data_extractor = None
 90 | 
 91 |         # Parser type
 92 |         self.available_parsers = AVAILABLE_PARSERS.keys()
 93 |         self.parser_class = 'lxml'
 94 | 
 95 |         # set the local storage path
 96 |         # make this configurable
 97 |         self.local_storage_path = os.path.join(tempfile.gettempdir(), 'goose')
 98 | 
 99 |         # http timeout
100 |         self.http_timeout = HTTP_DEFAULT_TIMEOUT
101 | 
102 |     def get_parser(self):
103 |         return AVAILABLE_PARSERS[self.parser_class]
104 | 
105 |     def get_additionaldata_extractor(self):
106 |         return self.additional_data_extractor
107 | 
108 |     def set_additionaldata_extractor(self, extractor):
109 |         """\
110 |         Pass in to extract any additional data not defined within
111 |         @param extractor a concrete instance of AdditionalDataExtractor
112 |         """
113 |         if not extractor:
114 |             raise ValueError("extractor must not be null!")
115 |         self.additional_data_extractor = extractor
116 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-it.txt:
--------------------------------------------------------------------------------
  1 | ad            
  2 | al            
  3 | allo          
  4 | ai            
  5 | agli          
  6 | all           
  7 | agl           
  8 | alla          
  9 | alle          
 10 | con           
 11 | col           
 12 | coi           
 13 | da            
 14 | dal           
 15 | dallo         
 16 | dai           
 17 | dagli         
 18 | dall          
 19 | dagl          
 20 | dalla         
 21 | dalle         
 22 | di            
 23 | del           
 24 | dello         
 25 | dei           
 26 | degli         
 27 | dell          
 28 | degl          
 29 | della         
 30 | delle         
 31 | in            
 32 | nel           
 33 | nello         
 34 | nei           
 35 | negli         
 36 | nell          
 37 | negl          
 38 | nella         
 39 | nelle         
 40 | su            
 41 | sul           
 42 | sullo         
 43 | sui           
 44 | sugli         
 45 | sull          
 46 | sugl          
 47 | sulla         
 48 | sulle         
 49 | per           
 50 | tra           
 51 | contro        
 52 | io            
 53 | tu            
 54 | lui           
 55 | lei           
 56 | noi           
 57 | voi           
 58 | loro          
 59 | mio           
 60 | mia           
 61 | miei          
 62 | mie           
 63 | tuo           
 64 | tua           
 65 | tuoi          
 66 | tue           
 67 | suo           
 68 | sua           
 69 | suoi          
 70 | sue           
 71 | nostro        
 72 | nostra        
 73 | nostri        
 74 | nostre        
 75 | vostro        
 76 | vostra        
 77 | vostri        
 78 | vostre        
 79 | mi            
 80 | ti            
 81 | ci            
 82 | vi            
 83 | lo            
 84 | la            
 85 | li            
 86 | le            
 87 | gli           
 88 | ne            
 89 | il            
 90 | un            
 91 | uno           
 92 | una           
 93 | ma            
 94 | ed            
 95 | se            
 96 | perchè        
 97 | perché
 98 | perche
 99 | anche         
100 | come          
101 | dov           
102 | dove          
103 | che           
104 | chi           
105 | cui           
106 | non           
107 | più           
108 | piu
109 | quale         
110 | quanto        
111 | quanti        
112 | quanta        
113 | quante        
114 | quello        
115 | quelli        
116 | quella        
117 | quelle        
118 | questo        
119 | questi        
120 | questa        
121 | queste        
122 | si            
123 | tutto         
124 | tutti         
125 | a             
126 | c             
127 | e             
128 | i             
129 | l             
130 | o             
131 | ho
132 | hai
133 | ha
134 | abbiamo
135 | avete
136 | hanno
137 | abbia
138 | abbiate
139 | abbiano
140 | avrò
141 | avro
142 | avrai
143 | avrà
144 | avra
145 | avremo
146 | avrete
147 | avranno
148 | avrei
149 | avresti
150 | avrebbe
151 | avremmo
152 | avreste
153 | avrebbero
154 | avevo
155 | avevi
156 | aveva
157 | avevamo
158 | avevate
159 | avevano
160 | ebbi
161 | avesti
162 | ebbe
163 | avemmo
164 | aveste
165 | ebbero
166 | avessi
167 | avesse
168 | avessimo
169 | avessero
170 | avendo
171 | avuto
172 | avuta
173 | avuti
174 | avute
175 | sono
176 | sei
177 | è
178 | é
179 | e
180 | siamo
181 | siete
182 | sia
183 | siate
184 | siano
185 | sarà
186 | sarai
187 | sarò
188 | saro
189 | saremo
190 | sarete
191 | saranno
192 | sarei
193 | saresti
194 | sarebbe
195 | saremmo
196 | sareste
197 | sarebbero
198 | ero
199 | eri
200 | era
201 | eravamo
202 | eravate
203 | erano
204 | fui
205 | fosti
206 | fu
207 | fummo
208 | foste
209 | furono
210 | fossi
211 | fosse
212 | fossimo
213 | fossero
214 | essendo
215 | faccio
216 | fai
217 | facciamo
218 | fanno
219 | faccia
220 | facciate
221 | facciano
222 | farà
223 | farai
224 | farò
225 | faremo
226 | farete
227 | faranno
228 | farei
229 | faresti
230 | farebbe
231 | faremmo
232 | fareste
233 | farebbero
234 | facevo
235 | facevi
236 | faceva
237 | facevamo
238 | facevate
239 | facevano
240 | feci
241 | facesti
242 | fece
243 | facemmo
244 | faceste
245 | fecero
246 | facessi
247 | facesse
248 | facessimo
249 | facessero
250 | facendo
251 | sto
252 | stai
253 | sta
254 | stiamo
255 | stanno
256 | stia
257 | stiate
258 | stiano
259 | starà
260 | starai
261 | starò
262 | staremo
263 | starete
264 | staranno
265 | starei
266 | staresti
267 | starebbe
268 | staremmo
269 | stareste
270 | starebbero
271 | stavo
272 | stavi
273 | stava
274 | stavamo
275 | stavate
276 | stavano
277 | stetti
278 | stesti
279 | stette
280 | stemmo
281 | steste
282 | stettero
283 | stessi
284 | stesse
285 | stessimo
286 | stessero
287 | stando
288 | 


--------------------------------------------------------------------------------
/goose/utils/__init__.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """\
  3 | This is a python port of "Goose" originally licensed to Gravity.com
  4 | under one or more contributor license agreements.  See the NOTICE file
  5 | distributed with this work for additional information
  6 | regarding copyright ownership.
  7 | 
  8 | Python port was written by Xavier Grangier for Recrutae
  9 | 
 10 | Gravity.com licenses this file
 11 | to you under the Apache License, Version 2.0 (the "License");
 12 | you may not use this file except in compliance
 13 | with the License.  You may obtain a copy of the License at
 14 | 
 15 | http://www.apache.org/licenses/LICENSE-2.0
 16 | 
 17 | Unless required by applicable law or agreed to in writing, software
 18 | distributed under the License is distributed on an "AS IS" BASIS,
 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 20 | See the License for the specific language governing permissions and
 21 | limitations under the License.
 22 | """
 23 | import time
 24 | import hashlib
 25 | import re
 26 | import os
 27 | import goose
 28 | import codecs
 29 | import urlparse
 30 | 
 31 | 
 32 | class BuildURL(object):
 33 |     def __init__(self, url, finalurl=None):
 34 |         self.url = url
 35 |         self.finalurl = finalurl
 36 | 
 37 |     def getHostname(self, o):
 38 |         if o.hostname:
 39 |             return o.hotname
 40 |         elif self.finalurl:
 41 |             oo = urlparse(self.finalurl)
 42 |             if oo.hostname:
 43 |                 return oo.hostname
 44 |         return None
 45 | 
 46 |     def getScheme(self, o):
 47 |         if o.scheme:
 48 |             return o.scheme
 49 |         elif self.finalurl:
 50 |             oo = urlparse(self.finalurl)
 51 |             if oo.scheme:
 52 |                 return oo.scheme
 53 |         return 'http'
 54 | 
 55 |     def getUrl(self):
 56 |         """\
 57 | 
 58 |         """
 59 |         url_obj = urlparse(self.url)
 60 |         scheme = self.getScheme(url_obj)
 61 |         hostname = self.getHostname(url_obj)
 62 | 
 63 | 
 64 | class FileHelper(object):
 65 | 
 66 |     @classmethod
 67 |     def loadResourceFile(self, filename):
 68 |         if not os.path.isabs('filename'):
 69 |             dirpath = os.path.dirname(goose.__file__)
 70 |             path = os.path.join(dirpath, 'resources', filename)
 71 |         else:
 72 |             path = filename
 73 |         try:
 74 |             f = codecs.open(path, 'r', 'utf-8')
 75 |             content = f.read()
 76 |             f.close()
 77 |             return content
 78 |         except IOError:
 79 |             raise IOError("Couldn't open file %s" % path)
 80 | 
 81 | 
 82 | class ParsingCandidate(object):
 83 | 
 84 |     def __init__(self, urlString, link_hash):
 85 |         self.urlString = self.url = urlString
 86 |         self.link_hash = link_hash
 87 | 
 88 | 
 89 | class RawHelper(object):
 90 |     @classmethod
 91 |     def get_parsing_candidate(self, url, raw_html):
 92 |         if isinstance(raw_html, unicode):
 93 |             raw_html = raw_html.encode('utf-8')
 94 |         link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time())
 95 |         return ParsingCandidate(url, link_hash)
 96 | 
 97 | 
 98 | class URLHelper(object):
 99 |     @classmethod
100 |     def get_parsing_candidate(self, url_to_crawl):
101 |         # replace shebang is urls
102 |         final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \
103 |                     if '#!' in url_to_crawl else url_to_crawl
104 |         link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time())
105 |         return ParsingCandidate(final_url, link_hash)
106 | 
107 | 
class StringReplacement(object):
    """\
    A single literal substitution: pattern -> replaceWith
    """

    def __init__(self, pattern, replaceWith):
        self.replaceWith = replaceWith
        self.pattern = pattern

    def replaceAll(self, string):
        """\
        Return string with every occurrence of the pattern replaced,
        or an empty unicode string when string is empty or None
        """
        return string.replace(self.pattern, self.replaceWith) if string else u''
118 | 
119 | 
class ReplaceSequence(object):
    """\
    An ordered list of replacements applied one after the other
    """

    def __init__(self):
        self.replacements = []

    def create(self, firstPattern, replaceWith=None):
        """\
        Register a replacement for firstPattern and return self
        so that calls can be chained.
        A missing or empty replaceWith means "remove the pattern".
        """
        substitute = replaceWith if replaceWith else u''
        self.replacements.append(StringReplacement(firstPattern, substitute))
        return self

    def append(self, pattern, replaceWith=None):
        """\
        Alias of create
        """
        return self.create(pattern, replaceWith)

    def replaceAll(self, string):
        """\
        Run string through every registered replacement, in
        registration order, and return the result.
        An empty or None string yields an empty unicode string.
        """
        if not string:
            return u''

        current = string
        for step in self.replacements:
            current = step.replaceAll(current)
        return current
143 | 


--------------------------------------------------------------------------------
/goose/utils/images.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """\
  3 | This is a python port of "Goose" originally licensed to Gravity.com
  4 | under one or more contributor license agreements.  See the NOTICE file
  5 | distributed with this work for additional information
  6 | regarding copyright ownership.
  7 | 
  8 | Python port was written by Xavier Grangier for Recrutae
  9 | 
 10 | Gravity.com licenses this file
 11 | to you under the Apache License, Version 2.0 (the "License");
 12 | you may not use this file except in compliance
 13 | with the License.  You may obtain a copy of the License at
 14 | 
 15 | http://www.apache.org/licenses/LICENSE-2.0
 16 | 
 17 | Unless required by applicable law or agreed to in writing, software
 18 | distributed under the License is distributed on an "AS IS" BASIS,
 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 20 | See the License for the specific language governing permissions and
 21 | limitations under the License.
 22 | """
 23 | import hashlib
 24 | import os
 25 | import urllib2
 26 | from PIL import Image
 27 | from goose.utils.encoding import smart_str
 28 | from goose.image import ImageDetails
 29 | from goose.image import LocallyStoredImage
 30 | 
 31 | 
 32 | class ImageUtils(object):
 33 | 
 34 |     @classmethod
 35 |     def get_image_dimensions(self, identify_program, path):
 36 |         image_details = ImageDetails()
 37 |         try:
 38 |             image = Image.open(path)
 39 |             image_details.set_mime_type(image.format)
 40 |             width, height = image.size
 41 |             image_details.set_width(width)
 42 |             image_details.set_height(height)
 43 |         except IOError:
 44 |             image_details.set_mime_type('NA')
 45 |         return image_details
 46 | 
 47 |     @classmethod
 48 |     def store_image(self, http_client, link_hash, src, config):
 49 |         """\
 50 |         Writes an image src http string to disk as a temporary file
 51 |         and returns the LocallyStoredImage object
 52 |         that has the info you should need on the image
 53 |         """
 54 |         # check for a cache hit already on disk
 55 |         image = self.read_localfile(link_hash, src, config)
 56 |         if image:
 57 |             return image
 58 | 
 59 |         # no cache found download the image
 60 |         data = self.fetch(http_client, src)
 61 |         if data:
 62 |             image = self.write_localfile(data, link_hash, src, config)
 63 |             if image:
 64 |                 return image
 65 | 
 66 |         return None
 67 | 
 68 |     @classmethod
 69 |     def get_mime_type(self, image_details):
 70 |         mime_type = image_details.get_mime_type().lower()
 71 |         mimes = {
 72 |             'png': '.png',
 73 |             'jpg': '.jpg',
 74 |             'jpeg': '.jpg',
 75 |             'gif': '.gif',
 76 |         }
 77 |         return mimes.get(mime_type, 'NA')
 78 | 
 79 |     @classmethod
 80 |     def read_localfile(self, link_hash, src, config):
 81 |         local_image_name = self.get_localfile_name(link_hash, src, config)
 82 |         if os.path.isfile(local_image_name):
 83 |             identify = config.imagemagick_identify_path
 84 |             image_details = self.get_image_dimensions(identify, local_image_name)
 85 |             file_extension = self.get_mime_type(image_details)
 86 |             bytes = os.path.getsize(local_image_name)
 87 |             return LocallyStoredImage(
 88 |                 src=src,
 89 |                 local_filename=local_image_name,
 90 |                 link_hash=link_hash,
 91 |                 bytes=bytes,
 92 |                 file_extension=file_extension,
 93 |                 height=image_details.get_height(),
 94 |                 width=image_details.get_width()
 95 |             )
 96 |         return None
 97 | 
 98 |     @classmethod
 99 |     def write_localfile(self, entity, link_hash, src, config):
100 |         local_path = self.get_localfile_name(link_hash, src, config)
101 |         f = open(local_path, 'wb')
102 |         f.write(entity)
103 |         f.close()
104 |         return self.read_localfile(link_hash, src, config)
105 | 
106 |     @classmethod
107 |     def get_localfile_name(self, link_hash, src, config):
108 |         image_hash = hashlib.md5(smart_str(src)).hexdigest()
109 |         return os.path.join(config.local_storage_path, '%s_%s' % (link_hash, image_hash))
110 | 
111 |     @classmethod
112 |     def clean_src_string(self, src):
113 |         return src.replace(" ", "%20")
114 | 
115 |     @classmethod
116 |     def fetch(self, http_client, src):
117 |         try:
118 |             req = urllib2.Request(src)
119 |             f = urllib2.urlopen(req)
120 |             data = f.read()
121 |             return data
122 |         except Exception:
123 |             return None
124 | 


--------------------------------------------------------------------------------
/goose/extractors/metas.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """\
  3 | This is a python port of "Goose" originally licensed to Gravity.com
  4 | under one or more contributor license agreements.  See the NOTICE file
  5 | distributed with this work for additional information
  6 | regarding copyright ownership.
  7 | 
  8 | Python port was written by Xavier Grangier for Recrutae
  9 | 
 10 | Gravity.com licenses this file
 11 | to you under the Apache License, Version 2.0 (the "License");
 12 | you may not use this file except in compliance
 13 | with the License.  You may obtain a copy of the License at
 14 | 
 15 | http://www.apache.org/licenses/LICENSE-2.0
 16 | 
 17 | Unless required by applicable law or agreed to in writing, software
 18 | distributed under the License is distributed on an "AS IS" BASIS,
 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 20 | See the License for the specific language governing permissions and
 21 | limitations under the License.
 22 | """
 23 | 
 24 | import re
 25 | from urlparse import urljoin
 26 | from urlparse import urlparse
 27 | 
 28 | from goose.extractors import BaseExtractor
 29 | 
 30 | 
 31 | RE_LANG = r'^[A-Za-z]{2}$'
 32 | 
 33 | 
 34 | class MetasExtractor(BaseExtractor):
 35 | 
 36 |     def get_domain(self):
 37 |         if self.article.final_url:
 38 |             o = urlparse(self.article.final_url)
 39 |             return o.hostname
 40 |         return None
 41 | 
 42 |     def get_favicon(self):
 43 |         """\
 44 |         Extract the favicon from a website
 45 |         http://en.wikipedia.org/wiki/Favicon
 46 |         <link rel="shortcut icon" type="image/png" href="favicon.png" />
 47 |         <link rel="icon" type="image/png" href="favicon.png" />
 48 |         """
 49 |         kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'}
 50 |         meta = self.parser.getElementsByTag(self.article.doc, **kwargs)
 51 |         if meta:
 52 |             favicon = self.parser.getAttribute(meta[0], 'href')
 53 |             return favicon
 54 |         return ''
 55 | 
 56 |     def get_canonical_link(self):
 57 |         """\
 58 |         if the article has meta canonical link set in the url
 59 |         """
 60 |         if self.article.final_url:
 61 |             kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'canonical'}
 62 |             meta = self.parser.getElementsByTag(self.article.doc, **kwargs)
 63 |             if meta is not None and len(meta) > 0:
 64 |                 href = self.parser.getAttribute(meta[0], 'href')
 65 |                 if href:
 66 |                     href = href.strip()
 67 |                     o = urlparse(href)
 68 |                     if not o.hostname:
 69 |                         z = urlparse(self.article.final_url)
 70 |                         domain = '%s://%s' % (z.scheme, z.hostname)
 71 |                         href = urljoin(domain, href)
 72 |                     return href
 73 |         return self.article.final_url
 74 | 
 75 |     def get_meta_lang(self):
 76 |         """\
 77 |         Extract content language from meta
 78 |         """
 79 |         # we have a lang attribute in html
 80 |         attr = self.parser.getAttribute(self.article.doc, attr='lang')
 81 |         if attr is None:
 82 |             # look up for a Content-Language in meta
 83 |             items = [
 84 |                 {'tag': 'meta', 'attr': 'http-equiv', 'value': 'content-language'},
 85 |                 {'tag': 'meta', 'attr': 'name', 'value': 'lang'}
 86 |             ]
 87 |             for item in items:
 88 |                 meta = self.parser.getElementsByTag(self.article.doc, **item)
 89 |                 if meta:
 90 |                     attr = self.parser.getAttribute(meta[0], attr='content')
 91 |                     break
 92 | 
 93 |         if attr:
 94 |             value = attr[:2]
 95 |             if re.search(RE_LANG, value):
 96 |                 return value.lower()
 97 | 
 98 |         return None
 99 | 
100 |     def get_meta_content(self, metaName):
101 |         """\
102 |         Extract a given meta content form document
103 |         """
104 |         meta = self.parser.css_select(self.article.doc, metaName)
105 |         content = None
106 | 
107 |         if meta is not None and len(meta) > 0:
108 |             content = self.parser.getAttribute(meta[0], 'content')
109 | 
110 |         if content:
111 |             return content.strip()
112 | 
113 |         return ''
114 | 
115 |     def get_meta_description(self):
116 |         """\
117 |         if the article has meta description set in the source, use that
118 |         """
119 |         return self.get_meta_content("meta[name=description]")
120 | 
121 |     def get_meta_keywords(self):
122 |         """\
123 |         if the article has meta keywords set in the source, use that
124 |         """
125 |         return self.get_meta_content("meta[name=keywords]")
126 | 
127 |     def extract(self):
128 |         return {
129 |             "description": self.get_meta_description(),
130 |             "keywords": self.get_meta_keywords(),
131 |             "lang": self.get_meta_lang(),
132 |             "favicon": self.get_favicon(),
133 |             "canonical": self.get_canonical_link(),
134 |             "domain": self.get_domain()
135 |         }
136 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-ru.txt:
--------------------------------------------------------------------------------
  1 | а
  2 | е
  3 | и
  4 | ж
  5 | м
  6 | о
  7 | на
  8 | не
  9 | ни
 10 | об
 11 | но
 12 | он
 13 | мне
 14 | мои
 15 | мож
 16 | она
 17 | они
 18 | оно
 19 | мной
 20 | много
 21 | многочисленное
 22 | многочисленная
 23 | многочисленные
 24 | многочисленный
 25 | мною
 26 | мой
 27 | мог
 28 | могут
 29 | можно
 30 | может
 31 | можхо
 32 | мор
 33 | моя
 34 | моё
 35 | мочь
 36 | над
 37 | нее
 38 | оба
 39 | нам
 40 | нем
 41 | нами
 42 | ними
 43 | мимо
 44 | немного
 45 | одной
 46 | одного
 47 | менее
 48 | однажды
 49 | однако
 50 | меня
 51 | нему
 52 | меньше
 53 | ней
 54 | наверху
 55 | него
 56 | ниже
 57 | мало
 58 | надо
 59 | один
 60 | одиннадцать
 61 | одиннадцатый
 62 | назад
 63 | наиболее
 64 | недавно
 65 | миллионов
 66 | недалеко
 67 | между
 68 | низко
 69 | меля
 70 | нельзя
 71 | нибудь
 72 | непрерывно
 73 | наконец
 74 | никогда
 75 | никуда
 76 | нас
 77 | наш
 78 | нет
 79 | нею
 80 | неё
 81 | них
 82 | мира
 83 | наша
 84 | наше
 85 | наши
 86 | ничего
 87 | начала
 88 | нередко
 89 | несколько
 90 | обычно
 91 | опять
 92 | около
 93 | мы
 94 | ну
 95 | нх
 96 | от
 97 | отовсюду
 98 | особенно
 99 | нужно
100 | очень
101 | отсюда
102 | в
103 | во
104 | вон
105 | вниз
106 | внизу
107 | вокруг
108 | вот
109 | восемнадцать
110 | восемнадцатый
111 | восемь
112 | восьмой
113 | вверх
114 | вам
115 | вами
116 | важное
117 | важная
118 | важные
119 | важный
120 | вдали
121 | везде
122 | ведь
123 | вас
124 | ваш
125 | ваша
126 | ваше
127 | ваши
128 | впрочем
129 | весь
130 | вдруг
131 | вы
132 | все
133 | второй
134 | всем
135 | всеми
136 | времени
137 | время
138 | всему
139 | всего
140 | всегда
141 | всех
142 | всею
143 | всю
144 | вся
145 | всё
146 | всюду
147 | г
148 | год
149 | говорил
150 | говорит
151 | года
152 | году
153 | где
154 | да
155 | ее
156 | за
157 | из
158 | ли
159 | же
160 | им
161 | до
162 | по
163 | ими
164 | под
165 | иногда
166 | довольно
167 | именно
168 | долго
169 | позже
170 | более
171 | должно
172 | пожалуйста
173 | значит
174 | иметь
175 | больше
176 | пока
177 | ему
178 | имя
179 | пор
180 | пора
181 | потом
182 | потому
183 | после
184 | почему
185 | почти
186 | посреди
187 | ей
188 | два
189 | две
190 | двенадцать
191 | двенадцатый
192 | двадцать
193 | двадцатый
194 | двух
195 | его
196 | дел
197 | или
198 | без
199 | день
200 | занят
201 | занята
202 | занято
203 | заняты
204 | действительно
205 | давно
206 | девятнадцать
207 | девятнадцатый
208 | девять
209 | девятый
210 | даже
211 | алло
212 | жизнь
213 | далеко
214 | близко
215 | здесь
216 | дальше
217 | для
218 | лет
219 | зато
220 | даром
221 | первый
222 | перед
223 | затем
224 | зачем
225 | лишь
226 | десять
227 | десятый
228 | ею
229 | её
230 | их
231 | бы
232 | еще
233 | при
234 | был
235 | про
236 | процентов
237 | против
238 | просто
239 | бывает
240 | бывь
241 | если
242 | люди
243 | была
244 | были
245 | было
246 | будем
247 | будет
248 | будете
249 | будешь
250 | прекрасно
251 | буду
252 | будь
253 | будто
254 | будут
255 | ещё
256 | пятнадцать
257 | пятнадцатый
258 | друго
259 | другое
260 | другой
261 | другие
262 | другая
263 | других
264 | есть
265 | пять
266 | быть
267 | лучше
268 | пятый
269 | к
270 | ком
271 | конечно
272 | кому
273 | кого
274 | когда
275 | которой
276 | которого
277 | которая
278 | которые
279 | который
280 | которых
281 | кем
282 | каждое
283 | каждая
284 | каждые
285 | каждый
286 | кажется
287 | как
288 | какой
289 | какая
290 | кто
291 | кроме
292 | куда
293 | кругом
294 | с
295 | т
296 | у
297 | я
298 | та
299 | те
300 | уж
301 | со
302 | то
303 | том
304 | снова
305 | тому
306 | совсем
307 | того
308 | тогда
309 | тоже
310 | собой
311 | тобой
312 | собою
313 | тобою
314 | сначала
315 | только
316 | уметь
317 | тот
318 | тою
319 | хорошо
320 | хотеть
321 | хочешь
322 | хоть
323 | хотя
324 | свое
325 | свои
326 | твой
327 | своей
328 | своего
329 | своих
330 | свою
331 | твоя
332 | твоё
333 | раз
334 | уже
335 | сам
336 | там
337 | тем
338 | чем
339 | сама
340 | сами
341 | теми
342 | само
343 | рано
344 | самом
345 | самому
346 | самой
347 | самого
348 | семнадцать
349 | семнадцатый
350 | самим
351 | самими
352 | самих
353 | саму
354 | семь
355 | чему
356 | раньше
357 | сейчас
358 | чего
359 | сегодня
360 | себе
361 | тебе
362 | сеаой
363 | человек
364 | разве
365 | теперь
366 | себя
367 | тебя
368 | седьмой
369 | спасибо
370 | слишком
371 | так
372 | такое
373 | такой
374 | такие
375 | также
376 | такая
377 | сих
378 | тех
379 | чаще
380 | четвертый
381 | через
382 | часто
383 | шестой
384 | шестнадцать
385 | шестнадцатый
386 | шесть
387 | четыре
388 | четырнадцать
389 | четырнадцатый
390 | сколько
391 | сказал
392 | сказала
393 | сказать
394 | ту
395 | ты
396 | три
397 | эта
398 | эти
399 | что
400 | это
401 | чтоб
402 | этом
403 | этому
404 | этой
405 | этого
406 | чтобы
407 | этот
408 | стал
409 | туда
410 | этим
411 | этими
412 | рядом
413 | тринадцать
414 | тринадцатый
415 | этих
416 | третий
417 | тут
418 | эту
419 | суть
420 | чуть
421 | тысяч
422 | 


--------------------------------------------------------------------------------
/goose/extractors/videos.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """\
  3 | This is a python port of "Goose" originally licensed to Gravity.com
  4 | under one or more contributor license agreements.  See the NOTICE file
  5 | distributed with this work for additional information
  6 | regarding copyright ownership.
  7 | 
  8 | Python port was written by Xavier Grangier for Recrutae
  9 | 
 10 | Gravity.com licenses this file
 11 | to you under the Apache License, Version 2.0 (the "License");
 12 | you may not use this file except in compliance
 13 | with the License.  You may obtain a copy of the License at
 14 | 
 15 | http://www.apache.org/licenses/LICENSE-2.0
 16 | 
 17 | Unless required by applicable law or agreed to in writing, software
 18 | distributed under the License is distributed on an "AS IS" BASIS,
 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 20 | See the License for the specific language governing permissions and
 21 | limitations under the License.
 22 | """
 23 | 
 24 | from goose.extractors import BaseExtractor
 25 | from goose.video import Video
 26 | 
 27 | VIDEOS_TAGS = ['iframe', 'embed', 'object', 'video']
 28 | VIDEO_PROVIDERS = ['youtube', 'vimeo', 'dailymotion', 'kewego']
 29 | 
 30 | 
 31 | class VideoExtractor(BaseExtractor):
 32 |     """\
 33 |     Extracts a list of video from Article top node
 34 |     """
 35 |     def __init__(self, config, article):
 36 |         super(VideoExtractor, self).__init__(config, article)
 37 | 
 38 |         # candidates
 39 |         self.candidates = []
 40 | 
 41 |         # movies
 42 |         self.movies = []
 43 | 
 44 |     def get_embed_code(self, node):
 45 |         return "".join([line.strip() for line in self.parser.nodeToString(node).splitlines()])
 46 | 
 47 |     def get_embed_type(self, node):
 48 |         return self.parser.getTag(node)
 49 | 
 50 |     def get_width(self, node):
 51 |         return self.parser.getAttribute(node, 'width')
 52 | 
 53 |     def get_height(self, node):
 54 |         return self.parser.getAttribute(node, 'height')
 55 | 
 56 |     def get_src(self, node):
 57 |         return self.parser.getAttribute(node, 'src')
 58 | 
 59 |     def get_provider(self, src):
 60 |         if src:
 61 |             for provider in VIDEO_PROVIDERS:
 62 |                 if provider in src:
 63 |                     return provider
 64 |         return None
 65 | 
 66 |     def get_video(self, node):
 67 |         """
 68 |         Create a video object from a video embed
 69 |         """
 70 |         video = Video()
 71 |         video.embed_code = self.get_embed_code(node)
 72 |         video.embed_type = self.get_embed_type(node)
 73 |         video.width = self.get_width(node)
 74 |         video.height = self.get_height(node)
 75 |         video.src = self.get_src(node)
 76 |         video.provider = self.get_provider(video.src)
 77 |         return video
 78 | 
 79 |     def get_iframe_tag(self, node):
 80 |         return self.get_video(node)
 81 | 
 82 |     def get_video_tag(self, node):
 83 |         """extract html video tags"""
 84 |         return Video()
 85 | 
 86 |     def get_embed_tag(self, node):
 87 |         # embed node may have an object node as parent
 88 |         # in this case we want to retrieve the object node
 89 |         # instead of the embed
 90 |         parent = self.parser.getParent(node)
 91 |         if parent is not None:
 92 |             parent_tag = self.parser.getTag(parent)
 93 |             if parent_tag == 'object':
 94 |                 return self.get_object_tag(node)
 95 |         return self.get_video(node)
 96 | 
 97 |     def get_object_tag(self, node):
 98 |         # test if object tag has en embed child
 99 |         # in this case we want to remove the embed from
100 |         # the candidate list to avoid parsing it twice
101 |         child_embed_tag = self.parser.getElementsByTag(node, 'embed')
102 |         if child_embed_tag and child_embed_tag[0] in self.candidates:
103 |             self.candidates.remove(child_embed_tag[0])
104 | 
105 |         # get the object source
106 |         # if wa don't have a src node don't coninue
107 |         src_node = self.parser.getElementsByTag(node, tag="param", attr="name", value="movie")
108 |         if not src_node:
109 |             return None
110 | 
111 |         src = self.parser.getAttribute(src_node[0], "value")
112 | 
113 |         # check provider
114 |         provider = self.get_provider(src)
115 |         if not provider:
116 |             return None
117 | 
118 |         video = self.get_video(node)
119 |         video.provider = provider
120 |         video.src = src
121 |         return video
122 | 
123 |     def get_videos(self):
124 |         # candidates node
125 |         self.candidates = self.parser.getElementsByTags(self.article.top_node, VIDEOS_TAGS)
126 | 
127 |         # loop all candidates
128 |         # and check if src attribute belongs to a video provider
129 |         for candidate in self.candidates:
130 |             tag = self.parser.getTag(candidate)
131 |             attr = "get_%s_tag" % tag
132 |             if hasattr(self, attr):
133 |                 movie = getattr(self, attr)(candidate)
134 |                 if movie is not None and movie.provider is not None:
135 |                     self.movies.append(movie)
136 | 
137 |         # append movies list to article
138 |         self.article.movies = list(self.movies)
139 | 


--------------------------------------------------------------------------------