├── .formatter.exs
├── .gitignore
├── LICENSE.txt
├── README.md
├── cache
│   ├── article
│   │   ├── kreuzfahrt-prozente.html
│   │   ├── nytimes.html
│   │   └── spiegel.html
│   ├── domain
│   │   ├── spiegel.html
│   │   ├── venturebeat.html
│   │   └── zeit.html
│   └── feed
│       ├── elixir-lang.xml
│       ├── heise.xml
│       ├── latimes.xml
│       └── spiegel.xml
├── config
│   └── config.exs
├── lib
│   ├── scrape.ex
│   └── scrape
│       ├── application.ex
│       ├── flow.ex
│       ├── flow
│       │   ├── article.ex
│       │   ├── domain.ex
│       │   ├── feed.ex
│       │   └── feed_item.ex
│       ├── ir
│       │   ├── feed.ex
│       │   ├── feed_item.ex
│       │   ├── html.ex
│       │   ├── text.ex
│       │   └── text
│       │       ├── rake.ex
│       │       └── tfidf.ex
│       ├── options.ex
│       ├── source
│       │   ├── disk.ex
│       │   ├── http.ex
│       │   └── http
│       │       ├── charset.ex
│       │       ├── get.ex
│       │       └── transcode.ex
│       └── tools
│           ├── dom.ex
│           ├── tree.ex
│           ├── url.ex
│           ├── word.ex
│           └── word
│               ├── is_stopword.ex
│               └── stopwords
│                   ├── de.txt
│                   └── en.txt
├── mix.exs
├── mix.lock
└── test
    ├── flow
    │   ├── article_test.exs
    │   ├── domain_test.exs
    │   └── feed_test.exs
    ├── ir
    │   ├── feed_item_test.exs
    │   ├── feed_test.exs
    │   ├── html_test.exs
    │   └── text_test.exs
    ├── scrape_test.exs
    ├── test_helper.exs
    └── tools
        ├── dom_test.exs
        ├── tree_test.exs
        ├── url_test.exs
        └── word_test.exs

/.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # mac os x stuff 2 | .DS_Store 3 | 4 | # The directory Mix will write compiled artifacts to. 5 | /_build/ 6 | 7 | # If you run "mix test --cover", coverage assets end up here. 8 | /cover/ 9 | 10 | # The directory Mix downloads your dependencies sources to. 11 | /deps/ 12 | 13 | # Where third-party dependencies like ExDoc output generated docs. 14 | /doc/ 15 | 16 | # Ignore .fetch files in case you like to edit your project deps locally. 17 | /.fetch 18 | 19 | # If the VM crashes, it generates a dump, let's ignore it too. 20 | erl_crash.dump 21 | 22 | # Also ignore archive artifacts (built via "mix archive.build"). 23 | *.ez 24 | 25 | # Ignore package tarball (built via "mix hex.build"). 26 | scrape-*.tar 27 | 28 | # Ignore VSCode artifacts 29 | .elixir_ls 30 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 
98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrape 2 | 3 | [![Hex.pm](https://img.shields.io/hexpm/dt/scrape.svg)](https://hex.pm/packages/scrape) 4 | [![Hex.pm](https://img.shields.io/hexpm/v/scrape.svg)](https://hex.pm/packages/scrape) 5 | [![Hex.pm](https://img.shields.io/hexpm/l/scrape.svg)](https://hex.pm/packages/scrape) 6 | 7 | Structured data extraction from common web resources, using information-retrieval techniques. See the [docs](https://hexdocs.pm/scrape/Scrape.html). 8 | 9 | ## Installation 10 | 11 | The package can be installed by adding `scrape` to your list of dependencies in `mix.exs`: 12 | 13 | ```elixir 14 | def deps do 15 | [ 16 | {:scrape, "~> 3.0.0"} 17 | ] 18 | end 19 | ``` 20 | 21 | ## Known Issues 22 | 23 | * This package uses an outdated version of `httpoison` because of `keepcosmos/readability`. You can override this in your own app with `override: true`, and everything should work. 24 | * Version 3.x is a complete rewrite from scratch, so new issues may occur and the API has changed. Please provide a URL to an HTML/feed document when submitting issues, so I can reproduce and fix the problem. 25 | 26 | ## Usage 27 | 28 | * `Scrape.domain!(url)` -> get structured data of a domain-type URL (like https://bbc.com) 29 | * `Scrape.feed!(url)` -> get structured data of an RSS/Atom feed 30 | * `Scrape.article!(url)` -> get structured data of an article-type URL 31 | 32 | ## License 33 | 34 | LGPLv3. You can use this package any way you want (including commercially), but I want bugfixes and improvements to flow back into this package for everyone's benefit. 35 | -------------------------------------------------------------------------------- /cache/feed/latimes.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | latimes.com - Los Angeles Times 6 | http://www.latimes.com/rss2.0.xml 7 | The LA Times is a leading source of breaking news, entertainment, sports, politics, and more for Southern California and the world. 8 | en-US 9 | ©2016 Los Angeles Times 10 | Sun, 3 Jul 2016 11:14:41 -0700 11 | 12 | Essential tracks: Instrumental guitar music from Marisa Anderson, William Tyler and Harry Taussig 13 | Randall Roberts 14 | http://www.latimes.com/la-et-ms-essential-tracks-marisa-anderson-tyler-20160624-snap-story.html 15 | Summer playlists usually teem with high-energy party music, but nights spent camping under the stars at Joshua Tree or amid the Big Sur redwoods call for sounds more contemplative, organic, earthen. This month has seen the arrival of a few remarkable guitar-based instrumental albums that fit the...

17 | ]]>
18 | Sun, 3 Jul 2016 11:07:00 PDT 19 | 20 |
21 | 22 | Pine Fire continues to burn in wilderness area north of Ojai 23 | Alice Walton 24 | http://www.latimes.com/la-me-pine-fire-update-20160703-snap-story.html 25 | Hundreds of firefighters continued to battle a wildfire in the Sespe Wilderness north of Ojai on Sunday with no containment in sight.

The Pine fire started Thursday morning about 11 miles north of Ojai. As of Sunday morning, the fire had consumed 1,590 acres and was threatening 50 structures, said...

27 | ]]>
28 | Sun, 3 Jul 2016 11:05:00 PDT 29 |
30 | 31 | Ex-campaign manager dismisses complaints about Donald Trump's six-point-star tweet 32 | David Willman 33 | http://www.latimes.com/la-na-trailguide-updates-former-campaign-manager-dismisses-1467557105-htmlstory.html 34 | 37 | Sun, 3 Jul 2016 10:22:00 PDT 38 | 39 | 40 | 41 | GOP Sen. Cotton says Trump can 'make the case for himself' 42 | Christi Parsons 43 | http://www.latimes.com/la-na-trailguide-updates-gop-sen-cotton-says-trump-can-make-1467565002-htmlstory.html 44 | 47 | Sun, 3 Jul 2016 10:13:00 PDT 48 | 49 | 50 | 51 | Driver who escaped police pursuit in San Diego is finally arrested - two months later 52 | David Hernandez 53 | http://www.latimes.com/la-me-pursuit-arrest-20160703-snap-story.html 54 | Police say a man accused of leading officers on a chase through several freeways before ditching the car in downtown San Diego was arrested last week — two months after the pursuit.

Officials say Ahran Haugley, 41, drove off on April 28 when officers approached the Honda Accord in which he was...

56 | ]]>
57 | Sun, 3 Jul 2016 10:00:00 PDT 58 |
59 | 60 | FBI questions Hillary Clinton about her private email server 61 | http://www.latimes.com/la-na-trailguide-updates-07032016-htmlstory.html 62 | 65 | Sun, 3 Jul 2016 09:56:52 PDT 66 | 67 | 68 | Serena Williams cruises to third-round win at Wimbledon 69 | Associated Press 70 | http://www.latimes.com/la-sp-wimbledon-20160703-snap-story.html 71 | Serena Williams earned a decent day's rest on the middle Sunday at Wimbledon while Jo-Wilfried Tsonga had to work overtime — 19-17 in the fifth set — in another marathon involving John Isner.

Williams, the defending women's champion and six-time winner, overwhelmed Annika Beck, 6-3, 6-0, in just...

73 | ]]>
74 | Sun, 3 Jul 2016 09:50:00 PDT 75 | 76 |
77 | 78 | Lewis Hamilton wins Austrian Grand Prix after final-lap pass of Nico Rosberg 79 | Associated Press 80 | http://www.latimes.com/la-sp-formula-one-austrian-grand-prix-20160703-snap-story.html 81 | Lewis Hamilton won the Austrian Grand Prix on Sunday after colliding with Nico Rosberg on the final lap, an incident he blamed on his German teammate.

The two Mercedes drivers touched as Hamilton sought to overtake and Formula One championship leader Rosberg ended up losing his front wing, which...

83 | ]]>
84 | Sun, 3 Jul 2016 09:40:00 PDT 85 | 86 |
87 | 88 | Peter Sagan moves into Tour de France lead with Stage 2 win 89 | Associated Press 90 | http://www.latimes.com/la-sp-tour-de-france-20160703-snap-story.html 91 | World champion Peter Sagan made the most of a steep, short climb in a frenzied finale to win the second stage of the Tour de France and claim the race leader's yellow jersey on Sunday.

Sagan, who pulled on the coveted shirt for the first time, used his power on the 1.9-kilometer Cote de la Glacerie...

93 | ]]>
94 | Sun, 3 Jul 2016 09:30:00 PDT 95 | 96 |
97 | 98 | Man injured after explosion reported in Central Park 99 | Associated Press 100 | http://www.latimes.com/la-na-central-park-explosion-reported-20160703-snap-story.html 101 | Authorities say a man was seriously hurt in New York City’s Central Park after people near the area reported hearing some kind of explosion.

Fire officials say it happened shortly before 11 a.m., inside the park at 68th Street and Fifth Avenue. The man suffered serious injuries, possibly requiring...

103 | ]]>
104 | Sun, 3 Jul 2016 09:29:00 PDT 105 | 106 |
107 | 108 | At least 120 people - including 15 children - killed in dual Baghdad bombings 109 | Associated Press 110 | http://www.latimes.com/la-fg-ap-baghdad-bombing-20160702-snap-story.html 111 | A suicide truck bomb in downtown Baghdad killed 115 people and wounded nearly 200 others who were out shopping and celebrating early Sunday ahead of the holiday marking the end of Ramadan, security and medical officials said.

The attack, claimed by Islamic State, was the deadliest in months in...

113 | ]]>
114 | Sun, 3 Jul 2016 09:16:00 PDT 115 | 116 |
117 | 118 | Possible Clinton running mates audition with attacks on Trump and defenses of their views on trade 119 | Christi Parsons 120 | http://www.latimes.com/la-na-trailguide-updates-possible-clinton-running-mates-audition-1467553920-htmlstory.html 121 | 124 | Sun, 3 Jul 2016 09:06:00 PDT 125 | 126 | 127 | 128 | Scott Kazmir gets a first-inning adjustment 129 | Jesse Dougherty 130 | http://www.latimes.com/la-sp-scott-kazmir-first-inning-20160702-snap-story.html 131 | Scott Kazmir’s up-and-down season has carried one glaring theme: The first inning is his toughest obstacle.

Heading into his start against the Rockies on Saturday, Kazmir had a 9.00 ERA in the first inning of his 16 starts. Opponents were hitting a healthy .342 against him in the first. More than...

133 | ]]>
134 | Sun, 3 Jul 2016 09:00:00 PDT 135 | 136 |
137 | 138 | Former campaign bus rolls into Los Angeles as anti-Trump protest art 139 | Javier Panzar 140 | http://www.latimes.com/politics/la-pol-ca-donald-trump-bus-iowa-california-20160702-snap-story.html 141 | 144 | Sun, 3 Jul 2016 08:43:00 PDT 145 | 146 | 147 | 148 | Woman saves pet cockatiel, but home burned in fast-moving brush fire 149 | Howard Blume and Shelby Grad 150 | http://www.latimes.com/la-me-sb-fire-update-20160703-snap-story.html 151 | The fire seemed to come out of nowhere and spread rapidly Saturday in a San Bernardino neighborhood.

Resident Martha Hall told the San Bernardino Sun that she saw the flames rushing up the hill toward her home. She ran into her house, grabbed her pet cockatiel and fled. As she was leaving, she...

153 | ]]>
154 | Sun, 3 Jul 2016 08:33:00 PDT 155 | 156 |
157 | 158 | In Colorado, conservatives grapple with the Trump conundrum 159 | Melanie Mason 160 | http://www.latimes.com/la-na-pol-trump-colorado-conservatives-20160703-snap-story.html 161 | To understand the dilemma Colorado Republicans wrestled with at a conservative gathering this weekend, one only had to look at the range of speakers, whose positions on Donald Trump ran the gamut from enthusiastic support to vehement opposition.

Trump himself came to Colorado for the Western Conservative...

163 | ]]>
164 | Sun, 3 Jul 2016 08:15:00 PDT 165 | 166 |
167 | 168 | Porter Ranch's future after massive gas leak is in the eye of the beholder 169 | Alice Walton 170 | http://www.latimes.com/la-me-porter-ranch-20160629-snap-story.html 171 | In the hills above the 118 freeway, mansions are being built. Restaurants and grocery stores are packed. Cyclists pedal up and down wide-open streets.

On the surface, the community of Porter Ranch is returning to normal four months after the largest methane leak in American history was capped in...

173 | ]]>
174 | Sun, 3 Jul 2016 08:00:00 PDT 175 | 176 |
177 | 178 | Web Buzz: With the Lola app, personal travel advice and service are a quick text away 179 | Jen Leo 180 | http://www.latimes.com/la-tr-webbuzz-20160624-snap-story.html 181 | Need immediate advice about a flight or hotel? Here’s an instant messaging app that connects you to a helpful online concierge.

Name: Lola app

What it does: Connects travelers with travel agents who can find the best options for you based on your preferences, including favorite airlines, hotel...

183 | ]]>
184 | Sun, 3 Jul 2016 08:00:00 PDT 185 |
186 | 187 | 'Deer Hunter,' 'Heaven's Gate' director Michael Cimino dies at 77; the film community reacts 188 | Deborah Vankin 189 | http://www.latimes.com/la-et-mn-michael-cimino-20160702-snap-htmlstory.html 190 | 193 | Sun, 3 Jul 2016 07:38:00 PDT 194 | 195 | 196 | 197 | Inside Donald Trump's secret smear campaign against a tribal casino 198 | Joseph Tanfani 199 | http://www.latimes.com/politics/la-na-pol-trump-anti-indian-campaign-20160630-snap-story.html 200 | 203 | Sun, 3 Jul 2016 07:13:00 PDT 204 | 205 | 206 | 207 | Letters: To tip or not to tip, plus spritzing while driving 208 | http://www.latimes.com/la-tr-letters-20160626-snap-story.html 209 | I just returned from two weeks in England: one week in Bath, in the Cotswolds, and one week in London. I tipped everywhere I would in L.A., and the recipients were very appreciative [“Tips on Tipping,” On the Spot by Catharine Hamm, June 26].

Upon leaving for London, I took a taxi to the train...

211 | ]]>
212 | Sun, 3 Jul 2016 06:30:00 PDT 213 | 214 |
215 | 216 | Tesla and Google are both driving toward autonomous vehicles. Which company is taking the better route? 217 | Tracey Lien 218 | http://www.latimes.com/la-fi-hy-tesla-google-20160701-snap-story.html 219 | Google and Tesla agree autonomous vehicles will make streets safer, and both are racing toward a driverless future. But when Google tested its self-driving car prototype on employees a few years ago, it noticed something that would take it down a different path from Tesla.

Once behind the wheel...

221 | ]]>
222 | Sun, 3 Jul 2016 06:00:00 PDT 223 | 224 |
225 | 226 | CEOs are getting more political, but consumers aren't buying it 227 | Jena McGregor 228 | http://www.latimes.com/la-fi-on-leadership-ceo-activism-20160630-snap-story.html 229 | Starbucks Chief Executive Howard Schultz has spoken out on gun control, race relations and the "cynicism, despair, division, exclusion, fear and yes -- indifference" in America today.

Facebook founder and CEO Mark Zuckerberg said at a developer conference this year that "I hear fearful voices calling...

231 | ]]>
232 | Sun, 3 Jul 2016 06:00:00 PDT 233 | 234 |
235 | 236 | Feedback: Why type when you can write? 237 | http://www.latimes.com/la-ca-0703-feedback-20160627-snap-story.html 238 | Buried within Laila Lalami’s entertaining essay [“The Power of Procrastination,” June 24] are the words “the blank screen.” Therein lies her problem. For years, well-intentioned friends have urged me to use a computer instead of writing longhand on yellow lined paper, because, they say, “it is...

240 | ]]>
241 | Sun, 3 Jul 2016 06:00:00 PDT 242 | 243 |
244 | 245 | Learn all about exploring Yosemite National Park from the experts at REI 246 | http://www.latimes.com/la-tr-films-20160620-snap-story.html 247 | YOSEMITE

Workshop

REI experts will share tips on exploring Yosemite National Park.

When, where: 7 p.m. Thursday at the REI store in Arcadia, 214 N. Santa Anita Ave.

Admission, info: Free. (626) 447-1062

PATAGONIA

Presentation

Explorer and mountain guide Tad McCrea will share his experiences and...

249 | ]]>
250 | Sun, 3 Jul 2016 06:00:00 PDT 251 |
252 |
253 |
254 | -------------------------------------------------------------------------------- /cache/feed/spiegel.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | SPIEGEL ONLINE - Schlagzeilen 6 | http://www.spiegel.de 7 | Deutschlands führende Nachrichtenseite. Alles Wichtige aus Politik, Wirtschaft, Sport, Kultur, Wissenschaft, Technik und mehr. 8 | de 9 | Thu, 05 Nov 2015 23:50:24 +0100 10 | Thu, 05 Nov 2015 23:50:24 +0100 11 | 12 | SPIEGEL ONLINE 13 | http://www.spiegel.de 14 | http://www.spiegel.de/static/sys/logo_120x61.gif 15 | 16 | 17 | Unglück in Bayern: Zug erfasst Schwertransporter - mehrere Tote 18 | http://www.spiegel.de/panorama/gesellschaft/bayern-zug-erfasst-schwertransporter-auf-bahnuebergang-a-1061387.html#ref=rss 19 | Schweres Unglück in der Oberpfalz: Ein Lkw und ein Zug sind auf einem Bahnübergang kollidiert, beide fingen Feuer. Laut Polizei gab es Tote und Verletzte. 20 | Panorama 21 | Thu, 05 Nov 2015 23:42:00 +0100 22 | http://www.spiegel.de/panorama/gesellschaft/bayern-zug-erfasst-schwertransporter-auf-bahnuebergang-a-1061387.html 23 | 24 | 25 | 26 | US-Republikaner: Trump und Carson bekommen Schutz vom Secret Service 27 | http://www.spiegel.de/politik/ausland/usa-secret-service-bewacht-trump-und-carson-von-den-republikanern-a-1061385.html#ref=rss 28 | Moderne Insignien der Macht für Donald Trump und Ben Carson: Die beiden Präsidentschaftskandidaten der US-Republikaner sollen nun rund um die Uhr vom Secret Service beschützt werden - auch, weil sie in Umfragen vorne liegen. 29 | Politik 30 | Thu, 05 Nov 2015 23:23:51 +0100 31 | http://www.spiegel.de/politik/ausland/usa-secret-service-bewacht-trump-und-carson-von-den-republikanern-a-1061385.html 32 | Moderne Insignien der Macht für Donald Trump und Ben Carson: Die beiden Präsidentschaftskandidaten der US-Republikaner sollen nun rund um die Uhr vom Secret Service beschützt werden - auch, weil sie in Umfragen vorne liegen.]]> 33 | 34 | 35 | 36 | Europa League: Bobadilla lässt Augsburg jubeln 37 | http://www.spiegel.de/sport/fussball/europa-league-fc-augsburg-gewinnt-gegen-az-alkmaar-deutlich-a-1061379.html#ref=rss 38 | Chance gewahrt: Der FC Augsburg hat nach einem deutlichen Sieg gegen AZ Alkmaar gute Aussichten auf das Erreichen der K.o.-Phase. Beim Bundesligisten traf ein Spieler dreifach. 39 | Sport 40 | Thu, 05 Nov 2015 22:59:00 +0100 41 | http://www.spiegel.de/sport/fussball/europa-league-fc-augsburg-gewinnt-gegen-az-alkmaar-deutlich-a-1061379.html 42 | Chance gewahrt: Der FC Augsburg hat nach einem deutlichen Sieg gegen AZ Alkmaar gute Aussichten auf das Erreichen der K.o.-Phase. Beim Bundesligisten traf ein Spieler dreifach.]]> 43 | 44 | 45 | 46 | Remis gegen Sparta Prag: Schalkes Krise geht in der Europa League weiter 47 | http://www.spiegel.de/sport/fussball/fc-schalke-spielt-in-der-europa-league-bei-sparta-prag-remis-a-1061382.html#ref=rss 48 | Der FC Schalke geht angeschlagen ins Derby gegen Borussia Dortmund. In der Europa League reichte es bei Sparta Prag nur zu einem Unentschieden, vor allem die verletzten Abwehrspieler bereiten Trainer André Breitenreiter Sorgen. 49 | Sport 50 | Thu, 05 Nov 2015 22:58:00 +0100 51 | http://www.spiegel.de/sport/fussball/fc-schalke-spielt-in-der-europa-league-bei-sparta-prag-remis-a-1061382.html 52 | Der FC Schalke geht angeschlagen ins Derby gegen Borussia Dortmund. 
In der Europa League reichte es bei Sparta Prag nur zu einem Unentschieden, vor allem die verletzten Abwehrspieler bereiten Trainer André Breitenreiter Sorgen.]]> 53 | 54 | 55 | 56 | Koalitionsstreit in der Flüchtlingskrise: Erledigt 57 | http://www.spiegel.de/politik/deutschland/fluechtlingskrise-merkel-seehofer-gabriel-finden-kompromiss-a-1061380.html#ref=rss 58 | Transitzonen + Einreisezentren = Aufnahme-Einrichtungen. Das ist der Kompromiss der Großen Koalition. CSU-Chef Seehofer erscheint als Verlierer des Gipfeltreffens. Ist er das wirklich? 59 | Politik 60 | Thu, 05 Nov 2015 22:18:00 +0100 61 | http://www.spiegel.de/politik/deutschland/fluechtlingskrise-merkel-seehofer-gabriel-finden-kompromiss-a-1061380.html 62 | Transitzonen + Einreisezentren = Aufnahme-Einrichtungen. Das ist der Kompromiss der Großen Koalition. CSU-Chef Seehofer erscheint als Verlierer des Gipfeltreffens. Ist er das wirklich?]]> 63 | 64 | 65 | 66 | Schwedischer Minister zu Flüchtlingen: "Bleibt in Deutschland" 67 | http://www.spiegel.de/politik/ausland/schweden-kann-fluechtlinge-laut-minister-nicht-mehr-unterbringen-a-1061378.html#ref=rss 68 | Schweden ist für seine liberale Asypolitik bekannt, doch jetzt will der Migrationsminister Flüchtlinge fernhalten - das Land habe nicht genug Unterkünfte. Morgan Johansson hat Asylsuchende aufgefordert, nach Deutschland zurückzukehren. 69 | Politik 70 | Thu, 05 Nov 2015 21:48:00 +0100 71 | http://www.spiegel.de/politik/ausland/schweden-kann-fluechtlinge-laut-minister-nicht-mehr-unterbringen-a-1061378.html 72 | Schweden ist für seine liberale Asypolitik bekannt, doch jetzt will der Migrationsminister Flüchtlinge fernhalten - das Land habe nicht genug Unterkünfte. Morgan Johansson hat Asylsuchende aufgefordert, nach Deutschland zurückzukehren.]]> 73 | 74 | 75 | 76 | Pope-Pop: Franziskus veröffentlicht Rockalbum 77 | http://www.spiegel.de/panorama/gesellschaft/papst-cd-franziskus-veroeffentlicht-rock-album-a-1061376.html#ref=rss 78 | Röhrende Bässe, E-Gitarren-Soli und dazu die Predigten von Franziskus: Über die musikalische Qualität des ersten Rock-Pop-Albums des Papstes lässt sich streiten, doch das Anliegen ist hehr. 79 | Panorama 80 | Thu, 05 Nov 2015 21:38:00 +0100 81 | http://www.spiegel.de/panorama/gesellschaft/papst-cd-franziskus-veroeffentlicht-rock-album-a-1061376.html 82 | Röhrende Bässe, E-Gitarren-Soli und dazu die Predigten von Franziskus: Über die musikalische Qualität des ersten Rock-Pop-Albums des Papstes lässt sich streiten, doch das Anliegen ist hehr.]]> 83 | 84 | 85 | 86 | Europa League: Klopp siegt mit Liverpool auch international 87 | http://www.spiegel.de/sport/fussball/juergen-klopp-siegt-mit-liverpool-in-der-europa-league-a-1061373.html#ref=rss 88 | Mehr als 3000 Kilometer Anreise und frostige Temperaturen haben sich gelohnt: Jürgen Klopp hat mit Liverpool in Kasan seinen dritten Sieg in Folge gefeiert. Neapel steht schon in der Zwischenrunde. 89 | Sport 90 | Thu, 05 Nov 2015 20:56:00 +0100 91 | http://www.spiegel.de/sport/fussball/juergen-klopp-siegt-mit-liverpool-in-der-europa-league-a-1061373.html 92 | Mehr als 3000 Kilometer Anreise und frostige Temperaturen haben sich gelohnt: Jürgen Klopp hat mit Liverpool in Kasan seinen dritten Sieg in Folge gefeiert. 
Neapel steht schon in der Zwischenrunde.]]> 93 | 94 | 95 | 96 | Europa League: Dortmund siegt und schafft es vorzeitig in die Zwischenrunde 97 | http://www.spiegel.de/sport/fussball/europa-league-borussia-dortmund-bezwingt-fk-qaebaelae-a-1061372.html#ref=rss 98 | Borussia Dortmund ist dank eines überzeugenden Sieges gegen FK Qäbälä frühzeitig in die K.o-Runde der Europa League eingezogen. Der BVB konnte es sich dabei sogar erlauben, einige Leistungsträger zu schonen. 99 | Sport 100 | Thu, 05 Nov 2015 20:53:22 +0100 101 | http://www.spiegel.de/sport/fussball/europa-league-borussia-dortmund-bezwingt-fk-qaebaelae-a-1061372.html 102 | Borussia Dortmund ist dank eines überzeugenden Sieges gegen FK Qäbälä frühzeitig in die K.o-Runde der Europa League eingezogen. Der BVB konnte es sich dabei sogar erlauben, einige Leistungsträger zu schonen.]]> 103 | 104 | 105 | 106 | Nasa-Daten zeigen: Sonneneruptionen reißen Mars-Atmosphäre weg 107 | http://www.spiegel.de/wissenschaft/weltall/sonneneruptionen-reissen-mars-atmosphaere-weg-a-1061367.html#ref=rss 108 | Verglichen mit der Erde hat der Mars eine extrem dünne Atmosphäre. Schuld daran sind wohl massive Sonnenstürme. Darauf deuten Messungen der Nasa-Sonde "Maven" hin. 109 | Wissenschaft 110 | Thu, 05 Nov 2015 20:04:00 +0100 111 | http://www.spiegel.de/wissenschaft/weltall/sonneneruptionen-reissen-mars-atmosphaere-weg-a-1061367.html 112 | Verglichen mit der Erde hat der Mars eine extrem dünne Atmosphäre. Schuld daran sind wohl massive Sonnenstürme. Darauf deuten Messungen der Nasa-Sonde "Maven" hin.]]> 113 | 114 | 115 | 116 | James-Bond-Quiz: Lizenz zum Danebenliegen 117 | http://www.spiegel.de/kultur/kino/james-bond-quiz-haben-sie-das-zeug-zum-doppel-null-agenten-a-1061142.html#ref=rss 118 | Sie wissen alles über James Bond? Kennen alle Girls, alle Autos, alle Action-Szenen? Bestimmt nicht! Das schwerste Bond-Quiz, zu dem SPIEGEL ONLINE imstande ist, wird Sie eines Besseren belehren. 119 | Kultur 120 | Thu, 05 Nov 2015 19:40:30 +0100 121 | http://www.spiegel.de/kultur/kino/james-bond-quiz-haben-sie-das-zeug-zum-doppel-null-agenten-a-1061142.html 122 | Sie wissen alles über James Bond? Kennen alle Girls, alle Autos, alle Action-Szenen? Bestimmt nicht! Das schwerste Bond-Quiz, zu dem SPIEGEL ONLINE imstande ist, wird Sie eines Besseren belehren.]]> 123 | 124 | 125 | 126 | James-Bond-Quiz: Haben Sie das Zeug zum Doppel-Null-Agenten? 127 | http://www.spiegel.de/quiztool/quiztool-64516.html#ref=rss 128 | Mit welchem Bond-Girl war Womanizer James Bond verheiratet? Wie tötete 007 den Bösewicht Sanchez in "Octopussy" und welcher der bislang vierundzwanzig Streifen ist kein "richtiger" Bond? Testen Sie Ihr Geheimagenten-Wissen im Quiz. 129 | Kultur 130 | Thu, 05 Nov 2015 19:39:31 +0100 131 | http://www.spiegel.de/quiztool/quiztool-64516.html 132 | 133 | 134 | 135 | Drama "El Club": Priester ohne Reue 136 | http://www.spiegel.de/kultur/kino/el-club-filmkritik-die-suenden-der-seelsorger-a-1060686.html#ref=rss 137 | Vier suspendierte Priester leben an der Nordküste Chiles, schuldig und isoliert. Nach einer Gewalttat zwingt ein Ermittler sie dazu, sich ihrer eigenen Vergangenheit zu stellen. "El Club" ist packender Mystery-Thriller und Abrechnung zugleich. 
138 | Kultur 139 | Thu, 05 Nov 2015 19:25:00 +0100 140 | http://www.spiegel.de/kultur/kino/el-club-filmkritik-die-suenden-der-seelsorger-a-1060686.html 141 | 142 | 143 | 144 | Berühmter Historiker: Hans Mommsen ist tot 145 | http://www.spiegel.de/kultur/gesellschaft/hans-mommsen-ist-tot-a-1061374.html#ref=rss 146 | Er zählte zu den streitbarsten Historikern der Nachkriegszeit: Hans Mommsen prägte die Forschung über die NS-Zeit. Jetzt ist er an seinem 85. Geburtstag in Bayern gestorben. 147 | Kultur 148 | Thu, 05 Nov 2015 19:24:00 +0100 149 | http://www.spiegel.de/kultur/gesellschaft/hans-mommsen-ist-tot-a-1061374.html 150 | Er zählte zu den streitbarsten Historikern der Nachkriegszeit: Hans Mommsen prägte die Forschung über die NS-Zeit. Jetzt ist er an seinem 85. Geburtstag in Bayern gestorben.]]> 151 | 152 | 153 | 154 | Mafia-Prozess in Italien: "Die Stadträte müssen unseren Befehlen folgen" 155 | http://www.spiegel.de/panorama/justiz/rom-mafia-capitale-um-massimo-carminati-vor-gericht-a-1061316.html#ref=rss 156 | In Italien hat ein Riesenprozess gegen die Hauptstadtmafia begonnen: 250 Zeugen sollen gehört werden, 46 Angeklagte stehen vor Gericht. Im Mittelpunkt steht Massimo "der Schwarze" Carminati, selbsternannter König von Rom. 157 | Panorama 158 | Thu, 05 Nov 2015 19:00:00 +0100 159 | http://www.spiegel.de/panorama/justiz/rom-mafia-capitale-um-massimo-carminati-vor-gericht-a-1061316.html 160 | In Italien hat ein Riesenprozess gegen die Hauptstadtmafia begonnen: 250 Zeugen sollen gehört werden, 46 Angeklagte stehen vor Gericht. Im Mittelpunkt steht Massimo "der Schwarze" Carminati, selbsternannter König von Rom.]]> 161 | 162 | 163 | 164 | Push-Nachrichten-App Notify: Facebooks erdrückende Umarmung 165 | http://www.spiegel.de/netzwelt/netzpolitik/facebook-notify-news-app-soll-push-nachrichten-schicken-a-1061368.html#ref=rss 166 | Medienberichten zufolge will Facebook eine Nachrichten-App auf den Markt bringen. Push-Meldungen diverser Medien sollen darin nach den Wünschen der Nutzer zusammengestellt werden. Das passt zu Facebooks Umarmungsstrategie gegenüber Medien. 167 | Netzwelt 168 | Thu, 05 Nov 2015 18:43:00 +0100 169 | http://www.spiegel.de/netzwelt/netzpolitik/facebook-notify-news-app-soll-push-nachrichten-schicken-a-1061368.html 170 | Medienberichten zufolge will Facebook eine Nachrichten-App auf den Markt bringen. Push-Meldungen diverser Medien sollen darin nach den Wünschen der Nutzer zusammengestellt werden. Das passt zu Facebooks Umarmungsstrategie gegenüber Medien. ]]> 171 | 172 | 173 | 174 | Emma Watson trifft Malala Yousafzai: Endlich Feministin 175 | http://www.spiegel.de/panorama/leute/emma-watson-und-malala-yousafzai-ueber-frauenrechte-a-1061346.html#ref=rss 176 | Malala Yousafzai und Emma Watson setzen sich beide für die Rechte der Frauen ein. Als Feministin wollte sich Yousafzai jedoch nie bezeichnen - bis sie von Watson überzeugt wurde. 177 | Panorama 178 | Thu, 05 Nov 2015 18:41:00 +0100 179 | http://www.spiegel.de/panorama/leute/emma-watson-und-malala-yousafzai-ueber-frauenrechte-a-1061346.html 180 | 181 | 182 | 183 | Nach Hackerangriff: Abgeordnete müssen jetzt Passwörter mit mindestens acht Zeichen verwenden 184 | http://www.spiegel.de/politik/deutschland/hackerangriff-bundestag-ruestet-ein-bisschen-auf-a-1061332.html#ref=rss 185 | Was hat der Bundestag aus dem schweren Hackerangriff gelernt? Das Parlament schränkt jetzt sogar das Internet für Abgeordnete ein - die Angreifer sind weiterhin unentdeckt. 
186 | Politik 187 | Thu, 05 Nov 2015 18:36:00 +0100 188 | http://www.spiegel.de/politik/deutschland/hackerangriff-bundestag-ruestet-ein-bisschen-auf-a-1061332.html 189 | Was hat der Bundestag aus dem schweren Hackerangriff gelernt? Das Parlament schränkt jetzt sogar das Internet für Abgeordnete ein - die Angreifer sind weiterhin unentdeckt. ]]> 190 | 191 | 192 | 193 | Kraftfahrt-Bundesamt: Machtlose Aufseher 194 | http://www.spiegel.de/wirtschaft/soziales/kraftfahrt-bundesamt-die-ohnmaechtigen-aufseher-a-1061338.html#ref=rss 195 | Die Abgasaffäre bei Volkswagen hat auch den Ruf des Kraftfahrt-Bundesamts schwer beschädigt. Doch die Vorwürfe zielen in die falsche Richtung. Denn die Ansagen kommen aus Berlin. 196 | Wirtschaft 197 | Thu, 05 Nov 2015 18:16:00 +0100 198 | http://www.spiegel.de/wirtschaft/soziales/kraftfahrt-bundesamt-die-ohnmaechtigen-aufseher-a-1061338.html 199 | 200 | 201 | 202 | Mutmaßlicher NSU-Helfer: Die Handy-Kontakte des André E. 203 | http://www.spiegel.de/panorama/nsu-prozess-die-handy-kontakte-des-andre-e-a-1020286.html#ref=rss 204 | Die große Aufmerksamkeit für Beate Zschäpe kann dem Mitangeklagten André E. nur recht sein: Er unterstützte den NSU laut Anklage über Jahre. Seine Handydaten zeigen, wie viele Kontakte er zu mutmaßlichen Kriminellen pflegte. 205 | Panorama 206 | Thu, 05 Nov 2015 18:04:00 +0100 207 | http://www.spiegel.de/panorama/nsu-prozess-die-handy-kontakte-des-andre-e-a-1020286.html 208 | Die große Aufmerksamkeit für Beate Zschäpe kann dem Mitangeklagten André E. nur recht sein: Er unterstützte den NSU laut Anklage über Jahre. Seine Handydaten zeigen, wie viele Kontakte er zu mutmaßlichen Kriminellen pflegte.]]> 209 | 210 | 211 | 212 | Gipfel in Berlin: Koalition einigt sich auf Registrierzentren für Flüchtlinge 213 | http://www.spiegel.de/politik/deutschland/fluechtlinge-koalition-einigt-sich-auf-registrierzentren-a-1061370.html#ref=rss 214 | Keine Transitzonen an der Grenze, dafür spezielle Aufnahme-Einrichtungen für Flüchtlinge aus sicheren Herkunftsländern: Auf diesen Kompromiss haben sich die Spitzen der Großen Koalition bei ihrem Flüchtlingsgipfel geeinigt. 215 | Politik 216 | Thu, 05 Nov 2015 17:59:00 +0100 217 | http://www.spiegel.de/politik/deutschland/fluechtlinge-koalition-einigt-sich-auf-registrierzentren-a-1061370.html 218 | Keine Transitzonen an der Grenze, dafür spezielle Aufnahme-Einrichtungen für Flüchtlinge aus sicheren Herkunftsländern: Auf diesen Kompromiss haben sich die Spitzen der Großen Koalition bei ihrem Flüchtlingsgipfel geeinigt.]]> 219 | 220 | 221 | 222 | Vorratsdatenspeicherung: Thüringen will im Bundesrat Vermittlungsausschuss anrufen 223 | http://www.spiegel.de/netzwelt/web/vorratsdatenspeicherung-thueringen-will-vermittlungsausschuss-anrufen-a-1061363.html#ref=rss 224 | Am Freitag ist die umstrittene Vorratsdatenspeicherung Thema im Bundesrat. Thüringens Justizminister hat bereits angekündigt, wegen Datenschutzbedenken den Vermittlungsausschuss anrufen zu wollen. Seine Chancen stehen aber schlecht. 
225 | Netzwelt 226 | Thu, 05 Nov 2015 17:49:00 +0100 227 | http://www.spiegel.de/netzwelt/web/vorratsdatenspeicherung-thueringen-will-vermittlungsausschuss-anrufen-a-1061363.html 228 | 229 | 230 | 231 | Teuerste H&M-Kollektion: Elitäres für die Massen 232 | http://www.spiegel.de/stil/balmain-entwirft-fuer-h-m-elitaeres-fuer-die-massen-a-1060771.html#ref=rss 233 | Es hat mit Karl Lagerfeld funktioniert und mit Stella McCartney: Top-Designer entwerfen für H&M, Kunden stürmen die Läden - und zahlen ein Vielfaches der üblichen Preise. Die neue Kollektion kommt von Olivier Rousteing und ist so teuer wie keine zuvor. Zu teuer? Stimmen Sie ab! 234 | Stil 235 | Thu, 05 Nov 2015 17:47:00 +0100 236 | http://www.spiegel.de/stil/balmain-entwirft-fuer-h-m-elitaeres-fuer-die-massen-a-1060771.html 237 | 238 | 239 | 240 | Flüchtlingsfamilien: Kinder ohne Papiere dürfen zur Schule 241 | http://www.spiegel.de/schulspiegel/kinder-ohne-aufenthaltsrecht-duerfen-zur-schule-a-1060431.html#ref=rss 242 | Alle Kinder haben ein Recht auf Bildung - auch wenn sie nicht alle Papiere haben, die Deutschland vorschreibt. Doch nehmen Schulen solche Kinder überhaupt auf? Und müssen sie das an die Behörden melden? Die Antworten auf die wichtigsten Fragen. 243 | SchulSPIEGEL 244 | Thu, 05 Nov 2015 17:41:00 +0100 245 | http://www.spiegel.de/schulspiegel/kinder-ohne-aufenthaltsrecht-duerfen-zur-schule-a-1060431.html 246 | Alle Kinder haben ein Recht auf Bildung - auch wenn sie nicht alle Papiere haben, die Deutschland vorschreibt. Doch nehmen Schulen solche Kinder überhaupt auf? Und müssen sie das an die Behörden melden? Die Antworten auf die wichtigsten Fragen.]]> 247 | 248 | 249 | 250 | Afghanistan: Ärzte ohne Grenzen wirft USA vorsätzlichen Angriff auf Klinik vor 251 | http://www.spiegel.de/politik/ausland/aerzte-ohne-grenzen-werfen-usa-vorsaetzlichen-angriff-auf-klinik-in-kunduz-vor-a-1061362.html#ref=rss 252 | Bei einem US-Luftangriff auf ein Krankenhaus in Kunduz wurden vor einem Monat 30 Menschen getötet. Die Hilfsorganisation Ärzte ohne Grenzen bezweifelt, dass die Armee aus Versehen handelte. 253 | Politik 254 | Thu, 05 Nov 2015 17:25:00 +0100 255 | http://www.spiegel.de/politik/ausland/aerzte-ohne-grenzen-werfen-usa-vorsaetzlichen-angriff-auf-klinik-in-kunduz-vor-a-1061362.html 256 | Bei einem US-Luftangriff auf ein Krankenhaus in Kunduz wurden vor einem Monat 30 Menschen getötet. Die Hilfsorganisation Ärzte ohne Grenzen bezweifelt, dass die Armee aus Versehen handelte. ]]> 257 | 258 | 259 | 260 | Elendspanorama "Vorbereitung auf das nächste Leben": Amerika ist erledigt 261 | http://www.spiegel.de/kultur/literatur/vorbereitung-auf-das-naechste-leben-von-atticus-lish-rezension-a-1060858.html#ref=rss 262 | Eine Geschichte vom unteren Rand der Gesellschaft: Atticus Lish gelang mit seinem Debütroman "Vorbereitung auf das nächste Leben" ein Überraschungshit in den USA, der das Pathos zurück in die Gegenwartsliteratur bringt. 263 | Kultur 264 | Thu, 05 Nov 2015 17:17:00 +0100 265 | http://www.spiegel.de/kultur/literatur/vorbereitung-auf-das-naechste-leben-von-atticus-lish-rezension-a-1060858.html 266 | 267 | 268 | 269 | Mehrtägiger Streik: Was Lufthansa-Passagiere jetzt wissen müssen 270 | http://www.spiegel.de/reise/aktuell/lufthansa-streik-was-passagiere-wissen-muessen-a-1061342.html#ref=rss 271 | Lufthansa-Kunden müssen sich auf einen mehrtägigen Streik mit vielen Flugausfällen einrichten. Die Flugbegleiter wollen aber nicht vor Freitagmittag mit ihrem Ausstand beginnen. 
Hier finden Sie Antworten auf die wichtigsten Fragen. 272 | Reise 273 | Thu, 05 Nov 2015 17:08:00 +0100 274 | http://www.spiegel.de/reise/aktuell/lufthansa-streik-was-passagiere-wissen-muessen-a-1061342.html 275 | Lufthansa-Kunden müssen sich auf einen mehrtägigen Streik mit vielen Flugausfällen einrichten. Die Flugbegleiter wollen aber nicht vor Freitagmittag mit ihrem Ausstand beginnen. Hier finden Sie Antworten auf die wichtigsten Fragen.]]> 276 | 277 | 278 | 279 | Verhandlungen geplatzt: Lufthansa-Streik startet am Freitag 280 | http://www.spiegel.de/wirtschaft/unternehmen/lufthansa-streik-am-freitag-a-1061357.html#ref=rss 281 | Auch der letzte Einigungsversuch ist gescheitert: Die Flugbegleiter der Lufthansa streiken ab Freitag - mit Rücksicht auf die Kunden aber erst ab 12 Uhr. 282 | Wirtschaft 283 | Thu, 05 Nov 2015 17:04:00 +0100 284 | http://www.spiegel.de/wirtschaft/unternehmen/lufthansa-streik-am-freitag-a-1061357.html 285 | 286 | 287 | 288 | 1000 Kilometer entfernt: US-Ermittler entdecken vermissten Jungen nach 13 Jahren 289 | http://www.spiegel.de/panorama/ohio-vermisster-junge-in-den-usa-nach-13-jahren-aufgetaucht-a-1061350.html#ref=rss 290 | Mit fünf Jahren verschwand ein Junge aus Alabama. Die Mutter vermutete, der Vater habe den Jungen entführt. Jahre später haben ihn nun Ermittler in Ohio entdeckt - durch einen Zufall. 291 | Panorama 292 | Thu, 05 Nov 2015 17:00:00 +0100 293 | http://www.spiegel.de/panorama/ohio-vermisster-junge-in-den-usa-nach-13-jahren-aufgetaucht-a-1061350.html 294 | 295 | 296 | 297 | Pressekompass: Sterbehilfevereine legal oder illegal? Das sagen die Medien 298 | http://www.spiegel.de/gesundheit/diagnose/pressekompass-zur-sterbehilfe-debatte-im-bundestag-a-1061351.html#ref=rss 299 | Totales Verbot oder liberale Freigabe - die vier Gesetzentwürfe zur Sterbehilfe könnten unterschiedlicher nicht sein. Wie entscheidet der Bundestag am Freitag? Der Pressekompass zeigt Meinungstrends der Medien. 300 | Gesundheit 301 | Thu, 05 Nov 2015 16:59:00 +0100 302 | http://www.spiegel.de/gesundheit/diagnose/pressekompass-zur-sterbehilfe-debatte-im-bundestag-a-1061351.html 303 | Totales Verbot oder liberale Freigabe - die vier Gesetzentwürfe zur Sterbehilfe könnten unterschiedlicher nicht sein. Wie entscheidet der Bundestag am Freitag? Der Pressekompass zeigt Meinungstrends der Medien.]]> 304 | 305 | 306 | 307 | 308 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | # This file is responsible for configuring your application 2 | # and its dependencies with the aid of the Mix.Config module. 3 | use Mix.Config 4 | 5 | # This configuration is loaded before any dependency and is restricted 6 | # to this project. If another project depends on this project, this 7 | # file won't be loaded nor affect the parent project. For this reason, 8 | # if you want to provide default values for your application for 9 | # third-party users, it should be done in your "mix.exs" file. 10 | 11 | # You can configure your application as: 12 | # 13 | # config :scrape, key: :value 14 | # 15 | # and access this configuration in your application as: 16 | # 17 | # Application.get_env(:scrape, :key) 18 | # 19 | # You can also configure a third-party app: 20 | # 21 | # config :logger, level: :info 22 | # 23 | 24 | # It is also possible to import configuration files, relative to this 25 | # directory. 
For example, you can emulate configuration per environment 26 | # by uncommenting the line below and defining dev.exs, test.exs and such. 27 | # Configuration from the imported file will override the ones defined 28 | # here (which is why it is important to import them last). 29 | # 30 | # import_config "#{Mix.env()}.exs" 31 | -------------------------------------------------------------------------------- /lib/scrape.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape do 2 | @moduledoc """ 3 | Elixir Toolkit for extracting meaningful structured data out of 4 | common web resources. 5 | 6 | This process is often called "web-scraping". Actually, the normalization 7 | and transformation of data into a well-known structured form is also 8 | known as "data engineering", which in turn is the prerequisite for most 9 | data-science/machine-learning/... algorithms in the wild. 10 | 11 | Currently Scrape supports 3 types of common web data: 12 | 13 | * Feeds: RSS or Atom XML feeds 14 | * Domains: "root" pages of a web presence 15 | * Articles: "content" pages of a web presence 16 | """ 17 | 18 | @doc """ 19 | Given a valid URL, return structured data of the content. 20 | 21 | This function is intended for "content" pages. 22 | """ 23 | 24 | @spec article(String.t()) :: {:ok, map()} | {:error, any()} 25 | @spec article(String.t(), [{atom(), any()}]) :: {:ok, map()} | {:error, any()} 26 | 27 | def article(url, opts \\ []) do 28 | Scrape.Flow.Article.from_url(url, opts) 29 | end 30 | 31 | @doc """ 32 | Same as `article/2` but will return the result directly or raise an 33 | error if the result is not `:ok`. 34 | """ 35 | 36 | def article!(url, opts \\ []) do 37 | {:ok, article} = Scrape.Flow.Article.from_url(url, opts) 38 | article 39 | end 40 | 41 | @doc """ 42 | Given a valid URL, return structured data of the domain. 43 | 44 | This function is intended for "root" pages of a web presence. The most 45 | important use case for Scrape is to detect possible feeds for the domain. 46 | """ 47 | 48 | @spec domain(String.t()) :: {:ok, map()} | {:error, any()} 49 | @spec domain(String.t(), [{atom(), any()}]) :: {:ok, map()} | {:error, any()} 50 | 51 | def domain(url, opts \\ []) do 52 | Scrape.Flow.Domain.from_url(url, opts) 53 | end 54 | 55 | @doc """ 56 | Same as `domain/2` but will return the result directly or raise an 57 | error if the result is not `:ok`. 58 | """ 59 | 60 | def domain!(url, opts \\ []) do 61 | {:ok, domain} = Scrape.Flow.Domain.from_url(url, opts) 62 | domain 63 | end 64 | 65 | @doc """ 66 | Given a valid URL, return structured data of the feed. 67 | """ 68 | 69 | @spec feed(String.t()) :: {:ok, map()} | {:error, any()} 70 | @spec feed(String.t(), [{atom(), any()}]) :: {:ok, map()} | {:error, any()} 71 | 72 | def feed(url, opts \\ []) do 73 | Scrape.Flow.Feed.from_url(url, opts) 74 | end 75 | 76 | @doc """ 77 | Same as `feed/2` but will return the result directly or raise an error 78 | if the result is not `:ok`.
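## Example

A hypothetical call (the URL is a placeholder and fetching it requires network access); the returned map contains the fields assembled by `Scrape.Flow.Feed`:

    Scrape.feed!("https://example.com/feed.xml")
    # => %{url: ..., title: ..., description: ..., website_url: ..., items: [...]}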
79 | """ 80 | 81 | def feed!(url, opts \\ []) do 82 | {:ok, feed} = Scrape.Flow.Feed.from_url(url, opts) 83 | feed 84 | end 85 | end 86 | -------------------------------------------------------------------------------- /lib/scrape/application.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Application do 2 | # See https://hexdocs.pm/elixir/Application.html 3 | # for more information on OTP Applications 4 | @moduledoc false 5 | 6 | use Application 7 | 8 | def start(_type, _args) do 9 | # List all child processes to be supervised 10 | children = [ 11 | # Starts a worker by calling: Scrape.Worker.start_link(arg) 12 | # {Scrape.Worker, arg} 13 | ] 14 | 15 | # See https://hexdocs.pm/elixir/Supervisor.html 16 | # for other strategies and supported options 17 | opts = [strategy: :one_for_one, name: Scrape.Supervisor] 18 | Supervisor.start_link(children, opts) 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/scrape/flow.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow do 2 | @moduledoc """ 3 | Logic Module for implementing linear data processing workflows. 4 | 5 | Uses a "token" approach to store/retrieve values and persists a pipeline 6 | state that can be halted at any time. In case that something goes wrong, 7 | the pipeline will be halted and an error object will be returned with the 8 | occured error. Therefore, the pipeline should never raise an actual exception. 9 | """ 10 | 11 | @typedoc """ 12 | Intermediate state object that holds everything relevant for the data 13 | processing work flow. `state` holds general processing information, `assigns` 14 | are the user-level data fields and `options` contains a keyword list for, 15 | well, configuration options. 16 | """ 17 | 18 | @type flow :: %__MODULE__{ 19 | state: %{ 20 | halted: boolean(), 21 | error: nil | any() 22 | }, 23 | assigns: map(), 24 | options: [{atom(), any()}] 25 | } 26 | 27 | defstruct(state: %{halted: false, error: nil}, assigns: %{}, options: []) 28 | 29 | @doc """ 30 | Initiate a new data processing flow with optional configuration. 31 | 32 | NOTE: the options are currently not used but will be in upcoming versions. 33 | 34 | ## Example 35 | iex> Flow.start() 36 | %Flow{state: %{halted: false, error: nil}, assigns: %{}, options: []} 37 | """ 38 | 39 | @spec start([{atom(), any()}]) :: flow 40 | 41 | def start(opts \\ []) do 42 | %__MODULE__{options: opts} 43 | end 44 | 45 | @doc """ 46 | Declare a new value in the data flow. 47 | 48 | Will do nothing if the flow got halted previously. If a function is given, 49 | and it raises an exception, the pipeline will catch the error and transform 50 | into a halted state. 51 | """ 52 | 53 | @spec assign(flow, [{atom(), any()}]) :: flow 54 | 55 | def assign(%__MODULE__{state: %{halted: true}} = flow, _) do 56 | flow 57 | end 58 | 59 | def assign(%__MODULE__{} = flow, [{k, v}]) when not is_function(v) do 60 | %{flow | assigns: Map.put(flow.assigns, k, v)} 61 | end 62 | 63 | def assign(%__MODULE__{} = flow, [{k, v}]) do 64 | try do 65 | %{flow | assigns: Map.put(flow.assigns, k, v.(flow.assigns))} 66 | rescue 67 | error -> %{flow | state: %{halted: true, error: {:assign, k, error}}} 68 | end 69 | end 70 | 71 | @doc """ 72 | Select keys from the flow assigns and return a map with the chosen fields. 73 | 74 | Will result in an error object if the flow got halted previously. 
75 | """ 76 | 77 | @spec finish(flow, [atom()]) :: {:ok, map()} | {:error, any()} 78 | 79 | def finish(_flow, keys \\ []) 80 | 81 | def finish(%__MODULE__{state: %{halted: true, error: error}}, _) do 82 | {:error, error} 83 | end 84 | 85 | def finish(%__MODULE__{assigns: assigns}, []), do: {:ok, assigns} 86 | 87 | def finish(%__MODULE__{assigns: assigns}, keys) do 88 | {:ok, Map.take(assigns, keys)} 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /lib/scrape/flow/article.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.Article do 2 | @moduledoc false 3 | 4 | alias Scrape.Flow 5 | alias Scrape.IR.HTML 6 | alias Scrape.IR.Text 7 | 8 | def from_url(url, opts \\ []) do 9 | Flow.start(opts) 10 | |> Flow.assign(url: url) 11 | |> Flow.assign(html: &Scrape.Source.HTTP.get!(&1[:url])) 12 | |> process_html() 13 | end 14 | 15 | def from_file(path, opts \\ []) do 16 | Flow.start(opts) 17 | |> Flow.assign(path: path) 18 | |> Flow.assign(html: &Scrape.Source.Disk.get!(&1[:path])) 19 | |> process_html() 20 | end 21 | 22 | def from_string(html, opts \\ []) do 23 | Flow.start(opts) 24 | |> Flow.assign(html: html) 25 | |> process_html() 26 | end 27 | 28 | defp process_html(%{assigns: %{html: nil}}), do: {:error, :html_invalid} 29 | 30 | defp process_html(%{assigns: %{html: ""}}), do: {:error, :html_invalid} 31 | 32 | defp process_html(flow) do 33 | flow 34 | |> Flow.assign(dom: &Floki.parse(&1[:html])) 35 | |> Flow.assign(title: &HTML.title(&1[:dom])) 36 | |> Flow.assign(image_url: &HTML.image_url(&1[:dom], &1[:url])) 37 | |> Flow.assign(readable_html: &HTML.simple(&1[:dom])) 38 | |> Flow.assign(text: fn %{html: html} -> HTML.content(html) || HTML.sentences(html) end) 39 | |> Flow.assign(language: &Text.detect_language(&1[:text])) 40 | |> Flow.assign(stems: &Text.semantic_keywords(&1[:text], 30, &1[:language])) 41 | |> Flow.assign(summary: &Text.extract_summary(&1[:text], &1[:stems], &1[:language])) 42 | |> Flow.finish([:url, :title, :text, :summary, :language, :stems, :image_url, :readable_html]) 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /lib/scrape/flow/domain.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.Domain do 2 | @moduledoc false 3 | 4 | alias Scrape.Flow 5 | alias Scrape.IR.HTML 6 | 7 | def from_url(url, opts \\ []) do 8 | Flow.start(opts) 9 | |> Flow.assign(url: url) 10 | |> Flow.assign(html: &Scrape.Source.HTTP.get!(&1[:url])) 11 | |> process_html() 12 | end 13 | 14 | def from_file(path, opts \\ []) do 15 | Flow.start(opts) 16 | |> Flow.assign(path: path) 17 | |> Flow.assign(html: &Scrape.Source.Disk.get!(&1[:path])) 18 | |> process_html() 19 | end 20 | 21 | def from_string(html, opts \\ []) do 22 | Flow.start(opts) 23 | |> Flow.assign(html: html) 24 | |> process_html() 25 | end 26 | 27 | defp process_html(%{assigns: %{html: nil}}), do: {:error, :html_invalid} 28 | 29 | defp process_html(%{assigns: %{html: ""}}), do: {:error, :html_invalid} 30 | 31 | defp process_html(flow) do 32 | flow 33 | |> Flow.assign(dom: &Floki.parse(&1[:html])) 34 | |> Flow.assign(title: &HTML.title(&1[:dom])) 35 | |> Flow.assign(description: &HTML.description(&1[:dom])) 36 | |> Flow.assign(icon_url: &HTML.icon_url(&1[:dom], &1[:url])) 37 | |> Flow.assign(feed_urls: &HTML.feed_urls(&1[:dom], &1[:url])) 38 | |> Flow.finish([:url, :title, :description, :icon_url, :feed_urls]) 39 | 
end 40 | end 41 | -------------------------------------------------------------------------------- /lib/scrape/flow/feed.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.Feed do 2 | @moduledoc false 3 | 4 | alias Scrape.Flow 5 | alias Scrape.IR.Feed 6 | 7 | def from_url(url, opts \\ []) do 8 | Flow.start(opts) 9 | |> Flow.assign(url: url) 10 | |> Flow.assign(xml: &Scrape.Source.HTTP.get!(&1[:url])) 11 | |> process_xml() 12 | end 13 | 14 | def from_file(path, opts \\ []) do 15 | Flow.start(opts) 16 | |> Flow.assign(path: path) 17 | |> Flow.assign(url: nil) 18 | |> Flow.assign(xml: &Scrape.Source.Disk.get!(&1[:path])) 19 | |> process_xml() 20 | end 21 | 22 | def from_string(xml, opts \\ []) do 23 | Flow.start(opts) 24 | |> Flow.assign(xml: xml) 25 | |> Flow.assign(url: nil) 26 | |> process_xml() 27 | end 28 | 29 | defp process_xml(%{assigns: %{xml: nil}}), do: {:error, :xml_invalid} 30 | 31 | defp process_xml(%{assigns: %{xml: ""}}), do: {:error, :xml_invalid} 32 | 33 | defp process_xml(flow) do 34 | flow 35 | |> Flow.assign(tree: &Scrape.Tools.Tree.from_xml_string(&1[:xml])) 36 | |> Flow.assign(title: &Feed.title(&1[:tree])) 37 | |> Flow.assign(description: &Feed.description(&1[:tree])) 38 | |> Flow.assign(website_url: &Feed.website_url(&1[:tree])) 39 | |> Flow.assign(items: &items/1) 40 | |> Flow.finish([:url, :title, :description, :website_url, :items]) 41 | end 42 | 43 | defp items(%{tree: tree, url: url}) do 44 | tree 45 | |> Feed.items() 46 | |> Enum.map(fn item -> Scrape.Flow.FeedItem.from_tree(item, url) end) 47 | |> Enum.filter(fn {status, _} -> status == :ok end) 48 | |> Enum.map(&elem(&1, 1)) 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /lib/scrape/flow/feed_item.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.FeedItem do 2 | @moduledoc false 3 | 4 | alias Scrape.Flow 5 | alias Scrape.IR.FeedItem, as: Item 6 | 7 | def from_tree(tree, url, opts \\ []) do 8 | Flow.start(opts) 9 | |> Flow.assign(tree: tree) 10 | |> Flow.assign(url: url) 11 | |> Flow.assign(title: &Item.title(&1[:tree])) 12 | |> Flow.assign(description: &Item.description(&1[:tree])) 13 | |> Flow.assign(article_url: &Item.article_url(&1[:tree], &1[:url])) 14 | |> Flow.assign(tags: &Item.tags(&1[:tree])) 15 | |> Flow.assign(author: &Item.author(&1[:tree])) 16 | |> Flow.assign(image_url: &Item.image_url(&1[:tree], &1[:url])) 17 | |> Flow.finish([:title, :description, :article_url, :tags, :author, :image_url]) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/scrape/ir/feed.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.Feed do 2 | @moduledoc """ 3 | Information Retrieval implementations to extract data from feeds (RSS or Atom). 4 | 5 | Makes intensive use of `Scrape.Tools.Tree` and its functions to operate on 6 | nested maps instead of raw XML documents. 7 | """ 8 | 9 | alias Scrape.Tools.Tree 10 | alias Scrape.Tools.URL 11 | 12 | @doc """ 13 | Extract the (best) title from the feed. 
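For orientation before the selector details below, a hedged sketch of how the feed flow composes end to end (the RSS snippet is invented; the field values follow from the extractors in this module):

    xml = "<rss><channel><title>Example</title><item><title>First</title></item></channel></rss>"

    {:ok, feed} = Scrape.Flow.Feed.from_string(xml)
    feed.title # => "Example"
    feed.items # => one map per <item>, each built via Scrape.Flow.FeedItem.from_tree/2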
14 | 15 | ## Example 16 | iex> Feed.title("<feed><title>abc</title></feed>") 17 | "abc" 18 | """ 19 | 20 | @spec title(String.t() | map()) :: nil | String.t() | map() 21 | 22 | def title(feed) when is_binary(feed) do 23 | feed |> Tree.from_xml_string() |> title() 24 | end 25 | 26 | def title(feed) when is_map(feed) do 27 | Tree.first(feed, ["rss.channel.title", "feed.title"]) 28 | end 29 | 30 | @doc """ 31 | Extract the (best) description from the feed. 32 | 33 | ## Example 34 | iex> Feed.description("<feed><subtitle>abc</subtitle></feed>") 35 | "abc" 36 | """ 37 | 38 | @spec description(String.t() | map()) :: nil | String.t() | map() 39 | 40 | def description(feed) when is_binary(feed) do 41 | feed |> Tree.from_xml_string() |> description() 42 | end 43 | 44 | def description(feed) when is_map(feed) do 45 | Tree.first(feed, [ 46 | "rss.channel.description", 47 | "rss.channel.subtitle", 48 | "feed.description", 49 | "feed.subtitle" 50 | ]) 51 | end 52 | 53 | @doc """ 54 | Extract the website_url from the feed. 55 | 56 | ## Example 57 | iex> Feed.website_url("<rss><channel><link>http://example.com</link></channel></rss>") 58 | "http://example.com" 59 | """ 60 | 61 | @spec website_url(String.t() | map()) :: nil | String.t() | map() 62 | 63 | def website_url(feed) when is_binary(feed) do 64 | feed |> Tree.from_xml_string() |> website_url() 65 | end 66 | 67 | def website_url(feed) when is_map(feed) do 68 | feed 69 | |> Tree.first(["rss.channel.link", "feed.link.href"]) 70 | |> normalize() 71 | end 72 | 73 | defp normalize(nil), do: nil 74 | defp normalize(""), do: nil 75 | defp normalize(url), do: url |> URL.base() 76 | 77 | @doc """ 78 | Returns the list of all feed items. 79 | 80 | ## Example 81 | iex> Feed.items("<rss><channel><item><title>abc</title></item></channel></rss>") 82 | [%{"title" => "abc"}] 83 | """ 84 | 85 | @spec items(String.t() | map()) :: nil | [map()] 86 | 87 | def items(feed) when is_binary(feed) do 88 | feed |> Tree.from_xml_string() |> items() 89 | end 90 | 91 | def items(feed) when is_map(feed) do 92 | Tree.find_all(feed, ["feed.entry", "rss.channel.item"]) 93 | end 94 | end 95 | -------------------------------------------------------------------------------- /lib/scrape/ir/feed_item.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.FeedItem do 2 | @moduledoc """ 3 | Similar to (and used by) `Scrape.IR.Feed`, but has specialized selectors 4 | to extract data from feed items/entries. 5 | """ 6 | 7 | alias Scrape.Tools.Tree 8 | alias Scrape.Tools.URL 9 | 10 | @doc """ 11 | Extract the (best) title from the feed item. 12 | 13 | ## Example 14 | iex> FeedItem.title("<title>abc</title>") 15 | "abc" 16 | """ 17 | 18 | @spec title(String.t() | map()) :: nil | String.t() 19 | 20 | def title(tree) when is_binary(tree) do 21 | tree |> Tree.from_xml_string() |> title() 22 | end 23 | 24 | def title(tree) when is_map(tree) do 25 | tree 26 | |> Tree.first(["title"]) 27 | |> normalize_to_string() 28 | end 29 | 30 | @doc """ 31 | Extract the (best) description from the feed item. 32 | 33 | ## Example 34 | iex> FeedItem.description("<description>abc</description>") 35 | "abc" 36 | """ 37 | 38 | @spec description(String.t() | map()) :: nil | String.t() 39 | 40 | def description(tree) when is_binary(tree) do 41 | tree |> Tree.from_xml_string() |> description() 42 | end 43 | 44 | def description(tree) when is_map(tree) do 45 | tree 46 | |> Tree.first(["description", "summary", "content"]) 47 | |> normalize_to_string() 48 | end 49 | 50 | @doc """ 51 | Extract the article_url from the feed item. 
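The selector lists passed to `Tree.first/2` are what let one accessor handle both RSS and Atom; a minimal illustration (XML snippets invented, assuming the `Tree` behavior shown in its own doctests):

    alias Scrape.Tools.Tree

    rss = Tree.from_xml_string("<rss><channel><title>a</title></channel></rss>")
    atom = Tree.from_xml_string("<feed><title>b</title></feed>")

    # Tree.first/2 tries the selectors in order and stops at the first hit
    Tree.first(rss, ["rss.channel.title", "feed.title"])  # => "a"
    Tree.first(atom, ["rss.channel.title", "feed.title"]) # => "b"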
52 | 53 | ## Example 54 | iex> FeedItem.article_url("<link href='http://example.com'/>") 55 | "http://example.com" 56 | 57 | iex> FeedItem.article_url("<link href='/url'/>", "http://example.com") 58 | "http://example.com/url" 59 | """ 60 | 61 | @spec article_url(String.t() | map(), nil | String.t()) :: nil | String.t() 62 | 63 | def article_url(tree, url \\ "") 64 | 65 | def article_url(tree, url) when is_binary(tree) do 66 | tree |> Tree.from_xml_string() |> article_url(url) 67 | end 68 | 69 | def article_url(tree, url) when is_map(tree) do 70 | tree 71 | |> Tree.first(["link.href", "link"]) 72 | |> normalize_to_string() 73 | |> normalize_url(url) 74 | end 75 | 76 | @doc """ 77 | Extract the possible tags from the feed item. 78 | 79 | ## Example 80 | iex> FeedItem.tags("<category>abc</category>") 81 | ["abc"] 82 | 83 | iex> FeedItem.tags("") 84 | [] 85 | """ 86 | 87 | @spec tags(String.t() | map()) :: [String.t()] 88 | 89 | def tags(tree) when is_binary(tree) do 90 | tree |> Tree.from_xml_string() |> tags() 91 | end 92 | 93 | def tags(tree) when is_map(tree) do 94 | tree 95 | |> Tree.find("category") 96 | |> List.wrap() 97 | |> Enum.map(&normalize_to_string/1) 98 | |> Enum.reject(&is_nil/1) 99 | |> Enum.map(&Scrape.IR.Text.clean/1) 100 | |> Enum.map(&String.downcase/1) 101 | end 102 | 103 | @doc """ 104 | Extract the author from the feed item. 105 | 106 | ## Example 107 | iex> FeedItem.author("<author>abc</author>") 108 | "abc" 109 | """ 110 | 111 | @spec author(String.t() | map()) :: nil | String.t() 112 | 113 | def author(tree) when is_binary(tree) do 114 | tree |> Tree.from_xml_string() |> author() 115 | end 116 | 117 | def author(tree) when is_map(tree) do 118 | tree 119 | |> Tree.first(["~creator", "author.name", "author"]) 120 | |> normalize_to_string() 121 | end 122 | 123 | @doc """ 124 | Extract the image_url from the feed item. 125 | 126 | ## Example 127 | iex> FeedItem.image_url("<enclosure url='abc' />") 128 | "abc" 129 | """ 130 | 131 | @spec image_url(String.t() | map(), nil | String.t()) :: nil | String.t() 132 | 133 | def image_url(tree, url \\ "") 134 | 135 | def image_url(tree, url) when is_binary(tree) do 136 | tree |> Tree.from_xml_string() |> image_url(url) 137 | end 138 | 139 | def image_url(tree, url) when is_map(tree) do 140 | tree 141 | |> Tree.first(["enclosure.url", "media.content"]) 142 | |> normalize_to_string() 143 | |> inline_image(tree) 144 | |> normalize_url(url) 145 | end 146 | 147 | defp inline_image(nil, %{"content" => content}) do 148 | rx = ~r/\ssrc=["']*(([^'"\s]+)\.(jpe?g)|(png))["'\s]/i 149 | 150 | case Regex.run(rx, content, capture: :all_but_first) do 151 | [match] when is_binary(match) -> match 152 | [match | _] when is_binary(match) -> match 153 | _ -> nil 154 | end 155 | end 156 | 157 | defp inline_image(img, _), do: img 158 | 159 | # ensure that a value is either a string or nil, but nothing else 160 | defp normalize_to_string(value) when is_binary(value), do: value 161 | defp normalize_to_string(_), do: nil 162 | 163 | # merge a relative url into an absolute url if possible 164 | defp normalize_url(link, url) when is_binary(url), do: URL.merge(link, url) 165 | defp normalize_url(link, _), do: link 166 | end 167 | -------------------------------------------------------------------------------- /lib/scrape/ir/html.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.HTML do 2 | @moduledoc """ 3 | Information Retrieval functions for extracting data out of HTML documents. 4 | 5 | Makes extensive use of `Scrape.Tools.DOM` under the hood, so a customized 6 | jQuery-like approach can be taken. 
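Before the individual extractors, a hedged taste of that jQuery-like style (markup invented; the suffix stripping is performed by `strip_suffix/1` defined below):

    dom = Scrape.Tools.DOM.from_string("<title>Breaking News | Example Portal</title>")
    Scrape.IR.HTML.title(dom)
    # => "Breaking News", the "| Example Portal" branding suffix is cut off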
7 | """ 8 | 9 | alias Scrape.Tools.DOM 10 | alias Scrape.Tools.URL 11 | 12 | @doc """ 13 | Extract the best possible title from an HTML document (string or DOM) or nil. 14 | 15 | ## Examples 16 | iex> HTML.title("") 17 | nil 18 | 19 | iex> HTML.title("<title>abc</title>") 20 | "abc" 21 | """ 22 | 23 | @spec title(DOM.dom()) :: nil | String.t() 24 | 25 | @title_queries [ 26 | {"meta[property='og:title']", "content"}, 27 | {"meta[property='twitter:title']", "content"}, 28 | {"h1"}, 29 | {"title"} 30 | ] 31 | 32 | def title(dom) do 33 | case DOM.first(dom, @title_queries) do 34 | nil -> nil 35 | match -> strip_suffix(match) 36 | end 37 | end 38 | 39 | defp strip_suffix(value) do 40 | rx = ~r/\s[|-].{1}.+$/ 41 | 42 | case String.match?(value, rx) do 43 | true -> value |> String.split(rx) |> List.first() 44 | false -> value 45 | end 46 | end 47 | 48 | @doc """ 49 | Extract the best possible description from an HTML document or nil. 50 | 51 | ## Examples 52 | iex> HTML.description("") 53 | nil 54 | 55 | iex> HTML.description("<meta name='description' content='abc' />") 56 | "abc" 57 | """ 58 | 59 | @spec description(DOM.dom() | String.t()) :: nil | String.t() 60 | 61 | @description_queries [ 62 | {"meta[property='og:description']", "content"}, 63 | {"meta[name='twitter:description']", "content"}, 64 | {"meta[name='description']", "content"} 65 | ] 66 | 67 | def description(dom) do 68 | DOM.first(dom, @description_queries) 69 | end 70 | 71 | @doc """ 72 | Attempts to find the best image_url for the website or nil. 73 | 74 | If a root url is given, will transform relative images to absolute urls. 75 | 76 | ## Examples 77 | iex> HTML.image_url("") 78 | nil 79 | 80 | iex> HTML.image_url("<meta property='og:image' content='img.jpg' />") 81 | "img.jpg" 82 | """ 83 | @spec image_url(DOM.dom()) :: nil | String.t() 84 | @spec image_url(DOM.dom(), String.t()) :: nil | String.t() 85 | 86 | @image_url_queries [ 87 | {"meta[property='og:image']", "content"}, 88 | {"meta[name='twitter:image']", "content"} 89 | ] 90 | 91 | def image_url(dom, url \\ "") do 92 | case DOM.first(dom, @image_url_queries) do 93 | nil -> nil 94 | match -> URL.merge(match, url) 95 | end 96 | end 97 | 98 | @doc """ 99 | Attempts to find something resembling a favicon url or nil. 100 | 101 | If a root url is given, will transform relative images to absolute urls. 102 | 103 | ## Examples 104 | iex> HTML.icon_url("") 105 | nil 106 | 107 | iex> HTML.icon_url("<link rel='icon' href='img.jpg' />") 108 | "img.jpg" 109 | """ 110 | 111 | @spec icon_url(DOM.dom()) :: nil | String.t() 112 | @spec icon_url(DOM.dom(), String.t()) :: nil | String.t() 113 | 114 | @icon_url_queries [ 115 | {"link[rel='apple-touch-icon']", "href"}, 116 | {"link[rel='apple-touch-icon-precomposed']", "href"}, 117 | {"link[rel='shortcut icon']", "href"}, 118 | {"link[rel='icon']", "href"} 119 | ] 120 | 121 | def icon_url(dom, url \\ "") do 122 | case DOM.first(dom, @icon_url_queries) do 123 | nil -> nil 124 | match -> URL.merge(match, url) 125 | end 126 | end 127 | 128 | @doc """ 129 | Attempts to fetch all possible feed_urls from the given HTML document. 
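Two sources feed into this list, as the private helpers below show: `<link>` meta tags and a regex sweep over inline hrefs. A hedged sketch (markup invented):

    html =
      "<head><link rel='alternate' type='application/rss+xml' href='/feed.rss'/></head>" <>
      "<body><a href='/blog/feed.xml'>RSS</a></body>"

    html
    |> Floki.parse()
    |> HTML.feed_urls("https://example.com")
    # => candidates from both sources, absolutized and deduplicated, roughly:
    #    ["https://example.com/feed.rss", "https://example.com/blog/feed.xml"]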
130 | 131 | ## Examples 132 | iex> HTML.feed_urls("") 133 | [] 134 | 135 | iex> HTML.feed_urls("<link rel='alternate' href='/feed.rss' />") 136 | ["/feed.rss"] 137 | """ 138 | 139 | @spec feed_urls(DOM.dom()) :: [String.t()] 140 | @spec feed_urls(DOM.dom(), String.t()) :: [String.t()] 141 | 142 | def feed_urls(dom, url \\ "") do 143 | list = feed_meta_tag(dom) ++ feed_inline(dom) 144 | 145 | list 146 | |> Enum.filter(&URL.is_http?(&1)) 147 | |> Enum.map(&URL.merge(&1, url)) 148 | |> Enum.uniq() 149 | end 150 | 151 | defp feed_meta_tag(dom) do 152 | selector = """ 153 | link[type='application/rss+xml'], 154 | link[type='application/atom+xml'], 155 | link[rel='alternate'] 156 | """ 157 | 158 | DOM.attrs(dom, selector, "href") 159 | end 160 | 161 | defp feed_inline(dom) do 162 | rx = ~r{href=['"]([^'"]*(rss|atom|feed|xml)[^'"]*)['"]} 163 | str = Floki.raw_html(dom) 164 | matches = Regex.scan(rx, str, capture: :all_but_first) 165 | Enum.map(matches, &List.first/1) 166 | end 167 | 168 | @doc """ 169 | Try to extract the semantically relevant part from a given document. 170 | 171 | 172 | Uses the [Readability](https://hex.pm/packages/readability) algorithm, which 173 | might fail sometimes. Ideally, it returns a single string containing full 174 | sentences. Remember that this method uses a few heuristics that *somehow* 175 | work together nicely in many cases, but nothing more. 176 | """ 177 | 178 | def simple(dom) do 179 | try do 180 | dom 181 | |> Floki.raw_html() 182 | |> Readability.article() 183 | |> Readability.readable_html() 184 | |> String.replace(~r/<a[^>]*>(.*?)<\/a>/, "\\1") 185 | rescue 186 | _ -> nil 187 | end 188 | end 189 | 190 | @doc """ 191 | Try to extract the relevant text content from a given document. 192 | 193 | Uses the [Readability](https://hex.pm/packages/readability) algorithm, which 194 | might fail sometimes. Ideally, it returns a single string containing full 195 | sentences. Remember that this method uses a few heuristics that *somehow* 196 | work together nicely in many cases, but nothing more. 197 | """ 198 | 199 | @spec content(DOM.dom()) :: nil | String.t() 200 | 201 | def content(dom) do 202 | try do 203 | dom 204 | |> Readability.article() 205 | |> Floki.filter_out("figure") 206 | |> Readability.readable_text() 207 | |> String.replace(~r/\s+/, " ") 208 | |> String.replace(~r/(\s\S+[a-zäöüß]+)([A-ZÄÖÜ]\S+\s)/u, "\\1. \\2") 209 | rescue 210 | _ -> nil 211 | end 212 | end 213 | 214 | @doc """ 215 | Convenient fallback function if `content/1` didn't work. Uses `paragraphs/1` 216 | under the hood. 217 | """ 218 | 219 | @spec sentences(DOM.dom()) :: nil | String.t() 220 | 221 | def sentences(dom) do 222 | case paragraphs(dom) do 223 | [] -> nil 224 | list -> Enum.join(list, ".\n\n") 225 | end 226 | end 227 | 228 | @doc """ 229 | Attempt to find the most meaningful content snippets in the HTML document. 230 | 231 | Can be used as a fallback algorithm if `content/1` did return nil but *some* 232 | text corpus is needed to work with. 233 | 234 | A text paragraph is relevant if it has a minimum amount of characters and 235 | contains any indicators of a sentence-like structure. 236 | Very naive approach, but works surprisingly well so far. 
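To make the fallback chain concrete: the article flow tries `content/1` first and degrades to `sentences/1`, which builds on `paragraphs/1`. A hedged sketch (markup invented):

    html = "<body><div>Too short.</div><div>This paragraph is long enough to matter. It has sentence structure, too.</div></body>"

    HTML.paragraphs(html)
    # => ["This paragraph is long enough to matter. It has sentence structure, too."]
    # the first div fails the >30 chars check, the second contains ". " and passes

    # which is exactly how the article flow wires it up:
    text = HTML.content(html) || HTML.sentences(html)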
237 | """ 238 | 239 | @spec paragraphs(DOM.dom()) :: [String.t()] 240 | 241 | def paragraphs(dom) do 242 | dom 243 | |> Floki.find("article, p, div, body") 244 | |> Enum.map(&Floki.text(&1, deep: false)) 245 | |> Enum.map(&Scrape.IR.Text.normalize_whitespace/1) 246 | |> Enum.filter(&paragraph_is_relevant?/1) 247 | end 248 | 249 | defp paragraph_is_relevant?(paragraph) do 250 | String.length(paragraph) > 30 && 251 | String.contains?(paragraph, [". ", "? ", "! ", "\" ", "\", ", ": "]) 252 | end 253 | end 254 | -------------------------------------------------------------------------------- /lib/scrape/ir/text.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.Text do 2 | @moduledoc """ 3 | Collection of text mining algorithms, like summarization, classification and 4 | clustering. 5 | 6 | Details are hidden within the algorithms, so a clean interface can be provided. 7 | """ 8 | 9 | alias Scrape.IR.Text.TFIDF 10 | alias Scrape.Tools.Word 11 | 12 | @doc false 13 | def generate_summary(text) do 14 | # TODO: my markov chain implementation belongs here. 15 | text 16 | end 17 | 18 | @doc """ 19 | Dissect a text into sentences, weight their stemmed keywords against each other and 20 | return the 3 semantically most important sentences. 21 | """ 22 | 23 | def extract_summary(text, start_words, language \\ :en) do 24 | text 25 | |> TFIDF.generate_database(language) 26 | |> TFIDF.query(start_words) 27 | end 28 | 29 | @doc """ 30 | Find out in which natural language the given text is written. 31 | 32 | Currently only German and (fallback) English are valid results. Uses the external 33 | library [Paasaa](https://hex.pm/packages/paasaa). 34 | 35 | ## Example 36 | iex> Scrape.IR.Text.detect_language("the quick brown fox jumps over...") 37 | :en 38 | 39 | iex> Scrape.IR.Text.detect_language("Es ist ein schönes Wetter heute...") 40 | :de 41 | """ 42 | 43 | @spec detect_language(String.t()) :: :de | :en 44 | 45 | def detect_language(text) do 46 | case Paasaa.detect(text) do 47 | "deu" -> :de 48 | _ -> :en 49 | end 50 | end 51 | 52 | @doc """ 53 | Remove all occurrences of javascript from an HTML snippet. 54 | 55 | Uses a regex (!) 56 | 57 | ## Example 58 | iex> Scrape.IR.Text.without_js("a<script>b</script>c") 59 | "ac" 60 | """ 61 | 62 | @spec without_js(String.t()) :: String.t() 63 | 64 | def without_js(text) do 65 | rx = ~r/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/i 66 | String.replace(text, rx, "") 67 | end 68 | 69 | @doc """ 70 | Strip all HTML tags from a text. 71 | 72 | ## Example 73 | iex> Scrape.IR.Text.without_html("
<div>stuff</div>
") 74 | "stuff" 75 | """ 76 | 77 | @spec without_html(String.t()) :: String.t() 78 | 79 | def without_html(text) do 80 | text 81 | |> Floki.parse() 82 | |> Floki.text() 83 | end 84 | 85 | @doc """ 86 | A text paragraph shall not include any whitespace except single spaces 87 | between words. 88 | 89 | ## Example 90 | iex> Scrape.IR.Text.normalize_whitespace("\\r\\thello world\\r ") 91 | "hello world" 92 | """ 93 | 94 | @spec normalize_whitespace(String.t()) :: String.t() 95 | 96 | def normalize_whitespace(text) do 97 | text 98 | |> String.replace(~r/\s+/, " ") 99 | |> String.replace(~r/\s+/, " ") 100 | |> String.trim() 101 | end 102 | 103 | @doc """ 104 | Removes all junk from a given text, like javascript, html or mixed whitespace. 105 | 106 | ## Example 107 | iex> Scrape.IR.Text.clean("\\t hello, \\rworld!") 108 | "hello, world!" 109 | """ 110 | def clean(text) do 111 | text 112 | |> without_js() 113 | |> without_html() 114 | |> normalize_whitespace() 115 | end 116 | 117 | @doc """ 118 | Dissect a text into word tokens. 119 | 120 | The resulting list is a list of downcased words with all non-word-characters 121 | stripped. 122 | 123 | ## Examples 124 | iex> Scrape.IR.Text.tokenize("Hello, world!") 125 | ["hello", "world"] 126 | """ 127 | 128 | @spec tokenize(String.t()) :: [String.t()] 129 | 130 | def tokenize(text) do 131 | text 132 | |> String.replace(~r/[^\w\s]/u, " ") 133 | |> normalize_whitespace() 134 | |> String.downcase() 135 | |> String.split() 136 | end 137 | 138 | @doc """ 139 | Dissect a text into word tokens. 140 | 141 | The resulting list is a list of downcased words with all non-word-characters 142 | stripped, but common phrase delimiters still included. 143 | 144 | ## Examples 145 | iex> Scrape.IR.Text.tokenize_preserve_delimiters("Hello, world!") 146 | ["hello", ",", "world", "!"] 147 | """ 148 | 149 | @spec tokenize_preserve_delimiters(String.t()) :: [String.t()] 150 | 151 | def tokenize_preserve_delimiters(text) do 152 | text 153 | |> String.replace(~r/([,\.\!\?])/u, " \\1 ") 154 | |> String.replace(~r/[^\w\s,\.\!\?]/u, " ") 155 | |> normalize_whitespace() 156 | |> String.downcase() 157 | |> String.split() 158 | end 159 | 160 | @doc """ 161 | Dissect a text into word tokens similar to `tokenize/1` but strips words 162 | that carry no semantic value. 163 | 164 | ## Examples 165 | iex> Scrape.IR.Text.semantic_tokenize("A beautiful day!", :en) 166 | ["beautiful", "day"] 167 | """ 168 | 169 | @spec semantic_tokenize(String.t(), :de | :en) :: [String.t()] 170 | 171 | def semantic_tokenize(text, language \\ :en) do 172 | text 173 | |> tokenize() 174 | |> Enum.filter(fn word -> Word.is_meaningful?(word, language) end) 175 | end 176 | 177 | @doc """ 178 | Similar to `semantic_tokenize/2`, but also determines the n (default: 20) 179 | most relevant **stemmed** tokens from the list. 
180 | """ 181 | 182 | def semantic_keywords(text, n \\ 20, language \\ :en) do 183 | text 184 | |> semantic_tokenize(language) 185 | |> Enum.map(&Word.stem(&1, language)) 186 | |> Enum.reduce(%{}, &aggregate_word_scores/2) 187 | |> Map.to_list() 188 | |> Enum.sort_by(fn {_word, score} -> score end, &>=/2) 189 | |> Enum.take(n) 190 | |> Enum.map(&elem(&1, 0)) 191 | end 192 | 193 | defp aggregate_word_scores(word, acc) do 194 | existing = Map.get(acc, word, 0) 195 | Map.put(acc, word, existing + 1) 196 | end 197 | end 198 | -------------------------------------------------------------------------------- /lib/scrape/ir/text/rake.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.Text.RAKE do 2 | alias Scrape.IR.Text 3 | alias Scrape.Tools.Word 4 | 5 | def sample_text() do 6 | """ 7 | Compatibility of systems of linear constraints over the set of natural numbers 8 | 9 | Criteria of compatibility of a system of linear Diophantine equations, strict inequations, 10 | and nonstrict inequations are considered. Upper bounds for components of a minimal set of 11 | solutions and algorithms of construction of minimal generating sets of solutions for all 12 | types of systems are given. These criteria and the corresponding algorithms for constructing 13 | a minimal supporting set of solutions can be used in solving all the considered types of 14 | systems and systems of mixed types. 15 | """ 16 | end 17 | 18 | def execute(text, language \\ :en) do 19 | text 20 | |> Text.tokenize_preserve_delimiters() 21 | |> calculate_candidates(language) 22 | |> calculate_keyword_scores() 23 | end 24 | 25 | defp calculate_candidates(tokens, language) do 26 | calculate_candidates(tokens, language, [], []) 27 | end 28 | 29 | defp calculate_candidates([], _language, candidates, current_candidate) do 30 | candidates 31 | |> Kernel.++([current_candidate]) 32 | |> List.flatten() 33 | |> Enum.filter(&(String.length(String.trim(&1)) > 0)) 34 | end 35 | 36 | defp calculate_candidates([token | tokens], language, candidates, current_candidate) do 37 | if Word.is_stopword?(token, language) || token in [",", ".", "?", "!"] do 38 | calculate_candidates( 39 | tokens, 40 | language, 41 | candidates ++ [current_candidate |> Enum.join(" ")], 42 | [] 43 | ) 44 | else 45 | calculate_candidates(tokens, language, candidates, current_candidate ++ [token]) 46 | end 47 | end 48 | 49 | defp calculate_keyword_scores(candidates) do 50 | words = candidates |> Enum.map(&String.split(&1, " ")) |> List.flatten() |> Enum.uniq() 51 | len = length(words) 52 | 53 | word_index = 54 | 0..len |> Stream.zip(words) |> Enum.map(fn {k, v} -> {v, k} end) |> Enum.into(%{}) 55 | 56 | table = :ets.new(:co_occurence_matrix, [:set]) 57 | 58 | for candidate <- candidates do 59 | chunks = String.split(candidate, " ") 60 | 61 | Enum.each(chunks, fn chunk -> 62 | i = word_index[chunk] 63 | value = matrix_value(table, i, i) 64 | :ets.insert(table, {{i, i}, value + 1}) 65 | end) 66 | 67 | if length(chunks) > 1 do 68 | Enum.each(permutations(chunks), fn words -> 69 | i1 = word_index[Enum.at(words, 0)] 70 | i2 = word_index[Enum.at(words, 1)] 71 | value = matrix_value(table, i1, i2) 72 | :ets.insert(table, {{i1, i2}, value + 1}) 73 | end) 74 | end 75 | end 76 | 77 | word_scores = 78 | words 79 | |> Enum.map(fn word -> {word, matrix_row(table, word_index[word], len)} end) 80 | |> Enum.into(%{}) 81 | 82 | candidates 83 | |> Enum.uniq() 84 | |> Enum.map(fn candidate -> 85 | chunks = candidate |> String.split(" ") 86 | 
score = chunks |> Enum.map(&word_scores[&1]) |> Enum.sum() 87 | {candidate, score} 88 | end) 89 | |> Enum.sort_by(fn {_candidate, score} -> score end, &>=/2) 90 | |> Enum.take(length(words) |> Integer.floor_div(3)) 91 | |> Enum.map(fn {candidate, _score} -> candidate end) 92 | end 93 | 94 | defp matrix_value(table, i1, i2) do 95 | case :ets.lookup(table, {i1, i2}) do 96 | [] -> 0 97 | [{_, value}] -> value 98 | end 99 | end 100 | 101 | defp matrix_row(table, index, max) do 102 | 0..max 103 | |> Enum.map(fn i -> matrix_value(table, i, index) end) 104 | |> Enum.sum() 105 | end 106 | 107 | defp permutations(list), do: for(x <- list, y <- list, x != y, do: [x, y]) 108 | end 109 | -------------------------------------------------------------------------------- /lib/scrape/ir/text/tfidf.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.Text.TFIDF do 2 | @moduledoc false 3 | 4 | defstruct [:text, :language, :corpus, :idf] 5 | 6 | def generate_database(text, language) do 7 | %__MODULE__{text: text, language: language} 8 | |> create_corpus() 9 | |> tokenize_sentences() 10 | |> calculate_idf_scores() 11 | end 12 | 13 | def query(%__MODULE__{corpus: corpus} = data, words) do 14 | idf = 15 | words 16 | |> Enum.map(fn word -> {word, calculate_inverse_document_frequency(data, word)} end) 17 | |> Enum.into(%{}) 18 | 19 | find_id = fn excludes -> find_best_sentence_id(corpus, idf, words, excludes) end 20 | s1_id = find_id.([]) 21 | s1_sentence = corpus[s1_id] 22 | 23 | s2_id = find_id.(s1_sentence.words) 24 | s2_sentence = corpus[s2_id] 25 | 26 | s3_id = find_id.(List.flatten([s1_sentence.words, s2_sentence.words])) 27 | s3_sentence = corpus[s3_id] 28 | 29 | [s1_sentence, s2_sentence, s3_sentence] 30 | |> Enum.map(fn %{sentence: sentence} -> sentence end) 31 | |> Enum.map(fn sentence -> sentence <> "." end) 32 | |> Enum.join(" ") 33 | end 34 | 35 | defp find_best_sentence_id(corpus, idf, words, blacklist) do 36 | for {id, %{tf: tf}} <- corpus do 37 | score = 38 | tf 39 | |> Enum.filter(fn {word, _} -> word in words end) 40 | |> Enum.filter(fn {word, _} -> word not in blacklist end) 41 | |> Enum.map(fn {word, value} -> value * idf[word] end) 42 | |> Enum.sum() 43 | 44 | {id, score} 45 | end 46 | |> Enum.sort_by(fn {_id, score} -> score end, &>=/2) 47 | |> List.first() 48 | |> elem(0) 49 | end 50 | 51 | defp create_corpus(%__MODULE__{text: text} = data) do 52 | corpus = 53 | text 54 | |> String.replace(~r/(\s\S+[a-zäöüß]+)([A-ZÄÖÜ]\S+\s)/u, "\\1. 
\\2") 55 | |> String.split(~r/[\?!\.\s]\s/) 56 | |> Enum.map(&String.trim/1) 57 | |> Enum.map(&String.replace(&1, ~r/\.+$/, "")) 58 | |> Enum.reject(fn sentence -> String.length(sentence) < 3 end) 59 | |> Enum.uniq() 60 | |> Enum.with_index() 61 | |> Enum.map(fn {sentence, i} -> {i, %{sentence: sentence}} end) 62 | |> Enum.into(%{}) 63 | 64 | %{data | corpus: corpus} 65 | end 66 | 67 | defp tokenize_sentences(%__MODULE__{corpus: corpus, language: language} = data) do 68 | updated_corpus = 69 | for {id, %{sentence: sentence} = document} <- corpus do 70 | words = tokenize(sentence, language) 71 | 72 | updated_document = 73 | document 74 | |> Map.put(:words, words) 75 | |> Map.put(:tf, calculate_term_frequency(words)) 76 | 77 | {id, updated_document} 78 | end 79 | |> Enum.into(%{}) 80 | 81 | %{data | corpus: updated_corpus} 82 | end 83 | 84 | defp calculate_idf_scores(%__MODULE__{corpus: corpus} = data) do 85 | idf = 86 | corpus 87 | |> Enum.map(fn {_id, %{words: words}} -> words end) 88 | |> List.flatten() 89 | |> Enum.uniq() 90 | |> Enum.map(fn word -> {word, calculate_inverse_document_frequency(data, word)} end) 91 | |> Enum.into(%{}) 92 | 93 | %{data | idf: idf} 94 | end 95 | 96 | defp calculate_term_frequency(list) do 97 | len = length(list) 98 | 99 | list 100 | |> Enum.group_by(& &1) 101 | |> Enum.map(fn {word, occurences} -> {word, length(occurences) / len} end) 102 | |> Enum.into(%{}) 103 | end 104 | 105 | defp calculate_inverse_document_frequency(%__MODULE__{corpus: corpus}, word) do 106 | num_docs = corpus |> Map.keys() |> length 107 | 108 | num_hits = 109 | corpus |> Map.values() |> Enum.filter(fn %{words: words} -> word in words end) |> length 110 | 111 | if num_hits == 0, do: 0, else: :math.log(num_docs / num_hits) 112 | end 113 | 114 | defp tokenize(str, language) do 115 | str 116 | |> String.replace(~r/[^\w\s]/u, "") 117 | |> Scrape.IR.Text.semantic_tokenize(language) 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /lib/scrape/options.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Options do 2 | @moduledoc false 3 | 4 | @defaults num_stems: 30 5 | 6 | def merge(opts \\ []) do 7 | Keyword.merge(@defaults, opts) 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /lib/scrape/source/disk.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Source.Disk do 2 | @moduledoc """ 3 | Abstraction over the native `File` functions. Currently without additional logic. 4 | """ 5 | 6 | @doc """ 7 | Same as `File.read/1`. 8 | """ 9 | def get(path) do 10 | File.read(path) 11 | end 12 | 13 | @doc """ 14 | Same as `File.read!/1`. 15 | """ 16 | def get!(path) do 17 | File.read!(path) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/scrape/source/http.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Source.HTTP do 2 | alias Scrape.Source.HTTP.Charset 3 | alias Scrape.Source.HTTP.Get 4 | alias Scrape.Source.HTTP.Transcode 5 | 6 | @doc """ 7 | Perform a HTTP GET request against the given url. 8 | 9 | This function is optimized for *text*-based data, not binary like images. 10 | It will try to normalize the response into valid utf-8 and transcode if needed. 11 | 12 | Everything that is not a status code 200 with valid encoding will result in 13 | some error object. 
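Callers should be prepared for all three result shapes of `get/1`; a hedged sketch (URL invented):

    case Scrape.Source.HTTP.get("http://example.com/feed.xml") do
      {:ok, body} -> body                   # status 200, body already transcoded to utf-8 if needed
      {:http_error, body} -> {:skip, body}  # any non-200 response
      {:error, reason} -> {:retry, reason}  # transport failure straight from HTTPoison
    end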
14 | 15 | ## Examples 16 | iex> HTTP.get("http://example.com") 17 | {:ok, "some response"} 18 | """ 19 | 20 | @spec get(String.t()) :: {:ok, String.t()} | {:error, any()} | {:http_error, any()} 21 | 22 | def get(url) do 23 | url |> Get.execute() |> evaluate() 24 | end 25 | 26 | @doc """ 27 | Same as `get/1`, but will raise if the result is not `:ok`. 28 | """ 29 | 30 | @spec get!(String.t()) :: String.t() 31 | 32 | def get!(url) do 33 | {:ok, data} = get(url) 34 | data 35 | end 36 | 37 | defp evaluate({:error, _} = response), do: response 38 | 39 | defp evaluate({:ok, %{status_code: 200, headers: headers, body: body}}) do 40 | case Charset.from_headers(headers) do 41 | nil -> {:ok, body} 42 | charset -> {:ok, Transcode.execute(charset, body)} 43 | end 44 | end 45 | 46 | defp evaluate({:ok, %{body: body}}), do: {:http_error, body} 47 | 48 | defp evaluate(response), do: response 49 | end 50 | -------------------------------------------------------------------------------- /lib/scrape/source/http/charset.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Source.HTTP.Charset do 2 | @moduledoc false 3 | 4 | def from_headers(headers) do 5 | header = 6 | headers 7 | |> Enum.filter(fn {k, _} -> k == "Content-Type" end) 8 | |> first 9 | 10 | if header do 11 | {_name, content} = header 12 | 13 | ~r/charset=(ISO-8859-[1-9])/i 14 | |> Regex.run(content, capture: :all_but_first) 15 | |> first 16 | else 17 | nil 18 | end 19 | end 20 | 21 | defp first([h | _]), do: h 22 | defp first(_), do: nil 23 | end 24 | -------------------------------------------------------------------------------- /lib/scrape/source/http/get.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Source.HTTP.Get do 2 | @moduledoc false 3 | 4 | @opts [ 5 | follow_redirect: true, 6 | timeout: 33_000, 7 | recv_timeout: 30_000, 8 | ssl: [{:versions, [:"tlsv1.2"]}] 9 | ] 10 | 11 | @headers [ 12 | "user-agent": 13 | "Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36", 14 | accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" 15 | ] 16 | 17 | def execute(url, http_headers \\ @headers, http_opts \\ @opts) do 18 | HTTPoison.get(url, http_headers, http_opts) 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/scrape/source/http/transcode.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Source.HTTP.Transcode do 2 | @moduledoc false 3 | 4 | def execute(charset, text) do 5 | encoding = charset_to_encoding(charset) 6 | {_status, result} = Codepagex.to_string(text, encoding) 7 | result 8 | end 9 | 10 | defp charset_to_encoding(charset) do 11 | charset 12 | |> String.replace("-", "_") 13 | |> String.downcase() 14 | |> to_charlist 15 | |> List.to_atom() 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/scrape/tools/dom.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.DOM do 2 | @moduledoc """ 3 | Utility module for selecting/extracting data from a "DOM" (HTML/XML tree-like 4 | structure). Can find text values and attribute values, inspired by jQuery and 5 | implemented with Floki. 6 | """ 7 | 8 | @typedoc """ 9 | DOM tree representation, same as Floki's html_tree. 10 | 11 | Can be created via `from_string/1`. 
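For orientation, a hedged mini-session with the helpers defined below (markup invented):

    dom = DOM.from_string("<html><head><meta name='a' content='b'/></head><body><h1>hi</h1></body></html>")

    DOM.text(dom, "h1")              # => "hi"
    DOM.attr(dom, "meta", "content") # => "b"
    DOM.first(dom, [{"h2"}, {"h1"}]) # => "hi", the first query that yields a value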
12 | """ 13 | 14 | @type dom :: String.t() | tuple() | [any()] 15 | 16 | @doc """ 17 | Create a DOM from a given (HTML/XML) string. 18 | 19 | ## Examples 20 | iex> DOM.from_string("") 21 | [] 22 | 23 | iex> DOM.from_string("<html></html>") 24 | {"html", [], []} 25 | """ 26 | 27 | @spec from_string(String.t()) :: dom 28 | 29 | def from_string(string) do 30 | Floki.parse(string) 31 | end 32 | 33 | @doc """ 34 | Builds a (HTML/XML) string from a DOM structure. 35 | 36 | ## Examples 37 | iex> DOM.to_string([]) 38 | "" 39 | 40 | iex> DOM.to_string({"html", [], []}) 41 | "<html></html>" 42 | """ 43 | 44 | @spec to_string(dom) :: String.t() 45 | 46 | def to_string(dom) do 47 | case dom do 48 | dom when is_tuple(dom) or is_list(dom) -> Floki.raw_html(dom) 49 | _ -> "" 50 | end 51 | end 52 | 53 | @doc """ 54 | Get the text value of a DOM node (including nested nodes). 55 | 56 | If many nodes match the selector, the first one is used. 57 | 58 | ## Examples 59 | iex> "<div>abc</div>" |> DOM.from_string() |> DOM.text("p") 60 | nil 61 | 62 | iex> "<div>abc</div>" |> DOM.from_string() |> DOM.text("div") 63 | "abc" 64 | """ 65 | 66 | @spec text(dom, String.t()) :: nil | String.t() 67 | 68 | def text(dom, selector) do 69 | dom 70 | |> Floki.find(selector) 71 | |> List.first() 72 | |> get_text() 73 | |> unwrap_string() 74 | end 75 | 76 | @doc """ 77 | Similar to `text/2` but iterates over all matching nodes. 78 | 79 | Always returns a list result, but with nil values filtered. 80 | 81 | ## Examples 82 | iex> "<div>abc</div>" |> DOM.from_string() |> DOM.texts("p") 83 | [] 84 | 85 | iex> "<div>abc</div>" |> DOM.from_string() |> DOM.texts("div") 86 | ["abc"] 87 | 88 | iex> "<div><p>a</p><p>b</p></div>" |> DOM.from_string() |> DOM.texts("p") 89 | ["a", "b"] 90 | """ 91 | 92 | @spec texts(dom, String.t()) :: [String.t()] 93 | 94 | def texts(dom, selector) do 95 | dom 96 | |> Floki.find(selector) 97 | |> Enum.map(&get_text/1) 98 | |> Enum.map(&unwrap_string/1) 99 | |> Enum.reject(&is_nil/1) 100 | |> List.wrap() 101 | end 102 | 103 | @doc """ 104 | Similar to `text/2` but returns a chosen attribute value instead of the 105 | node's text value (or nil). 106 | 107 | ## Examples 108 | iex> "<meta name='a' content='b'/>" |> DOM.from_string |> DOM.attr("meta", "unknown") 109 | nil 110 | 111 | iex> "<meta name='a' content='b'/>" |> DOM.from_string |> DOM.attr("meta", "content") 112 | "b" 113 | 114 | iex> "<meta name='a' content='b'/>" |> DOM.from_string |> DOM.attr("meta[name=a]", "content") 115 | "b" 116 | """ 117 | 118 | @spec attr(dom, String.t(), String.t()) :: nil | String.t() 119 | 120 | def attr(dom, selector, name) do 121 | dom 122 | |> Floki.find(selector) 123 | |> List.first() 124 | |> get_attr(name) 125 | |> unwrap_string() 126 | end 127 | 128 | @doc """ 129 | Similar to `attr/3` but returns a list of all matching results. 130 | 131 | ## Examples 132 | iex> "<div><p class='a'>b</p><p class='c'>b</p></div>" |> DOM.from_string() |> DOM.attrs("div", "class") 133 | [] 134 | 135 | iex> "<div><p class='a'>b</p><p class='c'>b</p></div>" |> DOM.from_string() |> DOM.attrs("p", "id") 136 | [] 137 | 138 | iex> "<div><p class='a'>b</p><p class='c'>b</p></div>" |> DOM.from_string() |> DOM.attrs("p", "class") 139 | ["a", "c"] 140 | """ 141 | 142 | @spec attrs(dom, String.t(), String.t()) :: [String.t()] 143 | 144 | def attrs(dom, selector, name) do 145 | dom 146 | |> Floki.find(selector) 147 | |> Enum.map(&get_attr(&1, name)) 148 | |> Enum.map(&unwrap_string/1) 149 | |> Enum.reject(&is_nil/1) 150 | end 151 | 152 | @doc """ 153 | Cascading query helper, applies either `text/2` or `attr/3` until something 154 | returns a non-nil result or all queries are tried. 155 | 156 | ## Examples 157 | iex> DOM.first([], []) 158 | nil 159 | 160 | iex> DOM.first([], [{"b"}, {"i"}, {"div", "class"}]) 161 | nil 162 | 163 | iex> "<div id='1'>abc</div>" |> DOM.from_string() |> DOM.first([{"i"}, {"div", "id"}]) 164 | "1" 165 | 166 | iex> "<b>abc</b>" |> DOM.from_string() |> DOM.first([{"i"}, {"b"}]) 167 | "abc" 168 | """ 169 | 170 | @spec first(dom, [{String.t()} | {String.t(), String.t()}]) :: nil | String.t() 171 | 172 | def first(_dom, []), do: nil 173 | 174 | def first(dom, [{selector} | queries]) do 175 | case text(dom, selector) do 176 | nil -> first(dom, queries) 177 | string -> string 178 | end 179 | end 180 | 181 | def first(dom, [{selector, name} | queries]) do 182 | case attr(dom, selector, name) do 183 | nil -> first(dom, queries) 184 | string -> string 185 | end 186 | end 187 | 188 | defp get_text(nil), do: "" 189 | defp get_text(value), do: Floki.text(value) 190 | 191 | defp get_attr(nil, _name), do: nil 192 | defp get_attr(elem, name), do: elem |> Floki.attribute(name) |> List.first() 193 | 194 | defp unwrap_string(value) when not is_binary(value), do: nil 195 | defp unwrap_string(""), do: nil 196 | defp unwrap_string(value), do: value 197 | end 198 | -------------------------------------------------------------------------------- /lib/scrape/tools/tree.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.Tree do 2 | @moduledoc """ 3 | Utility module for interacting with nested Map structures, here called "tree". 4 | """ 5 | 6 | @doc """ 7 | Transform a given xml string into a tree. 8 | 9 | The string must be utf-8 encoded. It will be sanitized via Floki and the xml 10 | declaration header will be stripped. 11 | 12 | ## Examples 13 | iex> Tree.from_xml_string("<feed><item>abc</item></feed>") 14 | %{"feed" => %{"item" => "abc"}} 15 | """ 16 | 17 | @spec from_xml_string(String.t()) :: map() 18 | 19 | def from_xml_string(nil), do: %{} 20 | 21 | def from_xml_string(xml) do 22 | xml 23 | |> String.replace(~r/<\?xml.*?>/i, "") 24 | |> Floki.parse() 25 | |> Floki.raw_html() 26 | |> String.trim() 27 | |> try_build_tree() 28 | |> try_normalize() 29 | end 30 | 31 | defp try_build_tree(""), do: %{} 32 | 33 | defp try_build_tree(xml) do 34 | try do 35 | # XMap.from_xml(xml) 36 | XmlToMap.naive_map(xml) 37 | rescue 38 | _ -> %{} 39 | end 40 | end 41 | 42 | defp try_normalize(map) do 43 | case Morphix.compactiform(map) do 44 | {:ok, tree} -> tree 45 | _ -> map 46 | end 47 | end 48 | 49 | @doc """ 50 | Attempts all queries until one returns a non-nil result, otherwise returns nil. 51 | 52 | ## Examples 53 | iex> Tree.first(%{"hello" => "world"}, ["unknown"]) 54 | nil 55 | 56 | iex> Tree.first(%{"hello" => "world"}, ["unknown", "hello"]) 57 | "world" 58 | """ 59 | 60 | @spec first(map(), [String.t()]) :: nil | any() 61 | 62 | def first(_tree, []), do: nil 63 | 64 | def first(tree, [selector | queries]) do 65 | case find(tree, selector) do 66 | nil -> first(tree, queries) 67 | [] -> first(tree, queries) 68 | [match] -> match 69 | [match | _] -> match 70 | match -> match 71 | end 72 | end 73 | 74 | @doc """ 75 | Applies `find/2` to all given selectors and combines the result. 
76 | 77 | ## Examples 78 | iex> Tree.find_all(%{"a" => "b", "c" => "d"}, ["a", "c"]) 79 | ["b", "d"] 80 | 81 | iex> Tree.find_all(%{"a" => "b", "c" => "d"}, ["a", "z"]) 82 | ["b"] 83 | 84 | iex> Tree.find_all(%{"a" => "b", "c" => "d"}, ["x", "y"]) 85 | [] 86 | """ 87 | 88 | @spec find_all(map(), [String.t()]) :: [any()] 89 | 90 | def find_all(_tree, []), do: [] 91 | 92 | def find_all(tree, selectors) do 93 | selectors 94 | |> Enum.map(&find(tree, &1)) 95 | |> normalize() 96 | end 97 | 98 | @doc """ 99 | Attempts to get a nested value from the tree using a string selector syntax. 100 | 101 | Returns nil if nothing matches the selector or all matching results. 102 | 103 | ## Examples 104 | iex> Tree.find(%{"a" => %{"b" => "c"}}, "a") 105 | %{"b" => "c"} 106 | 107 | iex> Tree.find(%{"a" => %{"b" => "c"}}, "a.b") 108 | "c" 109 | 110 | iex> Tree.find(%{"a" => %{"b" => "c"}}, "unknown") 111 | nil 112 | 113 | iex> Tree.find(%{"a" => [%{"b" => "c"}]}, "a.b") 114 | ["c"] 115 | 116 | iex> Tree.find(%{"a" => [%{"b" => [%{"c" => "d"}]}]}, "a.b.c") 117 | ["d"] 118 | 119 | iex> Tree.find(%{"a" => [%{"b" => "c"}, %{"b" => "c"}]}, "a.b") 120 | ["c", "c"] 121 | 122 | iex> Tree.find(%{"a" => [%{"b" => [%{"c" => "d"}]}]}, "a.*.c") 123 | ["d"] 124 | 125 | iex> Tree.find(%{"hello" => "world"}, "~ell") 126 | ["world"] 127 | """ 128 | 129 | @spec find(map(), String.t()) :: any() 130 | 131 | def find(tree, selector) when is_map(tree) and is_binary(selector) do 132 | tree 133 | |> pick(String.split(selector, ".")) 134 | |> normalize() 135 | end 136 | 137 | defp pick(nil, _), do: nil 138 | defp pick(n, []), do: n 139 | defp pick(n, keys) when is_list(n), do: Enum.map(n, &pick(&1, keys)) 140 | defp pick(n, _) when not is_map(n), do: nil 141 | defp pick(n, ["*" | t]), do: n |> Map.values() |> Enum.map(&pick(&1, t)) 142 | 143 | defp pick(n, ["~" <> pattern = _h | t]) do 144 | n 145 | |> Map.keys() 146 | |> Enum.filter(&String.contains?(&1, pattern)) 147 | |> Enum.map(&Map.get(n, &1)) 148 | |> Enum.map(&pick(&1, t)) 149 | end 150 | 151 | defp pick(n, [h | t]) do 152 | case Map.get(n, h) do 153 | nil -> nil 154 | sub -> pick(sub, t) 155 | end 156 | end 157 | 158 | defp normalize(value) when not is_list(value), do: value 159 | 160 | defp normalize(value) when is_list(value) do 161 | value 162 | |> List.flatten() 163 | |> Enum.reject(&is_nil/1) 164 | end 165 | end 166 | -------------------------------------------------------------------------------- /lib/scrape/tools/url.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.URL do 2 | @moduledoc """ 3 | Simple utility functions to extract information from URLs. 4 | """ 5 | 6 | @doc """ 7 | Rebase an URL to another root URL, useful for turning relative URLs into 8 | absolute ones. 9 | 10 | ## Example 11 | iex> URL.merge("/path", "http://example.com") 12 | "http://example.com/path" 13 | """ 14 | 15 | @spec merge(nil | String.t(), String.t()) :: nil | String.t() 16 | 17 | def merge(nil, _), do: nil 18 | 19 | def merge("", _), do: nil 20 | 21 | def merge(url, nil), do: url 22 | 23 | def merge(url, ""), do: url 24 | 25 | def merge(url, root_url) do 26 | root_url |> URI.merge(url) |> URI.to_string() 27 | end 28 | 29 | @doc """ 30 | Checks if a given string actually represents an URL. 
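These helpers are the glue that turns scraped hrefs into absolute URLs elsewhere in the codebase; a short hedged sketch:

    URL.is_http?("/feed.rss")                       # => true, absolute paths count as candidates
    URL.merge("/feed.rss", "https://example.com/x") # => "https://example.com/feed.rss"
    URL.base("https://example.com/deep/path?q=1")   # => "https://example.com"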
31 | 32 | ## Example 33 | iex> URL.is_http?("http://example.com") 34 | true 35 | 36 | iex> URL.is_http?("example") 37 | false 38 | """ 39 | 40 | @spec is_http?(String.t()) :: boolean() 41 | 42 | def is_http?(url) do 43 | ["http", "/"] 44 | |> Enum.any?(&String.starts_with?(url, &1)) 45 | end 46 | 47 | @doc """ 48 | Transforms a given url into its basic form, only including protocol scheme 49 | and host, without any other parts like path, query or hash. 50 | 51 | ## Examples 52 | iex> URL.base("https://example.com/path?param=1#search") 53 | "https://example.com" 54 | 55 | iex> URL.base("//example.com") 56 | "http://example.com" 57 | """ 58 | 59 | @spec base(String.t()) :: String.t() 60 | 61 | def base(url) do 62 | uri = URI.parse(url) 63 | scheme = uri.scheme || "http" 64 | host = uri.host 65 | "#{scheme}://#{host}" 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/scrape/tools/word.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.Word do 2 | @moduledoc """ 3 | Algorithms to extract information from single words. 4 | """ 5 | 6 | @stemmer_codes %{ 7 | :de => :german2, 8 | :en => :english 9 | } 10 | 11 | @doc """ 12 | Extract the stem of a given word. 13 | 14 | Uses the snowball algorithm under the hood via the library 15 | [Stemex](https://hex.pm/packages/stemex), which in turn uses NIFs for raw 16 | speed. Currently only German and English are supported. 17 | 18 | ## Example 19 | iex> Word.stem("beautiful", :en) 20 | "beauti" 21 | 22 | iex> Word.stem("derbsten", :de) 23 | "derb" 24 | """ 25 | 26 | @spec stem(String.t(), :de | :en) :: String.t() 27 | 28 | def stem(word, language \\ :en) 29 | def stem(nil, _), do: nil 30 | def stem("", _), do: "" 31 | 32 | def stem(word, language) do 33 | try do 34 | apply(Stemex, @stemmer_codes[language], [word]) 35 | rescue 36 | _ -> word 37 | end 38 | end 39 | 40 | @doc """ 41 | Check if a given word is a stopword against the provided language lists. 42 | 43 | Note: the provided language lists are all-downcased words. 44 | 45 | ## Examples 46 | iex> Word.is_stopword?("when", :en) 47 | true 48 | 49 | iex> Word.is_stopword?("linux", :en) 50 | false 51 | 52 | iex> Word.is_stopword?("ein", :de) 53 | true 54 | 55 | iex> Word.is_stopword?("elixir", :de) 56 | false 57 | """ 58 | 59 | @spec is_stopword?(String.t(), :de | :en) :: boolean() 60 | 61 | defdelegate is_stopword?(word, language \\ :en), 62 | to: Scrape.Tools.Word.IsStopword, 63 | as: :execute 64 | 65 | @doc """ 66 | Determine if a given word might be relevant for analytical purposes. 67 | 68 | Uses a simple heuristic and checks for stopword matches. 
69 | 70 | ## Examples 71 | iex> Word.is_meaningful?("a", :en) 72 | false 73 | 74 | iex> Word.is_meaningful?("apple", :en) 75 | true 76 | """ 77 | 78 | @spec is_meaningful?(String.t(), :de | :en) :: boolean() 79 | 80 | def is_meaningful?(word, language \\ :en) do 81 | String.length(word) > 2 and String.match?(word, ~r/^[\p{L}\p{M}\w]+$/u) and 82 | not is_stopword?(word, language) 83 | end 84 | end 85 | -------------------------------------------------------------------------------- /lib/scrape/tools/word/is_stopword.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.Word.IsStopword do 2 | @moduledoc false 3 | 4 | def execute(word, language \\ :en) 5 | 6 | for file <- File.ls!(Path.join([__DIR__, "stopwords"])) do 7 | language = file |> Path.basename(".txt") |> String.to_atom() 8 | 9 | for line <- File.stream!(Path.join([__DIR__, "stopwords", file]), [], :line) do 10 | word = String.trim(line) 11 | def execute(unquote(word), unquote(language)), do: true 12 | end 13 | end 14 | 15 | def execute(_, _), do: false 16 | end 17 | -------------------------------------------------------------------------------- /lib/scrape/tools/word/stopwords/de.txt: -------------------------------------------------------------------------------- 1 | ab 2 | aber 3 | abgerufen 4 | abgerufene 5 | abgerufener 6 | abgerufenes 7 | acht 8 | alle 9 | allein 10 | allem 11 | allen 12 | aller 13 | allerdings 14 | allerlei 15 | alles 16 | allgemein 17 | allmählich 18 | allzu 19 | als 20 | alsbald 21 | ander 22 | andere 23 | anderem 24 | anderen 25 | anderer 26 | andererseits 27 | anderes 28 | anderm 29 | andern 30 | andernfalls 31 | anders 32 | anerkannt 33 | anerkannte 34 | anerkannter 35 | anerkanntes 36 | anfangen 37 | anfing 38 | angefangen 39 | angesetze 40 | angesetzt 41 | angesetzten 42 | angesetzter 43 | ansetzen 44 | anstatt 45 | arbeiten 46 | auch 47 | auf 48 | aufgehört 49 | aufgrund 50 | aufhören 51 | aufhörte 52 | aufzusuchen 53 | aus 54 | ausdrücken 55 | ausdrückt 56 | ausdrückte 57 | ausgenommen 58 | ausser 59 | ausserdem 60 | author 61 | autor 62 | außen 63 | außer 64 | außerdem 65 | außerhalb 66 | bald 67 | bearbeite 68 | bearbeiten 69 | bearbeitete 70 | bearbeiteten 71 | bedarf 72 | bedurfte 73 | bedürfen 74 | befragen 75 | befragte 76 | befragten 77 | befragter 78 | begann 79 | beginnen 80 | begonnen 81 | behalten 82 | behielt 83 | bei 84 | beide 85 | beiden 86 | beiderlei 87 | beides 88 | beim 89 | beinahe 90 | beitragen 91 | beitrugen 92 | bekannt 93 | bekannte 94 | bekannter 95 | bekennen 96 | benutzt 97 | bereits 98 | berichten 99 | berichtet 100 | berichtete 101 | berichteten 102 | besonders 103 | besser 104 | bestehen 105 | besteht 106 | beträchtlich 107 | bevor 108 | bezüglich 109 | bietet 110 | bin 111 | bis 112 | bisher 113 | bislang 114 | bist 115 | bleiben 116 | blieb 117 | bloss 118 | bloß 119 | brachte 120 | brachten 121 | brauchen 122 | braucht 123 | bringen 124 | bräuchte 125 | bsp. 126 | bzw 127 | böden 128 | ca. 
129 | da 130 | dabei 131 | dadurch 132 | dafür 133 | dagegen 134 | daher 135 | dahin 136 | damals 137 | damit 138 | danach 139 | daneben 140 | dank 141 | danke 142 | danken 143 | dann 144 | dannen 145 | daran 146 | darauf 147 | daraus 148 | darf 149 | darfst 150 | darin 151 | darum 152 | darunter 153 | darüber 154 | darüberhinaus 155 | das 156 | dass 157 | dasselbe 158 | davon 159 | davor 160 | dazu 161 | daß 162 | dein 163 | deine 164 | deinem 165 | deinen 166 | deiner 167 | deines 168 | dem 169 | demnach 170 | demselben 171 | den 172 | denen 173 | denn 174 | dennoch 175 | denselben 176 | der 177 | derart 178 | derartig 179 | derem 180 | deren 181 | derer 182 | derjenige 183 | derjenigen 184 | derselbe 185 | derselben 186 | derzeit 187 | des 188 | deshalb 189 | desselben 190 | dessen 191 | desto 192 | deswegen 193 | dich 194 | die 195 | diejenige 196 | dies 197 | diese 198 | dieselbe 199 | dieselben 200 | diesem 201 | diesen 202 | dieser 203 | dieses 204 | diesseits 205 | dinge 206 | dir 207 | direkt 208 | direkte 209 | direkten 210 | direkter 211 | doch 212 | doppelt 213 | dort 214 | dorther 215 | dorthin 216 | drauf 217 | drei 218 | dreißig 219 | drin 220 | dritte 221 | drunter 222 | drüber 223 | du 224 | dunklen 225 | durch 226 | durchaus 227 | durfte 228 | durften 229 | dürfen 230 | dürfte 231 | eben 232 | ebenfalls 233 | ebenso 234 | ehe 235 | eher 236 | eigenen 237 | eigenes 238 | eigentlich 239 | ein 240 | einbaün 241 | eine 242 | einem 243 | einen 244 | einer 245 | einerseits 246 | eines 247 | einfach 248 | einführen 249 | einführte 250 | einführten 251 | eingesetzt 252 | einig 253 | einige 254 | einigem 255 | einigen 256 | einiger 257 | einigermaßen 258 | einiges 259 | einmal 260 | eins 261 | einseitig 262 | einseitige 263 | einseitigen 264 | einseitiger 265 | einst 266 | einstmals 267 | einzig 268 | ende 269 | entsprechend 270 | entweder 271 | er 272 | ergänze 273 | ergänzen 274 | ergänzte 275 | ergänzten 276 | erhalten 277 | erhielt 278 | erhielten 279 | erhält 280 | erneut 281 | erst 282 | erste 283 | ersten 284 | erster 285 | eröffne 286 | eröffnen 287 | eröffnet 288 | eröffnete 289 | eröffnetes 290 | es 291 | etliche 292 | etwa 293 | etwas 294 | euch 295 | euer 296 | eure 297 | eurem 298 | euren 299 | eurer 300 | eures 301 | fall 302 | falls 303 | fand 304 | fast 305 | ferner 306 | finden 307 | findest 308 | findet 309 | folgende 310 | folgenden 311 | folgender 312 | folgendes 313 | folglich 314 | fordern 315 | fordert 316 | forderte 317 | forderten 318 | fortsetzen 319 | fortsetzt 320 | fortsetzte 321 | fortsetzten 322 | fragte 323 | frau 324 | frei 325 | freie 326 | freier 327 | freies 328 | fuer 329 | fünf 330 | für 331 | gab 332 | ganz 333 | ganze 334 | ganzem 335 | ganzen 336 | ganzer 337 | ganzes 338 | gar 339 | gbr 340 | geb 341 | geben 342 | geblieben 343 | gebracht 344 | gedurft 345 | geehrt 346 | geehrte 347 | geehrten 348 | geehrter 349 | gefallen 350 | gefiel 351 | gefälligst 352 | gefällt 353 | gegeben 354 | gegen 355 | gehabt 356 | gehen 357 | geht 358 | gekommen 359 | gekonnt 360 | gemacht 361 | gemocht 362 | gemäss 363 | genommen 364 | genug 365 | gern 366 | gesagt 367 | gesehen 368 | gestern 369 | gestrige 370 | getan 371 | geteilt 372 | geteilte 373 | getragen 374 | gewesen 375 | gewissermaßen 376 | gewollt 377 | geworden 378 | ggf 379 | gib 380 | gibt 381 | gleich 382 | gleichwohl 383 | gleichzeitig 384 | glücklicherweise 385 | gmbh 386 | gratulieren 387 | gratuliert 388 | gratulierte 389 | gute 390 | guten 391 | gängig 392 | gängige 393 | gängigen 394 | 
gängiger 395 | gängiges 396 | gänzlich 397 | hab 398 | habe 399 | haben 400 | haette 401 | halb 402 | hallo 403 | hast 404 | hat 405 | hatte 406 | hatten 407 | hattest 408 | hattet 409 | hen 410 | heraus 411 | heute 412 | heutige 413 | hier 414 | hiermit 415 | hiesige 416 | hin 417 | hinein 418 | hinten 419 | hinter 420 | hinterher 421 | hoch 422 | hundert 423 | hätt 424 | hätte 425 | hätten 426 | höchstens 427 | ich 428 | igitt 429 | ihm 430 | ihn 431 | ihnen 432 | ihr 433 | ihre 434 | ihrem 435 | ihren 436 | ihrer 437 | ihres 438 | immer 439 | immerhin 440 | indem 441 | indessen 442 | info 443 | infolge 444 | innen 445 | innerhalb 446 | ins 447 | insofern 448 | inzwischen 449 | irgend 450 | irgendeine 451 | irgendwas 452 | irgendwen 453 | irgendwer 454 | irgendwie 455 | irgendwo 456 | ist 457 | ja 458 | je 459 | jede 460 | jedem 461 | jeden 462 | jedenfalls 463 | jeder 464 | jederlei 465 | jedes 466 | jedoch 467 | jemand 468 | jene 469 | jenem 470 | jenen 471 | jener 472 | jenes 473 | jenseits 474 | jetzt 475 | jährig 476 | jährige 477 | jährigen 478 | jähriges 479 | kam 480 | kann 481 | kannst 482 | kaum 483 | kein 484 | keine 485 | keinem 486 | keinen 487 | keiner 488 | keinerlei 489 | keines 490 | keineswegs 491 | klar 492 | klare 493 | klaren 494 | klares 495 | klein 496 | kleinen 497 | kleiner 498 | kleines 499 | koennen 500 | koennt 501 | koennte 502 | koennten 503 | komme 504 | kommen 505 | kommt 506 | konkret 507 | konkrete 508 | konkreten 509 | konkreter 510 | konkretes 511 | konnte 512 | konnten 513 | könn 514 | können 515 | könnt 516 | könnte 517 | könnten 518 | künftig 519 | lag 520 | lagen 521 | langsam 522 | lassen 523 | laut 524 | lediglich 525 | leer 526 | legen 527 | legte 528 | legten 529 | leicht 530 | leider 531 | lesen 532 | letze 533 | letzten 534 | letztendlich 535 | letztens 536 | letztes 537 | letztlich 538 | lichten 539 | liegt 540 | liest 541 | links 542 | längst 543 | längstens 544 | mache 545 | machen 546 | machst 547 | macht 548 | machte 549 | machten 550 | mag 551 | magst 552 | mal 553 | man 554 | manche 555 | manchem 556 | manchen 557 | mancher 558 | mancherorts 559 | manches 560 | manchmal 561 | mann 562 | margin 563 | mehr 564 | mehrere 565 | mein 566 | meine 567 | meinem 568 | meinen 569 | meiner 570 | meines 571 | meist 572 | meiste 573 | meisten 574 | meta 575 | mich 576 | mindestens 577 | mir 578 | mit 579 | mithin 580 | mochte 581 | morgen 582 | morgige 583 | muessen 584 | muesst 585 | muesste 586 | muss 587 | musst 588 | musste 589 | mussten 590 | muß 591 | mußt 592 | möchte 593 | möchten 594 | möchtest 595 | mögen 596 | möglich 597 | mögliche 598 | möglichen 599 | möglicher 600 | möglicherweise 601 | müssen 602 | müsste 603 | müssten 604 | müßt 605 | müßte 606 | nach 607 | nachdem 608 | nacher 609 | nachhinein 610 | nacht 611 | nahm 612 | natürlich 613 | neben 614 | nebenan 615 | nehmen 616 | nein 617 | neu 618 | neue 619 | neuem 620 | neuen 621 | neuer 622 | neues 623 | neun 624 | nicht 625 | nichts 626 | nie 627 | niemals 628 | niemand 629 | nimm 630 | nimmer 631 | nimmt 632 | nirgends 633 | nirgendwo 634 | noch 635 | nun 636 | nur 637 | nutzen 638 | nutzt 639 | nutzung 640 | nächste 641 | nämlich 642 | nötigenfalls 643 | nützt 644 | ob 645 | oben 646 | oberhalb 647 | obgleich 648 | obschon 649 | obwohl 650 | oder 651 | oft 652 | ohne 653 | pfui 654 | plötzlich 655 | pro 656 | reagiere 657 | reagieren 658 | reagiert 659 | reagierte 660 | rechts 661 | regelmäßig 662 | rief 663 | rund 664 | sage 665 | sagen 666 | sagt 667 | sagte 668 | sagten 669 
| sagtest 670 | sang 671 | sangen 672 | schlechter 673 | schließlich 674 | schnell 675 | schon 676 | schreibe 677 | schreiben 678 | schreibens 679 | schreiber 680 | schwierig 681 | schätzen 682 | schätzt 683 | schätzte 684 | schätzten 685 | sechs 686 | sect 687 | sehe 688 | sehen 689 | sehr 690 | sehrwohl 691 | seht 692 | sei 693 | seid 694 | sein 695 | seine 696 | seinem 697 | seinen 698 | seiner 699 | seines 700 | seit 701 | seitdem 702 | seite 703 | seiten 704 | seither 705 | selber 706 | selbst 707 | senke 708 | senken 709 | senkt 710 | senkte 711 | senkten 712 | setzen 713 | setzt 714 | setzte 715 | setzten 716 | sich 717 | sicher 718 | sicherlich 719 | sie 720 | sieben 721 | siebte 722 | siehe 723 | sieht 724 | sind 725 | singen 726 | singt 727 | sobald 728 | sodaß 729 | soeben 730 | sofern 731 | sofort 732 | sog 733 | sogar 734 | solange 735 | solc 736 | solch 737 | solche 738 | solchem 739 | solchen 740 | solcher 741 | solches 742 | soll 743 | sollen 744 | sollst 745 | sollt 746 | sollte 747 | sollten 748 | solltest 749 | somit 750 | sondern 751 | sonst 752 | sonstwo 753 | sooft 754 | soviel 755 | soweit 756 | sowie 757 | sowohl 758 | spielen 759 | später 760 | startet 761 | startete 762 | starteten 763 | statt 764 | stattdessen 765 | steht 766 | steige 767 | steigen 768 | steigt 769 | stets 770 | stieg 771 | stiegen 772 | suchen 773 | sämtliche 774 | tages 775 | tat 776 | tatsächlich 777 | tatsächlichen 778 | tatsächlicher 779 | tatsächliches 780 | tausend 781 | teile 782 | teilen 783 | teilte 784 | teilten 785 | titel 786 | total 787 | trage 788 | tragen 789 | trotzdem 790 | trug 791 | trägt 792 | tun 793 | tust 794 | tut 795 | txt 796 | tät 797 | ueber 798 | um 799 | umso 800 | unbedingt 801 | und 802 | ungefähr 803 | unmöglich 804 | unmögliche 805 | unmöglichen 806 | unmöglicher 807 | unnötig 808 | uns 809 | unse 810 | unsem 811 | unsen 812 | unser 813 | unsere 814 | unserem 815 | unseren 816 | unserer 817 | unseres 818 | unserm 819 | unses 820 | unten 821 | unter 822 | unterbrach 823 | unterbrechen 824 | unterhalb 825 | unwichtig 826 | usw 827 | vergangen 828 | vergangene 829 | vergangener 830 | vergangenes 831 | vermag 832 | vermutlich 833 | vermögen 834 | verrate 835 | verraten 836 | verriet 837 | verrieten 838 | version 839 | versorge 840 | versorgen 841 | versorgt 842 | versorgte 843 | versorgten 844 | versorgtes 845 | veröffentlichen 846 | veröffentlicher 847 | veröffentlicht 848 | veröffentlichte 849 | veröffentlichten 850 | veröffentlichtes 851 | viel 852 | viele 853 | vielen 854 | vieler 855 | vieles 856 | vielleicht 857 | vielmals 858 | vier 859 | vollständig 860 | vom 861 | von 862 | vor 863 | voran 864 | vorbei 865 | vorgestern 866 | vorher 867 | vorne 868 | vorüber 869 | völlig 870 | wachen 871 | waere 872 | wann 873 | war 874 | waren 875 | warst 876 | warum 877 | weder 878 | weg 879 | wegen 880 | weil 881 | weiter 882 | weitere 883 | weiterem 884 | weiteren 885 | weiterer 886 | weiteres 887 | weiterhin 888 | weiß 889 | welche 890 | welchem 891 | welchen 892 | welcher 893 | welches 894 | wem 895 | wen 896 | wenig 897 | wenige 898 | weniger 899 | wenigstens 900 | wenn 901 | wenngleich 902 | wer 903 | werde 904 | werden 905 | werdet 906 | weshalb 907 | wessen 908 | wichtig 909 | wie 910 | wieder 911 | wieso 912 | wieviel 913 | wiewohl 914 | will 915 | willst 916 | wir 917 | wird 918 | wirklich 919 | wirst 920 | wo 921 | wodurch 922 | wogegen 923 | woher 924 | wohin 925 | wohingegen 926 | wohl 927 | wohlweislich 928 | wolle 929 | wollen 930 | wollt 931 | wollte 932 | 
wollten 933 | wolltest 934 | wolltet 935 | womit 936 | woraufhin 937 | woraus 938 | worin 939 | wurde 940 | wurden 941 | während 942 | währenddessen 943 | wär 944 | wäre 945 | wären 946 | würde 947 | würden 948 | z.B. 949 | zahlreich 950 | zehn 951 | zeitweise 952 | ziehen 953 | zieht 954 | zog 955 | zogen 956 | zu 957 | zudem 958 | zuerst 959 | zufolge 960 | zugleich 961 | zuletzt 962 | zum 963 | zumal 964 | zur 965 | zurück 966 | zusammen 967 | zuviel 968 | zwanzig 969 | zwar 970 | zwei 971 | zwischen 972 | zwölf 973 | ähnlich 974 | übel 975 | über 976 | überall 977 | überallhin 978 | überdies 979 | übermorgen 980 | übrig 981 | übrigens -------------------------------------------------------------------------------- /lib/scrape/tools/word/stopwords/en.txt: -------------------------------------------------------------------------------- 1 | 'll 2 | a 3 | able 4 | about 5 | above 6 | abst 7 | accordance 8 | according 9 | accordingly 10 | across 11 | act 12 | actually 13 | added 14 | adj 15 | affected 16 | affecting 17 | affects 18 | after 19 | afterwards 20 | again 21 | against 22 | ah 23 | all 24 | almost 25 | alone 26 | along 27 | already 28 | also 29 | although 30 | always 31 | am 32 | among 33 | amongst 34 | an 35 | and 36 | announce 37 | another 38 | any 39 | anybody 40 | anyhow 41 | anymore 42 | anyone 43 | anything 44 | anyway 45 | anyways 46 | anywhere 47 | apparently 48 | approximately 49 | are 50 | aren 51 | arent 52 | arise 53 | around 54 | as 55 | aside 56 | ask 57 | asking 58 | at 59 | auth 60 | available 61 | away 62 | awfully 63 | b 64 | back 65 | be 66 | became 67 | because 68 | become 69 | becomes 70 | becoming 71 | been 72 | before 73 | beforehand 74 | begin 75 | beginning 76 | beginnings 77 | begins 78 | behind 79 | being 80 | believe 81 | below 82 | beside 83 | besides 84 | between 85 | beyond 86 | biol 87 | both 88 | brief 89 | briefly 90 | but 91 | by 92 | c 93 | ca 94 | came 95 | can 96 | can't 97 | cannot 98 | cause 99 | causes 100 | certain 101 | certainly 102 | co 103 | com 104 | come 105 | comes 106 | contain 107 | containing 108 | contains 109 | could 110 | couldnt 111 | d 112 | date 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | don't 121 | done 122 | down 123 | downwards 124 | due 125 | during 126 | e 127 | each 128 | ed 129 | edu 130 | effect 131 | eg 132 | eight 133 | eighty 134 | either 135 | else 136 | elsewhere 137 | end 138 | ending 139 | enough 140 | especially 141 | et 142 | et-al 143 | etc 144 | even 145 | ever 146 | every 147 | everybody 148 | everyone 149 | everything 150 | everywhere 151 | ex 152 | except 153 | f 154 | far 155 | few 156 | ff 157 | fifth 158 | first 159 | five 160 | fix 161 | followed 162 | following 163 | follows 164 | for 165 | former 166 | formerly 167 | forth 168 | found 169 | four 170 | from 171 | further 172 | furthermore 173 | g 174 | gave 175 | get 176 | gets 177 | getting 178 | give 179 | given 180 | gives 181 | giving 182 | go 183 | goes 184 | gone 185 | got 186 | gotten 187 | h 188 | had 189 | happens 190 | hardly 191 | has 192 | hasn't 193 | have 194 | haven't 195 | having 196 | he 197 | hed 198 | hence 199 | her 200 | here 201 | hereafter 202 | hereby 203 | herein 204 | heres 205 | hereupon 206 | hers 207 | herself 208 | hes 209 | hi 210 | hid 211 | him 212 | himself 213 | his 214 | hither 215 | home 216 | how 217 | howbeit 218 | however 219 | hundred 220 | i 221 | i'll 222 | i've 223 | id 224 | ie 225 | if 226 | im 227 | immediate 228 | immediately 229 | importance 230 | 
important 231 | in 232 | inc 233 | indeed 234 | index 235 | information 236 | instead 237 | into 238 | invention 239 | inward 240 | is 241 | isn't 242 | it 243 | it'll 244 | itd 245 | its 246 | itself 247 | j 248 | just 249 | k 250 | keep 251 | keeps 252 | kept 253 | kg 254 | km 255 | know 256 | known 257 | knows 258 | l 259 | largely 260 | last 261 | lately 262 | later 263 | latter 264 | latterly 265 | least 266 | less 267 | lest 268 | let 269 | lets 270 | like 271 | liked 272 | likely 273 | line 274 | little 275 | look 276 | looking 277 | looks 278 | ltd 279 | m 280 | made 281 | mainly 282 | make 283 | makes 284 | many 285 | may 286 | maybe 287 | me 288 | mean 289 | means 290 | meantime 291 | meanwhile 292 | merely 293 | mg 294 | might 295 | million 296 | miss 297 | ml 298 | more 299 | moreover 300 | most 301 | mostly 302 | mr 303 | mrs 304 | much 305 | mug 306 | must 307 | my 308 | myself 309 | n 310 | na 311 | name 312 | namely 313 | nay 314 | nd 315 | near 316 | nearly 317 | necessarily 318 | necessary 319 | need 320 | needs 321 | neither 322 | never 323 | nevertheless 324 | new 325 | next 326 | nine 327 | ninety 328 | no 329 | nobody 330 | non 331 | none 332 | nonetheless 333 | noone 334 | nor 335 | normally 336 | nos 337 | not 338 | noted 339 | nothing 340 | now 341 | nowhere 342 | o 343 | obtain 344 | obtained 345 | obviously 346 | of 347 | off 348 | often 349 | oh 350 | ok 351 | okay 352 | old 353 | omitted 354 | on 355 | once 356 | one 357 | ones 358 | only 359 | onto 360 | or 361 | ord 362 | other 363 | others 364 | otherwise 365 | ought 366 | our 367 | ours 368 | ourselves 369 | out 370 | outside 371 | over 372 | overall 373 | owing 374 | own 375 | p 376 | page 377 | pages 378 | part 379 | particular 380 | particularly 381 | past 382 | per 383 | perhaps 384 | placed 385 | please 386 | plus 387 | poorly 388 | possible 389 | possibly 390 | potentially 391 | pp 392 | predominantly 393 | present 394 | previously 395 | primarily 396 | probably 397 | promptly 398 | proud 399 | provides 400 | put 401 | q 402 | que 403 | quickly 404 | quite 405 | qv 406 | r 407 | ran 408 | rather 409 | rd 410 | re 411 | readily 412 | really 413 | recent 414 | recently 415 | ref 416 | refs 417 | regarding 418 | regardless 419 | regards 420 | related 421 | relatively 422 | research 423 | respectively 424 | resulted 425 | resulting 426 | results 427 | right 428 | run 429 | s 430 | said 431 | same 432 | saw 433 | say 434 | saying 435 | says 436 | sec 437 | section 438 | see 439 | seeing 440 | seem 441 | seemed 442 | seeming 443 | seems 444 | seen 445 | self 446 | selves 447 | sent 448 | seven 449 | several 450 | shall 451 | she 452 | she'll 453 | shed 454 | shes 455 | should 456 | shouldn't 457 | show 458 | showed 459 | shown 460 | showns 461 | shows 462 | significant 463 | significantly 464 | similar 465 | similarly 466 | since 467 | six 468 | slightly 469 | so 470 | some 471 | somebody 472 | somehow 473 | someone 474 | somethan 475 | something 476 | sometime 477 | sometimes 478 | somewhat 479 | somewhere 480 | soon 481 | sorry 482 | specifically 483 | specified 484 | specify 485 | specifying 486 | still 487 | stop 488 | strongly 489 | sub 490 | substantially 491 | successfully 492 | such 493 | sufficiently 494 | suggest 495 | sup 496 | sure 497 | than 498 | that 499 | that's 500 | the 501 | their 502 | theirs 503 | them 504 | themselves 505 | then 506 | there 507 | there's 508 | these 509 | they 510 | they'd 511 | they'll 512 | they're 513 | they've 514 | this 515 | those 516 | through 517 | to 518 | too 
519 | under 520 | until 521 | up 522 | very 523 | was 524 | wasn't 525 | we 526 | we'd 527 | we'll 528 | we're 529 | we've 530 | were 531 | weren't 532 | what 533 | what's 534 | when 535 | when's 536 | where 537 | where's 538 | which 539 | while 540 | who 541 | who's 542 | whom 543 | why 544 | why's 545 | with 546 | won't 547 | would 548 | wouldn't 549 | you 550 | you'd 551 | you'll 552 | you're 553 | you've 554 | your 555 | yours 556 | yourself 557 | yourselves -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.MixProject do 2 | use Mix.Project 3 | 4 | def project do 5 | [ 6 | app: :scrape, 7 | version: "3.1.0", 8 | elixir: "~> 1.10", 9 | description: description(), 10 | package: package(), 11 | start_permanent: Mix.env() == :prod, 12 | deps: deps() 13 | ] 14 | end 15 | 16 | # Run "mix help compile.app" to learn about applications. 17 | def application do 18 | [ 19 | extra_applications: [:logger], 20 | mod: {Scrape.Application, []} 21 | ] 22 | end 23 | 24 | defp description do 25 | """ 26 | Scrape any website, article or RSS/Atom feed with ease! 27 | """ 28 | end 29 | 30 | defp package do 31 | [ 32 | files: ["lib", "mix.exs", "README.md", "LICENSE.txt"], 33 | maintainers: ["Maximilian Stroh"], 34 | licenses: ["LGPLv3"], 35 | links: %{"GitHub" => "https://github.com/Anonyfox/elixir-scrape"} 36 | ] 37 | end 38 | 39 | # Run "mix help deps" to learn about dependencies. 40 | defp deps do 41 | [ 42 | # enable development with `mix test.watch --stale` 43 | {:mix_test_watch, "~> 0.8", only: :dev, runtime: false}, 44 | # documentation generation 45 | {:ex_doc, "~> 0.20.2", only: :dev, runtime: false}, 46 | # language detection 47 | {:paasaa, "~> 0.3.1"}, 48 | # snowball stemmer for multiple languages with a NIF 49 | {:stemex, "~> 0.1.1"}, 50 | # HTML/XML parser with CSS3 selectors 51 | {:floki, "~> 0.21.0"}, 52 | # clone of arc90's readability algorithm 53 | {:readability, "~> 0.10.0"}, 54 | # iconv written in pure elixir 55 | {:codepagex, "~> 0.1.4"}, 56 | # http client 57 | {:httpoison, "~> 0.13.0"}, 58 | # xml to map 59 | {:elixir_xml_to_map, "~> 0.1.2"}, 60 | # map transformation functions 61 | {:morphix, "~> 0.8.0"} 62 | ] 63 | end 64 | end 65 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "certifi": {:hex, :certifi, "2.5.1", "867ce347f7c7d78563450a18a6a28a8090331e77fa02380b4a21962a65d36ee5", [:rebar3], [{:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm", "805abd97539caf89ec6d4732c91e62ba9da0cda51ac462380bbd28ee697a8c42"}, 3 | "codepagex": {:hex, :codepagex, "0.1.4", "dae3bc57e9334c324914b32ed61c0a30929fac3e73dc71fc611ed7eeb2dcb867", [:mix], [], "hexpm", "21710d98fb2bc03a4d44365b66aba569c3a9267437cfafd09ca27ed92a99c75e"}, 4 | "combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm"}, 5 | "dogma": {:hex, :dogma, "0.1.13", "7b6c6ad2b3ee6501eda3bd39e197dd5198be8d520d1c175c7f713803683cf27a", [:mix], [{:poison, ">= 2.0.0", [hex: :poison, repo: "hexpm", optional: false]}], "hexpm"}, 6 | "earmark": {:hex, :earmark, "1.3.2", "b840562ea3d67795ffbb5bd88940b1bed0ed9fa32834915125ea7d02e35888a5", [:mix], [], "hexpm", "e3be2bc3ae67781db529b80aa7e7c49904a988596e2dbff897425b48b3581161"}, 7 | "elixir_xml_to_map": {:hex, 
:elixir_xml_to_map, "0.1.2", "e3d1bd2f6562711117ae209657f385a1c1c34c8c720c748eeba2e22815797071", [:mix], [{:erlsom, "~>1.4", [hex: :erlsom, repo: "hexpm", optional: false]}], "hexpm", "a134d24496ebb25e1ab7027bba18a3be1f91f44aa3e6701bdc6ea5807d98ef0a"}, 8 | "erlsom": {:hex, :erlsom, "1.5.0", "c5a5cdd0ee0e8dca62bcc4b13ff08da24fdefc16ccd8b25282a2fda2ba1be24a", [:rebar3], [], "hexpm", "55a9dbf9cfa77fcfc108bd8e2c4f9f784dea228a8f4b06ea10b684944946955a"}, 9 | "ex_doc": {:hex, :ex_doc, "0.20.2", "1bd0dfb0304bade58beb77f20f21ee3558cc3c753743ae0ddbb0fd7ba2912331", [:mix], [{:earmark, "~> 1.3", [hex: :earmark, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.10", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm", "8e24fc8ff9a50b9f557ff020d6c91a03cded7e59ac3e0eec8a27e771430c7d27"}, 10 | "exjsx": {:hex, :exjsx, "4.0.0", "60548841e0212df401e38e63c0078ec57b33e7ea49b032c796ccad8cde794b5c", [:mix], [{:jsx, "~> 2.8.0", [hex: :jsx, repo: "hexpm", optional: false]}], "hexpm", "32e95820a97cffea67830e91514a2ad53b888850442d6d395f53a1ac60c82e07"}, 11 | "file_system": {:hex, :file_system, "0.2.7", "e6f7f155970975789f26e77b8b8d8ab084c59844d8ecfaf58cbda31c494d14aa", [:mix], [], "hexpm", "b4cfa2d69c7f0b18fd06db222b2398abeef743a72504e6bd7df9c52f171b047f"}, 12 | "floki": {:hex, :floki, "0.21.0", "0c0191a6dbc559300bac232f716c55fb5738d45ae846b3141b19e5f5741c1907", [:mix], [{:html_entities, "~> 0.4.0", [hex: :html_entities, repo: "hexpm", optional: false]}, {:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm", "4558100b255f5143d42432e75ceb731d04dbe824d1cf57c38e7e0f3c644ca0cd"}, 13 | "gettext": {:hex, :gettext, "0.16.1", "e2130b25eebcbe02bb343b119a07ae2c7e28bd4b146c4a154da2ffb2b3507af2", [:mix], [], "hexpm"}, 14 | "hackney": {:hex, :hackney, "1.15.1", "9f8f471c844b8ce395f7b6d8398139e26ddca9ebc171a8b91342ee15a19963f4", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm", "c2790c9f0f7205f4a362512192dee8179097394400e745e4d20bab7226a8eaad"}, 15 | "html5ever": {:hex, :html5ever, "0.7.0", "9f63ec1c783b2dc9f326840fcc993c01e926dbdef4e51ba1bbe5355993c258b4", [:mix], [{:rustler, "~> 0.18.0", [hex: :rustler, repo: "hexpm", optional: false]}], "hexpm"}, 16 | "html_entities": {:hex, :html_entities, "0.4.0", "f2fee876858cf6aaa9db608820a3209e45a087c5177332799592142b50e89a6b", [:mix], [], "hexpm", "3e3d7156a272950373ce5a4018b1490bea26676f8d6a7d409f6fac8568b8cb9a"}, 17 | "httpoison": {:hex, :httpoison, "0.13.0", "bfaf44d9f133a6599886720f3937a7699466d23bb0cd7a88b6ba011f53c6f562", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "4846958172d6401c4f34ecc5c2c4607b5b0d90b8eec8f6df137ca4907942ed0f"}, 18 | "idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "4bdd305eb64e18b0273864920695cb18d7a2021f31a11b9c5fbcd9a253f936e2"}, 19 | "jsx": {:hex, :jsx, "2.8.3", "a05252d381885240744d955fbe3cf810504eb2567164824e19303ea59eef62cf", [:mix, :rebar3], [], "hexpm", "fc3499fed7a726995aa659143a248534adc754ebd16ccd437cd93b649a95091f"}, 20 | "makeup": {:hex, :makeup, "0.8.0", 
"9cf32aea71c7fe0a4b2e9246c2c4978f9070257e5c9ce6d4a28ec450a839b55f", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "5fbc8e549aa9afeea2847c0769e3970537ed302f93a23ac612602e805d9d1e7f"}, 21 | "makeup_elixir": {:hex, :makeup_elixir, "0.13.0", "be7a477997dcac2e48a9d695ec730b2d22418292675c75aa2d34ba0909dcdeda", [:mix], [{:makeup, "~> 0.8", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "adf0218695e22caeda2820eaba703fa46c91820d53813a2223413da3ef4ba515"}, 22 | "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"}, 23 | "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"}, 24 | "mix_test_watch": {:hex, :mix_test_watch, "0.9.0", "c72132a6071261893518fa08e121e911c9358713f62794a90c95db59042af375", [:mix], [{:file_system, "~> 0.2.1 or ~> 0.3", [hex: :file_system, repo: "hexpm", optional: false]}], "hexpm", "817dec4a7f6edf260258002f99ac8ffaf7a8f395b27bf2d13ec24018beecec8a"}, 25 | "mochiweb": {:hex, :mochiweb, "2.18.0", "eb55f1db3e6e960fac4e6db4e2db9ec3602cc9f30b86cd1481d56545c3145d2e", [:rebar3], [], "hexpm", "b93e2b1e564bdbadfecc297277f9e6d0902da645b417d6c9210f6038ac63489a"}, 26 | "morphix": {:hex, :morphix, "0.8.0", "69ea4b2bc89eed7a85d5f3af7176862e1fd6e64af7f788a9d976cf599f0695af", [:mix], [], "hexpm", "307683e71d74af44da4af07ec7cc978d242f3395a159b64515f093d44280169f"}, 27 | "nimble_parsec": {:hex, :nimble_parsec, "0.5.0", "90e2eca3d0266e5c53f8fbe0079694740b9c91b6747f2b7e3c5d21966bba8300", [:mix], [], "hexpm", "5c040b8469c1ff1b10093d3186e2e10dbe483cd73d79ec017993fb3985b8a9b3"}, 28 | "paasaa": {:hex, :paasaa, "0.3.1", "94e1c4fc83bdd7b8c06fd90f965ff90a6198cbcf6ddf27b64de62f5dbcb2ccf7", [:mix], [{:exjsx, "~> 4.0", [hex: :exjsx, repo: "hexpm", optional: false]}], "hexpm", "5e02b49d9a968f6ccffa130c9a9f977a8ec3403b7a26069547bfda9daa557d10"}, 29 | "parallel": {:hex, :parallel, "0.0.3", "d1c9a03f0fd6c85ba174938b9823db51e01a68f9f0e76e3f3e11989cbeb607e7", [:mix], [], "hexpm"}, 30 | "parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm", "17ef63abde837ad30680ea7f857dd9e7ced9476cdd7b0394432af4bfc241b960"}, 31 | "poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm"}, 32 | "readability": {:hex, :readability, "0.10.0", "934212018e70346a982927ee4b32d3ddb3d5feba7bf7ab04f57da66ced5ab7a2", [:mix], [{:floki, "~> 0.20", [hex: :floki, repo: "hexpm", optional: false]}, {:httpoison, "~> 0.13.0", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm", "b0edfbd1878cbb27a81d7d3678670cdfb2d2b8fef6a9ca9cbc4013eb640082cd"}, 33 | "rustler": {:hex, :rustler, "0.18.0", "db4bd0c613d83a1badc31be90ddada6f9821de29e4afd15c53a5da61882e4f2d", [:mix], [], "hexpm"}, 34 | "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.4", "f0eafff810d2041e93f915ef59899c923f4568f4585904d010387ed74988e77b", [:make, :mix, :rebar3], [], "hexpm", "603561dc0fd62f4f2ea9b890f4e20e1a0d388746d6e20557cafb1b16950de88c"}, 35 | "stemex": {:hex, :stemex, "0.1.1", "726d693b67c4ee82398ca6f1bfbacc8d7aad20861a0371e44e9c6f9dee1e042d", [:mix], [], "hexpm", "219b8e81fedba5a9bb978b8f7eaf230e77f2702d58e409adcca998fde1788521"}, 36 | "timex": {:hex, :timex, "3.4.2", 
"d74649c93ad0e12ce5b17cf5e11fbd1fb1b24a3d114643e86dba194b64439547", [:mix], [{:combine, "~> 0.10", [hex: :combine, repo: "hexpm", optional: false]}, {:gettext, "~> 0.10", [hex: :gettext, repo: "hexpm", optional: false]}, {:tzdata, "~> 0.1.8 or ~> 0.5", [hex: :tzdata, repo: "hexpm", optional: false]}], "hexpm"}, 37 | "tzdata": {:hex, :tzdata, "0.5.19", "7962a3997bf06303b7d1772988ede22260f3dae1bf897408ebdac2b4435f4e6a", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"}, 38 | "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm", "1d1848c40487cdb0b30e8ed975e34e025860c02e419cb615d255849f3427439d"}, 39 | } 40 | -------------------------------------------------------------------------------- /test/flow/article_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.ArticleTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Flow.Article 5 | 6 | describe "Article#from_url" do 7 | end 8 | 9 | describe "Article#from_file" do 10 | test "works when a valid article file exists" do 11 | {:ok, data} = Article.from_file("cache/article/nytimes.html") 12 | assert data.title =~ "Highest Minimum Wage" 13 | assert data.summary =~ "raising the minimum wage" 14 | end 15 | 16 | test "refuses when no file exists" do 17 | {:error, error} = Article.from_file("missing") 18 | 19 | assert error == 20 | {:assign, :html, 21 | %File.Error{action: "read file", path: "missing", reason: :enoent}} 22 | end 23 | end 24 | 25 | describe "Article#from_string" do 26 | test "works when a valid string is given" do 27 | html = File.read!("cache/article/nytimes.html") 28 | {:ok, data} = Article.from_string(html) 29 | assert data.title =~ "Highest Minimum Wage" 30 | assert data.summary =~ "raising the minimum wage" 31 | end 32 | 33 | test "refuses when nil is given" do 34 | {:error, error} = Article.from_string(nil) 35 | assert error == :html_invalid 36 | end 37 | 38 | test "refuses when empty string is given" do 39 | {:error, error} = Article.from_string("") 40 | assert error == :html_invalid 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /test/flow/domain_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.DomainTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Flow.Domain 5 | 6 | describe "Domain#from_url" do 7 | end 8 | 9 | describe "Domain#from_file" do 10 | test "works when a valid domain file exists" do 11 | {:ok, data} = Domain.from_file("cache/domain/venturebeat.html") 12 | assert data.title =~ "Fortnite teams up with Avengers" 13 | assert length(data.feed_urls) == 3 14 | end 15 | 16 | test "refuses when no file exists" do 17 | {:error, error} = Domain.from_file("missing") 18 | 19 | assert error == 20 | {:assign, :html, 21 | %File.Error{action: "read file", path: "missing", reason: :enoent}} 22 | end 23 | end 24 | 25 | describe "Domain#from_string" do 26 | test "works when a valid string is given" do 27 | html = File.read!("cache/domain/venturebeat.html") 28 | {:ok, data} = Domain.from_string(html) 29 | assert data.title =~ "Fortnite teams up with Avengers" 30 | assert length(data.feed_urls) == 3 31 | end 32 | 33 | test "refuses when nil is given" do 34 | {:error, error} = Domain.from_string(nil) 35 | assert error == :html_invalid 36 | end 37 | 38 | test "refuses when empty string is given" do 39 | {:error, 
error} = Domain.from_string("") 40 | assert error == :html_invalid 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /test/flow/feed_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.FeedTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Flow.Feed 5 | 6 | describe "Feed#from_url" do 7 | end 8 | 9 | describe "Feed#from_string" do 10 | test "works when a valid string is given" do 11 | xml = File.read!("cache/feed/latimes.xml") 12 | {:ok, data} = Feed.from_string(xml) 13 | assert data.title =~ "latimes.com - Los Angeles Times" 14 | assert data.website_url == "http://www.latimes.com" 15 | 16 | item = data[:items] |> List.first() 17 | assert item.title =~ "guitar" 18 | end 19 | 20 | test "refuses when nil is given" do 21 | {:error, error} = Feed.from_string(nil) 22 | assert error == :xml_invalid 23 | end 24 | 25 | test "refuses when empty string is given" do 26 | {:error, error} = Feed.from_string("") 27 | assert error == :xml_invalid 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /test/ir/feed_item_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.FeedItemTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.IR.Feed 5 | alias Scrape.IR.FeedItem 6 | 7 | doctest FeedItem 8 | 9 | describe "FeedItem#title/1" do 10 | test "can extract from xml string of type atom" do 11 | xml = "<entry><title>abc</title></entry>" 12 | assert FeedItem.title(xml) == "abc" 13 | end 14 | 15 | test "can extract from xml string of type rss" do 16 | xml = "<item><title>abc</title></item>" 17 | assert FeedItem.title(xml) == "abc" 18 | end 19 | 20 | test "can extract from german atom feed" do 21 | xml = File.read!("cache/feed/heise.xml") 22 | item = xml |> Feed.items() |> List.first() 23 | assert FeedItem.title(item) =~ "Fachkräftemangel" 24 | end 25 | 26 | test "can extract from german rss feed" do 27 | xml = File.read!("cache/feed/spiegel.xml") 28 | item = xml |> Feed.items() |> List.first() 29 | assert FeedItem.title(item) =~ "Schwertransporter" 30 | end 31 | 32 | test "can extract from english atom feed" do 33 | xml = File.read!("cache/feed/elixir-lang.xml") 34 | item = xml |> Feed.items() |> List.first() 35 | assert FeedItem.title(item) == "Elixir v1.0 released" 36 | end 37 | 38 | test "can extract from english rss feed" do 39 | xml = File.read!("cache/feed/latimes.xml") 40 | item = xml |> Feed.items() |> List.first() 41 | assert FeedItem.title(item) =~ "Essential tracks" 42 | end 43 | end 44 | 45 | describe "FeedItem#description/1" do 46 | test "can extract from xml string of type atom" do 47 | xml = "<entry><summary>abc</summary></entry>" 48 | assert FeedItem.description(xml) == "abc" 49 | end 50 | 51 | test "can extract from xml string of type rss" do 52 | xml = "<item><description>abc</description></item>" 53 | assert FeedItem.description(xml) == "abc" 54 | end 55 | 56 | test "can extract from german atom feed" do 57 | xml = File.read!("cache/feed/heise.xml") 58 | item = xml |> Feed.items() |> List.first() 59 | assert FeedItem.description(item) =~ "730.000 Mitarbeiter" 60 | end 61 | 62 | test "can extract from german rss feed" do 63 | xml = File.read!("cache/feed/spiegel.xml") 64 | item = xml |> Feed.items() |> List.first() 65 | assert FeedItem.description(item) =~ "Schweres Unglück in der Oberpfalz" 66 | end 67 | 68 | test "can extract from english atom feed" do 69 | xml = File.read!("cache/feed/elixir-lang.xml") 70 | item = xml |> Feed.items() |> List.first() 71 | assert FeedItem.description(item) =~ 
"Elixir v1.0 is finally out" 72 | end 73 | 74 | test "can extract from english rss feed" do 75 | xml = File.read!("cache/feed/latimes.xml") 76 | item = xml |> Feed.items() |> List.first() 77 | assert FeedItem.description(item) =~ "high-energy party music" 78 | end 79 | end 80 | 81 | describe "FeedItem#website_url/1" do 82 | test "can extract from xml string of type atom" do 83 | xml = "" 84 | assert FeedItem.article_url(xml) == "http://example.com" 85 | end 86 | 87 | test "can extract from xml string of type rss" do 88 | xml = "http://example.com" 89 | assert FeedItem.article_url(xml) == "http://example.com" 90 | end 91 | 92 | test "can extract from german atom feed" do 93 | xml = File.read!("cache/feed/heise.xml") 94 | item = xml |> Feed.items() |> List.first() 95 | assert FeedItem.article_url(item) =~ "https://www.heise.de/newsticker" 96 | end 97 | 98 | test "can extract from german rss feed" do 99 | xml = File.read!("cache/feed/spiegel.xml") 100 | item = xml |> Feed.items() |> List.first() 101 | assert FeedItem.article_url(item) =~ "http://www.spiegel.de/panorama" 102 | end 103 | 104 | test "can extract from english atom feed" do 105 | xml = File.read!("cache/feed/elixir-lang.xml") 106 | item = xml |> Feed.items() |> List.first() 107 | assert FeedItem.article_url(item) =~ "http://elixir-lang.org/blog" 108 | end 109 | 110 | test "can extract from english rss feed" do 111 | xml = File.read!("cache/feed/latimes.xml") 112 | item = xml |> Feed.items() |> List.first() 113 | assert FeedItem.article_url(item) =~ "http://www.latimes.com/la-et-ms" 114 | end 115 | end 116 | 117 | describe "FeedItem#tags/1" do 118 | test "can extract from xml string of type atom" do 119 | xml = "abc" 120 | assert FeedItem.tags(xml) == ["abc"] 121 | end 122 | 123 | test "can extract from xml string of type rss" do 124 | xml = "abc" 125 | assert FeedItem.tags(xml) == ["abc"] 126 | end 127 | 128 | test "can extract from german atom feed" do 129 | xml = File.read!("cache/feed/heise.xml") 130 | item = xml |> Feed.items() |> List.first() 131 | assert FeedItem.tags(item) == [] 132 | end 133 | 134 | test "can extract from german rss feed" do 135 | xml = File.read!("cache/feed/spiegel.xml") 136 | item = xml |> Feed.items() |> List.first() 137 | assert FeedItem.tags(item) == ["panorama"] 138 | end 139 | 140 | test "can extract from english atom feed" do 141 | xml = File.read!("cache/feed/elixir-lang.xml") 142 | item = xml |> Feed.items() |> List.first() 143 | assert FeedItem.tags(item) == [] 144 | end 145 | 146 | test "can extract from english rss feed" do 147 | xml = File.read!("cache/feed/latimes.xml") 148 | item = xml |> Feed.items() |> List.first() 149 | assert FeedItem.tags(item) == [] 150 | end 151 | end 152 | 153 | describe "FeedItem#author/1" do 154 | test "can extract from xml string of type atom" do 155 | xml = "abc" 156 | assert FeedItem.author(xml) == "abc" 157 | end 158 | 159 | test "can extract from xml string of type rss" do 160 | xml = "abc" 161 | assert FeedItem.author(xml) == "abc" 162 | end 163 | 164 | test "can extract from german atom feed" do 165 | xml = File.read!("cache/feed/heise.xml") 166 | item = xml |> Feed.items() |> List.first() 167 | assert FeedItem.author(item) == nil 168 | end 169 | 170 | test "can extract from german rss feed" do 171 | xml = File.read!("cache/feed/spiegel.xml") 172 | item = xml |> Feed.items() |> List.first() 173 | assert FeedItem.author(item) == nil 174 | end 175 | 176 | test "can extract from english atom feed" do 177 | xml = File.read!("cache/feed/elixir-lang.xml") 178 | 
item = xml |> Feed.items() |> List.first() 179 | assert FeedItem.author(item) == "José Valim" 180 | end 181 | 182 | test "can extract from english rss feed" do 183 | xml = File.read!("cache/feed/latimes.xml") 184 | item = xml |> Feed.items() |> List.first() 185 | assert FeedItem.author(item) == "Randall Roberts" 186 | end 187 | end 188 | 189 | describe "FeedItem#image_url/1" do 190 | test "can extract from xml string of type atom" do 191 | xml = "<entry><media:thumbnail url='abc' /></entry>" 192 | assert FeedItem.image_url(xml) == "abc" 193 | end 194 | 195 | test "can extract from xml string of type rss" do 196 | xml = "<item><enclosure url='abc' /></item>" 197 | assert FeedItem.image_url(xml) == "abc" 198 | end 199 | 200 | test "can extract from german atom feed" do 201 | xml = File.read!("cache/feed/heise.xml") 202 | item = xml |> Feed.items() |> List.first() 203 | assert FeedItem.image_url(item) =~ "https://www.heise.de/scale/geometry/" 204 | end 205 | 206 | test "can extract from german rss feed" do 207 | xml = File.read!("cache/feed/spiegel.xml") 208 | item = xml |> Feed.items() |> List.first() 209 | assert FeedItem.image_url(item) == nil 210 | end 211 | 212 | test "can extract from english atom feed" do 213 | xml = File.read!("cache/feed/elixir-lang.xml") 214 | item = xml |> Feed.items() |> List.first() 215 | assert FeedItem.image_url(item) == nil 216 | end 217 | 218 | test "can extract from english rss feed" do 219 | xml = File.read!("cache/feed/latimes.xml") 220 | item = xml |> Feed.items() |> List.first() 221 | assert FeedItem.image_url(item) == nil 222 | end 223 | end 224 | end 225 | -------------------------------------------------------------------------------- /test/ir/feed_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.FeedTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.IR.Feed 5 | 6 | doctest Feed 7 | 8 | describe "Feed#title/1" do 9 | test "can extract from xml string of type atom" do 10 | assert Feed.title("<feed><title>abc</title></feed>") == "abc" 11 | end 12 | 13 | test "can extract from xml string of type rss" do 14 | assert Feed.title("<rss><channel><title>abc</title></channel></rss>") == "abc" 15 | end 16 | 17 | test "can extract from german atom feed" do 18 | html = File.read!("cache/feed/heise.xml") 19 | assert Feed.title(html) == "heise online News" 20 | end 21 | 22 | test "can extract from german rss feed" do 23 | html = File.read!("cache/feed/spiegel.xml") 24 | assert Feed.title(html) == "SPIEGEL ONLINE - Schlagzeilen" 25 | end 26 | 27 | test "can extract from english atom feed" do 28 | html = File.read!("cache/feed/elixir-lang.xml") 29 | assert Feed.title(html) == "Elixir Lang" 30 | end 31 | 32 | test "can extract from english rss feed" do 33 | html = File.read!("cache/feed/latimes.xml") 34 | assert Feed.title(html) == "latimes.com - Los Angeles Times" 35 | end 36 | end 37 | 38 | describe "Feed#description/1" do 39 | test "can extract from xml string of type atom" do 40 | xml = "<feed><subtitle>abc</subtitle></feed>" 41 | assert Feed.description(xml) == "abc" 42 | end 43 | 44 | test "can extract from xml string of type rss" do 45 | xml = "<rss><channel><description>abc</description></channel></rss>" 46 | assert Feed.description(xml) == "abc" 47 | end 48 | 49 | test "can extract from german atom feed" do 50 | xml = File.read!("cache/feed/heise.xml") 51 | assert Feed.description(xml) == "Nachrichten nicht nur aus der Welt der Computer" 52 | end 53 | 54 | test "can extract from german rss feed" do 55 | xml = File.read!("cache/feed/spiegel.xml") 56 | assert Feed.description(xml) =~ "Alles Wichtige aus" 57 | end 58 | 59 | test "can extract from english atom feed" do 60 | xml = File.read!("cache/feed/elixir-lang.xml") 61 | assert 
Feed.description(xml) == nil 62 | end 63 | 64 | test "can extract from english rss feed" do 65 | xml = File.read!("cache/feed/latimes.xml") 66 | assert Feed.description(xml) =~ "source of breaking news" 67 | end 68 | end 69 | 70 | describe "Feed#website_url/1" do 71 | test "can extract from xml string of type atom" do 72 | xml = "<feed><link href='http://example.com' /></feed>" 73 | assert Feed.website_url(xml) == "http://example.com" 74 | end 75 | 76 | test "can extract from xml string of type rss" do 77 | xml = "<rss><channel><link>http://example.com</link></channel></rss>" 78 | assert Feed.website_url(xml) == "http://example.com" 79 | end 80 | 81 | test "can extract from german atom feed" do 82 | xml = File.read!("cache/feed/heise.xml") 83 | assert Feed.website_url(xml) == "https://www.heise.de" 84 | end 85 | 86 | test "can extract from german rss feed" do 87 | xml = File.read!("cache/feed/spiegel.xml") 88 | assert Feed.website_url(xml) == "http://www.spiegel.de" 89 | end 90 | 91 | test "can extract from english atom feed" do 92 | xml = File.read!("cache/feed/elixir-lang.xml") 93 | assert Feed.website_url(xml) == "http://elixir-lang.org" 94 | end 95 | 96 | test "can extract from english rss feed" do 97 | xml = File.read!("cache/feed/latimes.xml") 98 | assert Feed.website_url(xml) == "http://www.latimes.com" 99 | end 100 | end 101 | 102 | describe "Feed#items/1" do 103 | test "can extract from xml string of type atom" do 104 | xml = "<feed><entry><title>abc</title></entry></feed>" 105 | assert Feed.items(xml) == [%{"title" => "abc"}] 106 | end 107 | 108 | test "can extract from xml string of type rss" do 109 | xml = "<rss><channel><item><title>abc</title></item></channel></rss>" 110 | assert Feed.items(xml) == [%{"title" => "abc"}] 111 | end 112 | 113 | test "can extract from german atom feed" do 114 | xml = File.read!("cache/feed/heise.xml") 115 | item = xml |> Feed.items() |> List.first() 116 | assert item["title"] =~ "Fachkräftemangel" 117 | end 118 | 119 | test "can extract from german rss feed" do 120 | xml = File.read!("cache/feed/spiegel.xml") 121 | item = xml |> Feed.items() |> List.first() 122 | assert item["title"] =~ "Schwertransporter" 123 | end 124 | 125 | test "can extract from english atom feed" do 126 | xml = File.read!("cache/feed/elixir-lang.xml") 127 | item = xml |> Feed.items() |> List.first() 128 | assert item["title"] =~ "v1.0 released" 129 | end 130 | 131 | test "can extract from english rss feed" do 132 | xml = File.read!("cache/feed/latimes.xml") 133 | item = xml |> Feed.items() |> List.first() 134 | assert item["title"] =~ "Instrumental guitar music" 135 | end 136 | end 137 | end 138 | -------------------------------------------------------------------------------- /test/ir/html_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.HTMLTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.IR.HTML 5 | 6 | doctest HTML 7 | 8 | describe "HTML#title/1" do 9 | test "can extract title from html string" do 10 | assert HTML.title("<title>abc</title>") == "abc" 11 | end 12 | 13 | test "can extract title from html website" do 14 | html = File.read!("cache/domain/venturebeat.html") 15 | assert HTML.title(html) =~ "Fortnite teams up with Avengers" 16 | end 17 | 18 | test "can extract title from german html article" do 19 | html = File.read!("cache/article/spiegel.html") 20 | assert HTML.title(html) =~ "Forscher über schwarzes Loch" 21 | end 22 | 23 | test "can extract title from english html article" do 24 | html = File.read!("cache/article/nytimes.html") 25 | assert HTML.title(html) =~ "Americans Are Seeing" 26 | end 27 | end 28 | 29 | describe "HTML#image_url/2" do 30 | test "can extract image_url from html string" do 31 
| url = "http://example.com" 32 | html = ~s() 33 | assert HTML.image_url(html, url) == "http://example.com/img.jpg" 34 | assert HTML.image_url(html) == "img.jpg" 35 | end 36 | end 37 | 38 | describe "DOM#icon_url/2" do 39 | test "can extract image_url from html string" do 40 | url = "http://example.com" 41 | html = ~s() 42 | assert HTML.icon_url(html, url) == "http://example.com/img.jpg" 43 | assert HTML.icon_url(html) == "img.jpg" 44 | end 45 | end 46 | 47 | describe "DOM#description/1" do 48 | test "can extract description from html string" do 49 | html = "" 50 | assert HTML.description(html) == "interesting!" 51 | end 52 | end 53 | 54 | describe "DOM#content/1" do 55 | test "can extract text from english html string" do 56 | html = File.read!("cache/article/nytimes.html") 57 | assert HTML.content(html) =~ "Minimum Wage Increases Have Trade-Offs." 58 | end 59 | 60 | test "can extract text from german html string" do 61 | html = File.read!("cache/article/spiegel.html") 62 | assert HTML.content(html) =~ "Im Interview erklärt er die Faszination schwarzer Löcher" 63 | end 64 | end 65 | 66 | describe "DOM#paragraphs/1" do 67 | test "can extract text from english html string" do 68 | html = File.read!("cache/article/nytimes.html") 69 | assert HTML.paragraphs(html) |> List.first() =~ "It hasn’t budged since." 70 | end 71 | 72 | test "can extract text from german html string" do 73 | html = File.read!("cache/article/spiegel.html") 74 | assert HTML.paragraphs(html) |> List.first() =~ "Volltreffer gelandet" 75 | end 76 | end 77 | end 78 | -------------------------------------------------------------------------------- /test/ir/text_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.TextTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.IR.Text 5 | 6 | doctest Text 7 | 8 | # test "greets the world" do 9 | # example = "lorem ipsum..." 
10 | # assert Text.extract_summary(example, ["lorem"]) == ["lorem ipsum"] 11 | # assert Text.generate_summary(example) == example 12 | # end 13 | 14 | test "can detect language of text" do 15 | assert Text.detect_language("the quick brown fox jumps over...") == :en 16 | assert Text.detect_language("Es ist ein schönes Wetter heute...") == :de 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /test/scrape_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ScrapeTest do 2 | use ExUnit.Case 3 | # doctest Scrape 4 | 5 | # test "greets the world" do 6 | # assert Scrape.hello() == :world 7 | # end 8 | end 9 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | -------------------------------------------------------------------------------- /test/tools/dom_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.DomTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Tools.DOM 5 | 6 | doctest DOM 7 | 8 | describe "DOM#from_string/1" do 9 | test "works with nil" do 10 | assert DOM.from_string(nil) == [] 11 | end 12 | 13 | test "works with empty string" do 14 | assert DOM.from_string("") == [] 15 | end 16 | 17 | test "works with html string with one root node" do 18 | assert DOM.from_string("<html></html>") == {"html", [], []} 19 | end 20 | 21 | test "works with html string with two root nodes" do 22 | assert DOM.from_string("<head></head><body></body>") == [{"head", [], []}, {"body", [], []}] 23 | end 24 | end 25 | 26 | describe "DOM#to_string/1" do 27 | test "works with nil" do 28 | assert DOM.to_string(nil) == "" 29 | end 30 | 31 | test "works with empty dom" do 32 | assert DOM.to_string([]) == "" 33 | end 34 | 35 | test "works with dom with one root node" do 36 | assert DOM.to_string({"html", [], []}) == "<html></html>" 37 | end 38 | 39 | test "works with dom with two root nodes" do 40 | dom = [{"head", [], []}, {"body", [], []}] 41 | assert DOM.to_string(dom) == "<head></head><body></body>" 42 | end 43 | end 44 | 45 | describe "DOM#text/2" do 46 | test "returns nil if nothing is found" do 47 | dom = DOM.from_string("<html><p>hello world</p></html>") 48 | assert DOM.text(dom, "div") == nil 49 | end 50 | 51 | test "returns string if something is found" do 52 | dom = DOM.from_string("<html><p>hello world</p></html>") 53 | assert DOM.text(dom, "p") == "hello world" 54 | end 55 | 56 | test "returns first string if many matches are found" do 57 | dom = DOM.from_string("<html><p>hello</p><p>world</p></html>") 58 | assert DOM.text(dom, "p") == "hello" 59 | end 60 | end 61 | 62 | describe "DOM#texts/2" do 63 | test "returns empty list if nothing is found" do 64 | dom = DOM.from_string("<html><p>hello world</p></html>") 65 | assert DOM.texts(dom, "div") == [] 66 | end 67 | 68 | test "returns string list if one match is found" do 69 | dom = DOM.from_string("<html><p>hello world</p></html>") 70 | assert DOM.texts(dom, "p") == ["hello world"] 71 | end 72 | 73 | test "returns string list if many matches are found" do 74 | dom = DOM.from_string("<html><p>hello</p><p>world</p></html>") 75 | assert DOM.texts(dom, "p") == ["hello", "world"] 76 | end 77 | end 78 | 79 | describe "DOM#attr/3" do 80 | test "returns nil if nothing is found" do 81 | dom = DOM.from_string("<meta name='a' content='b' />") 82 | assert DOM.attr(dom, "unknown", "unknown") == nil 83 | assert DOM.attr(dom, "meta", "unknown") == nil 84 | assert DOM.attr(dom, "meta[name='unknown']", "unknown") == nil 85 | assert DOM.attr(dom, "unknown", "content") == nil 86 | end 87 | 88 | test "returns string if something is found" do 89 | dom = DOM.from_string("<meta name='a' content='b' />") 90 | assert DOM.attr(dom, "meta[name='a']", "content") == "b" 91 | end 92 | 93 | test "returns first string if many matches are found" do 94 | dom = DOM.from_string("<meta name='a' content='b' /><meta name='a' content='c' />") 95 | assert DOM.attr(dom, "meta[name='a']", "content") == "b" 96 | end 97 | end 98 | 99 | describe "DOM#attrs/3" do 100 | test "returns empty list if nothing is found" do 101 | dom = DOM.from_string("<meta name='a' content='b' />") 102 | assert DOM.attrs(dom, "unknown", "unknown") == [] 103 | end 104 | 105 | test "returns string list if one match is found" do 106 | dom = DOM.from_string("<meta name='a' content='b' />") 107 | assert DOM.attrs(dom, "meta[name=a]", "content") == ["b"] 108 | end 109 | 110 | test "returns string list if many matches are found" do 111 | dom = DOM.from_string("<meta name='a' content='b' /><meta name='a' content='c' />") 112 | assert DOM.attrs(dom, "meta[name=a]", "content") == ["b", "c"] 113 | end 114 | end 115 | end 116 | -------------------------------------------------------------------------------- /test/tools/tree_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.TreeTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Tools.Tree 5 | 6 | doctest Tree 7 | 8 | describe "Tree#from_xml_string/1" do 9 | test "works with nil" do 10 | assert Tree.from_xml_string(nil) == %{} 11 | end 12 | 13 | test "works with empty string" do 14 | assert Tree.from_xml_string("") == %{} 15 | end 16 | 17 | test "works with xml string" do 18 | assert Tree.from_xml_string("<node>abc</node>") == %{"node" => "abc"} 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /test/tools/url_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.URLTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Tools.URL 5 | 6 | doctest URL 7 | 8 | describe "URL.merge/2" do 9 | test "can merge relative paths" do 10 | root_url = "http://example.com" 11 | assert URL.merge("/path", root_url) == "http://example.com/path" 12 | assert URL.merge("/path", root_url <> "/something") == "http://example.com/path" 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /test/tools/word_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.WordTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Tools.Word 5 | 6 | doctest Word 7 | 8 | describe "Word.stem/2" do 9 | test "can stem english words" do 10 | assert Word.stem("beautiful", :en) == "beauti" 11 | end 12 | 13 | test "can stem german words" do 14 | assert Word.stem("derbsten", :de) == "derb" 15 | end 16 | end 17 | 18 | describe "Word.is_stopword?/2" do 19 | test "can check english words" do 20 | assert Word.is_stopword?("a", :en) == true 21 | assert Word.is_stopword?("apple", :en) == false 22 | end 23 | 24 | test "can check german words" do 25 | assert Word.is_stopword?("eine", :de) == true 26 | assert Word.is_stopword?("vitamin", :de) == false 27 | end 28 | end 29 | end 30 | 
--------------------------------------------------------------------------------
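Usage sketch (editor's illustration, not a file from this repository): the test suite above pins down the public API, so a minimal IEx session against the cached fixtures might look like the following. Module names, argument shapes, and expected values are taken directly from the tests; nothing beyond them is assumed.

# Parse a feed from a raw XML string (see test/flow/feed_test.exs):
xml = File.read!("cache/feed/latimes.xml")
{:ok, feed} = Scrape.Flow.Feed.from_string(xml)
feed.title        # => "latimes.com - Los Angeles Times"
feed.website_url  # => "http://www.latimes.com"

# Extract an article from a cached HTML file (see test/flow/article_test.exs):
{:ok, article} = Scrape.Flow.Article.from_file("cache/article/nytimes.html")
article.title     # contains "Highest Minimum Wage"
article.summary   # contains "raising the minimum wage"

# Invalid input yields tagged error tuples instead of raising:
{:error, :html_invalid} = Scrape.Flow.Article.from_string("")
{:error, :xml_invalid} = Scrape.Flow.Feed.from_string(nil)

# Low-level helpers live under Scrape.Tools (see test/tools/):
Scrape.Tools.Word.stem("beautiful", :en)               # => "beauti"
Scrape.Tools.URL.merge("/path", "http://example.com")  # => "http://example.com/path"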