├── .formatter.exs
├── .gitignore
├── LICENSE.txt
├── README.md
├── cache
│   ├── article
│   │   ├── kreuzfahrt-prozente.html
│   │   ├── nytimes.html
│   │   └── spiegel.html
│   ├── domain
│   │   ├── spiegel.html
│   │   ├── venturebeat.html
│   │   └── zeit.html
│   └── feed
│       ├── elixir-lang.xml
│       ├── heise.xml
│       ├── latimes.xml
│       └── spiegel.xml
├── config
│   └── config.exs
├── lib
│   ├── scrape.ex
│   └── scrape
│       ├── application.ex
│       ├── flow.ex
│       ├── flow
│       │   ├── article.ex
│       │   ├── domain.ex
│       │   ├── feed.ex
│       │   └── feed_item.ex
│       ├── ir
│       │   ├── feed.ex
│       │   ├── feed_item.ex
│       │   ├── html.ex
│       │   ├── text.ex
│       │   └── text
│       │       ├── rake.ex
│       │       └── tfidf.ex
│       ├── options.ex
│       ├── source
│       │   ├── disk.ex
│       │   ├── http.ex
│       │   └── http
│       │       ├── charset.ex
│       │       ├── get.ex
│       │       └── transcode.ex
│       └── tools
│           ├── dom.ex
│           ├── tree.ex
│           ├── url.ex
│           ├── word.ex
│           └── word
│               ├── is_stopword.ex
│               └── stopwords
│                   ├── de.txt
│                   └── en.txt
├── mix.exs
├── mix.lock
└── test
    ├── flow
    │   ├── article_test.exs
    │   ├── domain_test.exs
    │   └── feed_test.exs
    ├── ir
    │   ├── feed_item_test.exs
    │   ├── feed_test.exs
    │   ├── html_test.exs
    │   └── text_test.exs
    ├── scrape_test.exs
    ├── test_helper.exs
    └── tools
        ├── dom_test.exs
        ├── tree_test.exs
        ├── url_test.exs
        └── word_test.exs

/.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # mac os x stuff 2 | .DS_Store 3 | 4 | # The directory Mix will write compiled artifacts to. 5 | /_build/ 6 | 7 | # If you run "mix test --cover", coverage assets end up here. 8 | /cover/ 9 | 10 | # The directory Mix downloads your dependencies sources to. 11 | /deps/ 12 | 13 | # Where third-party dependencies like ExDoc output generated docs. 14 | /doc/ 15 | 16 | # Ignore .fetch files in case you like to edit your project deps locally. 17 | /.fetch 18 | 19 | # If the VM crashes, it generates a dump, let's ignore it too. 20 | erl_crash.dump 21 | 22 | # Also ignore archive artifacts (built via "mix archive.build"). 23 | *.ez 24 | 25 | # Ignore package tarball (built via "mix hex.build"). 26 | scrape-*.tar 27 | 28 | # Ignore VSCode artifacts 29 | .elixir_ls 30 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 
98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrape 2 | 3 | [![Hex.pm](https://img.shields.io/hexpm/dt/scrape.svg)](https://hex.pm/packages/scrape) 4 | [![Hex.pm](https://img.shields.io/hexpm/v/scrape.svg)](https://hex.pm/packages/scrape) 5 | [![Hex.pm](https://img.shields.io/hexpm/l/scrape.svg)](https://hex.pm/packages/scrape) 6 | 7 | Structured data extraction from common web resources, using information-retrieval techniques. See the [docs](https://hexdocs.pm/scrape/Scrape.html). 8 | 9 | ## Installation 10 | 11 | The package can be installed by adding `scrape` to your list of dependencies in `mix.exs`: 12 | 13 | ```elixir 14 | def deps do 15 | [ 16 | {:scrape, "~> 3.0.0"} 17 | ] 18 | end 19 | ``` 20 | 21 | ## Known Issues 22 | 23 | * This package uses an outdated version of `httpoison` because of `keepcosmos/readability`. You can override this in your own app with `override: true`, and everything should work. 24 | * Version 3.x is a complete rewrite from scratch, so new issues may occur and the API has changed. Please provide a URL to an HTML/feed document when submitting issues, so I can reproduce and fix the problem. 25 | 26 | ## Usage 27 | 28 | * `Scrape.domain!(url)` -> get structured data of a domain-type URL (like https://bbc.com) 29 | * `Scrape.feed!(url)` -> get structured data of an RSS/Atom feed 30 | * `Scrape.article!(url)` -> get structured data of an article-type URL 31 | 32 | ## License 33 | 34 | LGPLv3. You can use this package any way you want (including commercially), but I want bugfixes and improvements to flow back into this package for everyone's benefit. 35 | -------------------------------------------------------------------------------- /cache/feed/latimes.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | latimes.com - Los Angeles Times 6 | http://www.latimes.com/rss2.0.xml 7 | The LA Times is a leading source of breaking news, entertainment, sports, politics, and more for Southern California and the world. 8 | en-US 9 | ©2016 Los Angeles Times 10 | Sun, 3 Jul 2016 11:14:41 -0700 11 | 12 | Essential tracks: Instrumental guitar music from Marisa Anderson, William Tyler and Harry Taussig 13 | Randall Roberts 14 | http://www.latimes.com/la-et-ms-essential-tracks-marisa-anderson-tyler-20160624-snap-story.html 15 | Summer playlists usually teem with high-energy party music, but nights spent camping under the stars at Joshua Tree or amid the Big Sur redwoods call for sounds more contemplative, organic, earthen. This month has seen the arrival of a few remarkable guitar-based instrumental albums that fit the...

17 | ]]>
18 | Sun, 3 Jul 2016 11:07:00 PDT 19 | 20 |
21 | 22 | Pine Fire continues to burn in wilderness area north of Ojai 23 | Alice Walton 24 | http://www.latimes.com/la-me-pine-fire-update-20160703-snap-story.html 25 | Hundreds of firefighters continued to battle a wildfire in the Sespe Wilderness north of Ojai on Sunday with no containment in sight.

The Pine fire started Thursday morning about 11 miles north of Ojai. As of Sunday morning, the fire had consumed 1,590 acres and was threatening 50 structures, said...

27 | ]]>
28 | Sun, 3 Jul 2016 11:05:00 PDT 29 |
30 | 31 | Ex-campaign manager dismisses complaints about Donald Trump's six-point-star tweet 32 | David Willman 33 | http://www.latimes.com/la-na-trailguide-updates-former-campaign-manager-dismisses-1467557105-htmlstory.html 34 | 37 | Sun, 3 Jul 2016 10:22:00 PDT 38 | 39 | 40 | 41 | GOP Sen. Cotton says Trump can 'make the case for himself' 42 | Christi Parsons 43 | http://www.latimes.com/la-na-trailguide-updates-gop-sen-cotton-says-trump-can-make-1467565002-htmlstory.html 44 | 47 | Sun, 3 Jul 2016 10:13:00 PDT 48 | 49 | 50 | 51 | Driver who escaped police pursuit in San Diego is finally arrested - two months later 52 | David Hernandez 53 | http://www.latimes.com/la-me-pursuit-arrest-20160703-snap-story.html 54 | Police say a man accused of leading officers on a chase through several freeways before ditching the car in downtown San Diego was arrested last week — two months after the pursuit.

Officials say Ahran Haugley, 41, drove off on April 28 when officers approached the Honda Accord in which he was...

56 | ]]>
57 | Sun, 3 Jul 2016 10:00:00 PDT 58 |
59 | 60 | FBI questions Hillary Clinton about her private email server 61 | http://www.latimes.com/la-na-trailguide-updates-07032016-htmlstory.html 62 | 65 | Sun, 3 Jul 2016 09:56:52 PDT 66 | 67 | 68 | Serena Williams cruises to third-round win at Wimbledon 69 | Associated Press 70 | http://www.latimes.com/la-sp-wimbledon-20160703-snap-story.html 71 | Serena Williams earned a decent day's rest on the middle Sunday at Wimbledon while Jo-Wilfried Tsonga had to work overtime — 19-17 in the fifth set — in another marathon involving John Isner.

Williams, the defending women's champion and six-time winner, overwhelmed Annika Beck, 6-3, 6-0, in just...

73 | ]]>
74 | Sun, 3 Jul 2016 09:50:00 PDT 75 | 76 |
77 | 78 | Lewis Hamilton wins Austrian Grand Prix after final-lap pass of Nico Rosberg 79 | Associated Press 80 | http://www.latimes.com/la-sp-formula-one-austrian-grand-prix-20160703-snap-story.html 81 | Lewis Hamilton won the Austrian Grand Prix on Sunday after colliding with Nico Rosberg on the final lap, an incident he blamed on his German teammate.

The two Mercedes drivers touched as Hamilton sought to overtake and Formula One championship leader Rosberg ended up losing his front wing, which...

83 | ]]>
84 | Sun, 3 Jul 2016 09:40:00 PDT 85 | 86 |
87 | 88 | Peter Sagan moves into Tour de France lead with Stage 2 win 89 | Associated Press 90 | http://www.latimes.com/la-sp-tour-de-france-20160703-snap-story.html 91 | World champion Peter Sagan made the most of a steep, short climb in a frenzied finale to win the second stage of the Tour de France and claim the race leader's yellow jersey on Sunday.

Sagan, who pulled on the coveted shirt for the first time, used his power on the 1.9-kilometer Cote de la Glacerie...

93 | ]]>
94 | Sun, 3 Jul 2016 09:30:00 PDT 95 | 96 |
97 | 98 | Man injured after explosion reported in Central Park 99 | Associated Press 100 | http://www.latimes.com/la-na-central-park-explosion-reported-20160703-snap-story.html 101 | Authorities say a man was seriously hurt in New York City’s Central Park after people near the area reported hearing some kind of explosion.

Fire officials say it happened shortly before 11 a.m., inside the park at 68th Street and Fifth Avenue. The man suffered serious injuries, possibly requiring...

103 | ]]>
104 | Sun, 3 Jul 2016 09:29:00 PDT 105 | 106 |
107 | 108 | At least 120 people - including 15 children - killed in dual Baghdad bombings 109 | Associated Press 110 | http://www.latimes.com/la-fg-ap-baghdad-bombing-20160702-snap-story.html 111 | A suicide truck bomb in downtown Baghdad killed 115 people and wounded nearly 200 others who were out shopping and celebrating early Sunday ahead of the holiday marking the end of Ramadan, security and medical officials said.

The attack, claimed by Islamic State, was the deadliest in months in...

113 | ]]>
114 | Sun, 3 Jul 2016 09:16:00 PDT 115 | 116 |
117 | 118 | Possible Clinton running mates audition with attacks on Trump and defenses of their views on trade 119 | Christi Parsons 120 | http://www.latimes.com/la-na-trailguide-updates-possible-clinton-running-mates-audition-1467553920-htmlstory.html 121 | 124 | Sun, 3 Jul 2016 09:06:00 PDT 125 | 126 | 127 | 128 | Scott Kazmir gets a first-inning adjustment 129 | Jesse Dougherty 130 | http://www.latimes.com/la-sp-scott-kazmir-first-inning-20160702-snap-story.html 131 | Scott Kazmir’s up-and-down season has carried one glaring theme: The first inning is his toughest obstacle.

Heading into his start against the Rockies on Saturday, Kazmir had a 9.00 ERA in the first inning of his 16 starts. Opponents were hitting a healthy .342 against him in the first. More than...

133 | ]]>
134 | Sun, 3 Jul 2016 09:00:00 PDT 135 | 136 |
137 | 138 | Former campaign bus rolls into Los Angeles as anti-Trump protest art 139 | Javier Panzar 140 | http://www.latimes.com/politics/la-pol-ca-donald-trump-bus-iowa-california-20160702-snap-story.html 141 | 144 | Sun, 3 Jul 2016 08:43:00 PDT 145 | 146 | 147 | 148 | Woman saves pet cockatiel, but home burned in fast-moving brush fire 149 | Howard Blume and Shelby Grad 150 | http://www.latimes.com/la-me-sb-fire-update-20160703-snap-story.html 151 | The fire seemed to come out of nowhere and spread rapidly Saturday in a San Bernardino neighborhood.

Resident Martha Hall told the San Bernardino Sun that she saw the flames rushing up the hill toward her home. She ran into her house, grabbed her pet cockatiel and fled. As she was leaving, she...

153 | ]]>
154 | Sun, 3 Jul 2016 08:33:00 PDT 155 | 156 |
157 | 158 | In Colorado, conservatives grapple with the Trump conundrum 159 | Melanie Mason 160 | http://www.latimes.com/la-na-pol-trump-colorado-conservatives-20160703-snap-story.html 161 | To understand the dilemma Colorado Republicans wrestled with at a conservative gathering this weekend, one only had to look at the range of speakers, whose positions on Donald Trump ran the gamut from enthusiastic support to vehement opposition.

Trump himself came to Colorado for the Western Conservative...

163 | ]]>
164 | Sun, 3 Jul 2016 08:15:00 PDT 165 | 166 |
167 | 168 | Porter Ranch's future after massive gas leak is in the eye of the beholder 169 | Alice Walton 170 | http://www.latimes.com/la-me-porter-ranch-20160629-snap-story.html 171 | In the hills above the 118 freeway, mansions are being built. Restaurants and grocery stores are packed. Cyclists pedal up and down wide-open streets.

On the surface, the community of Porter Ranch is returning to normal four months after the largest methane leak in American history was capped in...

173 | ]]>
174 | Sun, 3 Jul 2016 08:00:00 PDT 175 | 176 |
177 | 178 | Web Buzz: With the Lola app, personal travel advice and service are a quick text away 179 | Jen Leo 180 | http://www.latimes.com/la-tr-webbuzz-20160624-snap-story.html 181 | Need immediate advice about a flight or hotel? Here’s an instant messaging app that connects you to a helpful online concierge.

Name: Lola app

What it does: Connects travelers with travel agents who can find the best options for you based on your preferences, including favorite airlines, hotel...

183 | ]]>
184 | Sun, 3 Jul 2016 08:00:00 PDT 185 |
186 | 187 | 'Deer Hunter,' 'Heaven's Gate' director Michael Cimino dies at 77; the film community reacts 188 | Deborah Vankin 189 | http://www.latimes.com/la-et-mn-michael-cimino-20160702-snap-htmlstory.html 190 | 193 | Sun, 3 Jul 2016 07:38:00 PDT 194 | 195 | 196 | 197 | Inside Donald Trump's secret smear campaign against a tribal casino 198 | Joseph Tanfani 199 | http://www.latimes.com/politics/la-na-pol-trump-anti-indian-campaign-20160630-snap-story.html 200 | 203 | Sun, 3 Jul 2016 07:13:00 PDT 204 | 205 | 206 | 207 | Letters: To tip or not to tip, plus spritzing while driving 208 | http://www.latimes.com/la-tr-letters-20160626-snap-story.html 209 | I just returned from two weeks in England: one week in Bath, in the Cotswolds, and one week in London. I tipped everywhere I would in L.A., and the recipients were very appreciative [“Tips on Tipping,” On the Spot by Catharine Hamm, June 26].

Upon leaving for London, I took a taxi to the train...

211 | ]]>
212 | Sun, 3 Jul 2016 06:30:00 PDT 213 | 214 |
215 | 216 | Tesla and Google are both driving toward autonomous vehicles. Which company is taking the better route? 217 | Tracey Lien 218 | http://www.latimes.com/la-fi-hy-tesla-google-20160701-snap-story.html 219 | Google and Tesla agree autonomous vehicles will make streets safer, and both are racing toward a driverless future. But when Google tested its self-driving car prototype on employees a few years ago, it noticed something that would take it down a different path from Tesla.

Once behind the wheel...

221 | ]]>
222 | Sun, 3 Jul 2016 06:00:00 PDT 223 | 224 |
225 | 226 | CEOs are getting more political, but consumers aren't buying it 227 | Jena McGregor 228 | http://www.latimes.com/la-fi-on-leadership-ceo-activism-20160630-snap-story.html 229 | Starbucks Chief Executive Howard Schultz has spoken out on gun control, race relations and the "cynicism, despair, division, exclusion, fear and yes -- indifference" in America today.

Facebook founder and CEO Mark Zuckerberg said at a developer conference this year that "I hear fearful voices calling...

231 | ]]>
232 | Sun, 3 Jul 2016 06:00:00 PDT 233 | 234 |
235 | 236 | Feedback: Why type when you can write? 237 | http://www.latimes.com/la-ca-0703-feedback-20160627-snap-story.html 238 | Buried within Laila Lalami’s entertaining essay [“The Power of Procrastination,” June 24] are the words “the blank screen.” Therein lies her problem. For years, well-intentioned friends have urged me to use a computer instead of writing longhand on yellow lined paper, because, they say, “it is...

240 | ]]>
241 | Sun, 3 Jul 2016 06:00:00 PDT 242 | 243 |
244 | 245 | Learn all about exploring Yosemite National Park from the experts at REI 246 | http://www.latimes.com/la-tr-films-20160620-snap-story.html 247 | YOSEMITE

Workshop

REI experts will share tips on exploring Yosemite National Park.

When, where: 7 p.m. Thursday at the REI store in Arcadia, 214 N. Santa Anita Ave.

Admission, info: Free. (626) 447-1062

PATAGONIA

Presentation

Explorer and mountain guide Tad McCrea will share his experiences and...

249 | ]]>
250 | Sun, 3 Jul 2016 06:00:00 PDT 251 |
252 |
253 |
254 | -------------------------------------------------------------------------------- /cache/feed/spiegel.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | SPIEGEL ONLINE - Schlagzeilen 6 | http://www.spiegel.de 7 | Deutschlands führende Nachrichtenseite. Alles Wichtige aus Politik, Wirtschaft, Sport, Kultur, Wissenschaft, Technik und mehr. 8 | de 9 | Thu, 05 Nov 2015 23:50:24 +0100 10 | Thu, 05 Nov 2015 23:50:24 +0100 11 | 12 | SPIEGEL ONLINE 13 | http://www.spiegel.de 14 | http://www.spiegel.de/static/sys/logo_120x61.gif 15 | 16 | 17 | Unglück in Bayern: Zug erfasst Schwertransporter - mehrere Tote 18 | http://www.spiegel.de/panorama/gesellschaft/bayern-zug-erfasst-schwertransporter-auf-bahnuebergang-a-1061387.html#ref=rss 19 | Schweres Unglück in der Oberpfalz: Ein Lkw und ein Zug sind auf einem Bahnübergang kollidiert, beide fingen Feuer. Laut Polizei gab es Tote und Verletzte. 20 | Panorama 21 | Thu, 05 Nov 2015 23:42:00 +0100 22 | http://www.spiegel.de/panorama/gesellschaft/bayern-zug-erfasst-schwertransporter-auf-bahnuebergang-a-1061387.html 23 | 24 | 25 | 26 | US-Republikaner: Trump und Carson bekommen Schutz vom Secret Service 27 | http://www.spiegel.de/politik/ausland/usa-secret-service-bewacht-trump-und-carson-von-den-republikanern-a-1061385.html#ref=rss 28 | Moderne Insignien der Macht für Donald Trump und Ben Carson: Die beiden Präsidentschaftskandidaten der US-Republikaner sollen nun rund um die Uhr vom Secret Service beschützt werden - auch, weil sie in Umfragen vorne liegen. 29 | Politik 30 | Thu, 05 Nov 2015 23:23:51 +0100 31 | http://www.spiegel.de/politik/ausland/usa-secret-service-bewacht-trump-und-carson-von-den-republikanern-a-1061385.html 32 | Moderne Insignien der Macht für Donald Trump und Ben Carson: Die beiden Präsidentschaftskandidaten der US-Republikaner sollen nun rund um die Uhr vom Secret Service beschützt werden - auch, weil sie in Umfragen vorne liegen.]]> 33 | 34 | 35 | 36 | Europa League: Bobadilla lässt Augsburg jubeln 37 | http://www.spiegel.de/sport/fussball/europa-league-fc-augsburg-gewinnt-gegen-az-alkmaar-deutlich-a-1061379.html#ref=rss 38 | Chance gewahrt: Der FC Augsburg hat nach einem deutlichen Sieg gegen AZ Alkmaar gute Aussichten auf das Erreichen der K.o.-Phase. Beim Bundesligisten traf ein Spieler dreifach. 39 | Sport 40 | Thu, 05 Nov 2015 22:59:00 +0100 41 | http://www.spiegel.de/sport/fussball/europa-league-fc-augsburg-gewinnt-gegen-az-alkmaar-deutlich-a-1061379.html 42 | Chance gewahrt: Der FC Augsburg hat nach einem deutlichen Sieg gegen AZ Alkmaar gute Aussichten auf das Erreichen der K.o.-Phase. Beim Bundesligisten traf ein Spieler dreifach.]]> 43 | 44 | 45 | 46 | Remis gegen Sparta Prag: Schalkes Krise geht in der Europa League weiter 47 | http://www.spiegel.de/sport/fussball/fc-schalke-spielt-in-der-europa-league-bei-sparta-prag-remis-a-1061382.html#ref=rss 48 | Der FC Schalke geht angeschlagen ins Derby gegen Borussia Dortmund. In der Europa League reichte es bei Sparta Prag nur zu einem Unentschieden, vor allem die verletzten Abwehrspieler bereiten Trainer André Breitenreiter Sorgen. 49 | Sport 50 | Thu, 05 Nov 2015 22:58:00 +0100 51 | http://www.spiegel.de/sport/fussball/fc-schalke-spielt-in-der-europa-league-bei-sparta-prag-remis-a-1061382.html 52 | Der FC Schalke geht angeschlagen ins Derby gegen Borussia Dortmund. 
In der Europa League reichte es bei Sparta Prag nur zu einem Unentschieden, vor allem die verletzten Abwehrspieler bereiten Trainer André Breitenreiter Sorgen.]]> 53 | 54 | 55 | 56 | Koalitionsstreit in der Flüchtlingskrise: Erledigt 57 | http://www.spiegel.de/politik/deutschland/fluechtlingskrise-merkel-seehofer-gabriel-finden-kompromiss-a-1061380.html#ref=rss 58 | Transitzonen + Einreisezentren = Aufnahme-Einrichtungen. Das ist der Kompromiss der Großen Koalition. CSU-Chef Seehofer erscheint als Verlierer des Gipfeltreffens. Ist er das wirklich? 59 | Politik 60 | Thu, 05 Nov 2015 22:18:00 +0100 61 | http://www.spiegel.de/politik/deutschland/fluechtlingskrise-merkel-seehofer-gabriel-finden-kompromiss-a-1061380.html 62 | Transitzonen + Einreisezentren = Aufnahme-Einrichtungen. Das ist der Kompromiss der Großen Koalition. CSU-Chef Seehofer erscheint als Verlierer des Gipfeltreffens. Ist er das wirklich?]]> 63 | 64 | 65 | 66 | Schwedischer Minister zu Flüchtlingen: "Bleibt in Deutschland" 67 | http://www.spiegel.de/politik/ausland/schweden-kann-fluechtlinge-laut-minister-nicht-mehr-unterbringen-a-1061378.html#ref=rss 68 | Schweden ist für seine liberale Asypolitik bekannt, doch jetzt will der Migrationsminister Flüchtlinge fernhalten - das Land habe nicht genug Unterkünfte. Morgan Johansson hat Asylsuchende aufgefordert, nach Deutschland zurückzukehren. 69 | Politik 70 | Thu, 05 Nov 2015 21:48:00 +0100 71 | http://www.spiegel.de/politik/ausland/schweden-kann-fluechtlinge-laut-minister-nicht-mehr-unterbringen-a-1061378.html 72 | Schweden ist für seine liberale Asypolitik bekannt, doch jetzt will der Migrationsminister Flüchtlinge fernhalten - das Land habe nicht genug Unterkünfte. Morgan Johansson hat Asylsuchende aufgefordert, nach Deutschland zurückzukehren.]]> 73 | 74 | 75 | 76 | Pope-Pop: Franziskus veröffentlicht Rockalbum 77 | http://www.spiegel.de/panorama/gesellschaft/papst-cd-franziskus-veroeffentlicht-rock-album-a-1061376.html#ref=rss 78 | Röhrende Bässe, E-Gitarren-Soli und dazu die Predigten von Franziskus: Über die musikalische Qualität des ersten Rock-Pop-Albums des Papstes lässt sich streiten, doch das Anliegen ist hehr. 79 | Panorama 80 | Thu, 05 Nov 2015 21:38:00 +0100 81 | http://www.spiegel.de/panorama/gesellschaft/papst-cd-franziskus-veroeffentlicht-rock-album-a-1061376.html 82 | Röhrende Bässe, E-Gitarren-Soli und dazu die Predigten von Franziskus: Über die musikalische Qualität des ersten Rock-Pop-Albums des Papstes lässt sich streiten, doch das Anliegen ist hehr.]]> 83 | 84 | 85 | 86 | Europa League: Klopp siegt mit Liverpool auch international 87 | http://www.spiegel.de/sport/fussball/juergen-klopp-siegt-mit-liverpool-in-der-europa-league-a-1061373.html#ref=rss 88 | Mehr als 3000 Kilometer Anreise und frostige Temperaturen haben sich gelohnt: Jürgen Klopp hat mit Liverpool in Kasan seinen dritten Sieg in Folge gefeiert. Neapel steht schon in der Zwischenrunde. 89 | Sport 90 | Thu, 05 Nov 2015 20:56:00 +0100 91 | http://www.spiegel.de/sport/fussball/juergen-klopp-siegt-mit-liverpool-in-der-europa-league-a-1061373.html 92 | Mehr als 3000 Kilometer Anreise und frostige Temperaturen haben sich gelohnt: Jürgen Klopp hat mit Liverpool in Kasan seinen dritten Sieg in Folge gefeiert. 
Neapel steht schon in der Zwischenrunde.]]> 93 | 94 | 95 | 96 | Europa League: Dortmund siegt und schafft es vorzeitig in die Zwischenrunde 97 | http://www.spiegel.de/sport/fussball/europa-league-borussia-dortmund-bezwingt-fk-qaebaelae-a-1061372.html#ref=rss 98 | Borussia Dortmund ist dank eines überzeugenden Sieges gegen FK Qäbälä frühzeitig in die K.o-Runde der Europa League eingezogen. Der BVB konnte es sich dabei sogar erlauben, einige Leistungsträger zu schonen. 99 | Sport 100 | Thu, 05 Nov 2015 20:53:22 +0100 101 | http://www.spiegel.de/sport/fussball/europa-league-borussia-dortmund-bezwingt-fk-qaebaelae-a-1061372.html 102 | Borussia Dortmund ist dank eines überzeugenden Sieges gegen FK Qäbälä frühzeitig in die K.o-Runde der Europa League eingezogen. Der BVB konnte es sich dabei sogar erlauben, einige Leistungsträger zu schonen.]]> 103 | 104 | 105 | 106 | Nasa-Daten zeigen: Sonneneruptionen reißen Mars-Atmosphäre weg 107 | http://www.spiegel.de/wissenschaft/weltall/sonneneruptionen-reissen-mars-atmosphaere-weg-a-1061367.html#ref=rss 108 | Verglichen mit der Erde hat der Mars eine extrem dünne Atmosphäre. Schuld daran sind wohl massive Sonnenstürme. Darauf deuten Messungen der Nasa-Sonde "Maven" hin. 109 | Wissenschaft 110 | Thu, 05 Nov 2015 20:04:00 +0100 111 | http://www.spiegel.de/wissenschaft/weltall/sonneneruptionen-reissen-mars-atmosphaere-weg-a-1061367.html 112 | Verglichen mit der Erde hat der Mars eine extrem dünne Atmosphäre. Schuld daran sind wohl massive Sonnenstürme. Darauf deuten Messungen der Nasa-Sonde "Maven" hin.]]> 113 | 114 | 115 | 116 | James-Bond-Quiz: Lizenz zum Danebenliegen 117 | http://www.spiegel.de/kultur/kino/james-bond-quiz-haben-sie-das-zeug-zum-doppel-null-agenten-a-1061142.html#ref=rss 118 | Sie wissen alles über James Bond? Kennen alle Girls, alle Autos, alle Action-Szenen? Bestimmt nicht! Das schwerste Bond-Quiz, zu dem SPIEGEL ONLINE imstande ist, wird Sie eines Besseren belehren. 119 | Kultur 120 | Thu, 05 Nov 2015 19:40:30 +0100 121 | http://www.spiegel.de/kultur/kino/james-bond-quiz-haben-sie-das-zeug-zum-doppel-null-agenten-a-1061142.html 122 | Sie wissen alles über James Bond? Kennen alle Girls, alle Autos, alle Action-Szenen? Bestimmt nicht! Das schwerste Bond-Quiz, zu dem SPIEGEL ONLINE imstande ist, wird Sie eines Besseren belehren.]]> 123 | 124 | 125 | 126 | James-Bond-Quiz: Haben Sie das Zeug zum Doppel-Null-Agenten? 127 | http://www.spiegel.de/quiztool/quiztool-64516.html#ref=rss 128 | Mit welchem Bond-Girl war Womanizer James Bond verheiratet? Wie tötete 007 den Bösewicht Sanchez in "Octopussy" und welcher der bislang vierundzwanzig Streifen ist kein "richtiger" Bond? Testen Sie Ihr Geheimagenten-Wissen im Quiz. 129 | Kultur 130 | Thu, 05 Nov 2015 19:39:31 +0100 131 | http://www.spiegel.de/quiztool/quiztool-64516.html 132 | 133 | 134 | 135 | Drama "El Club": Priester ohne Reue 136 | http://www.spiegel.de/kultur/kino/el-club-filmkritik-die-suenden-der-seelsorger-a-1060686.html#ref=rss 137 | Vier suspendierte Priester leben an der Nordküste Chiles, schuldig und isoliert. Nach einer Gewalttat zwingt ein Ermittler sie dazu, sich ihrer eigenen Vergangenheit zu stellen. "El Club" ist packender Mystery-Thriller und Abrechnung zugleich. 
138 | Kultur 139 | Thu, 05 Nov 2015 19:25:00 +0100 140 | http://www.spiegel.de/kultur/kino/el-club-filmkritik-die-suenden-der-seelsorger-a-1060686.html 141 | 142 | 143 | 144 | Berühmter Historiker: Hans Mommsen ist tot 145 | http://www.spiegel.de/kultur/gesellschaft/hans-mommsen-ist-tot-a-1061374.html#ref=rss 146 | Er zählte zu den streitbarsten Historikern der Nachkriegszeit: Hans Mommsen prägte die Forschung über die NS-Zeit. Jetzt ist er an seinem 85. Geburtstag in Bayern gestorben. 147 | Kultur 148 | Thu, 05 Nov 2015 19:24:00 +0100 149 | http://www.spiegel.de/kultur/gesellschaft/hans-mommsen-ist-tot-a-1061374.html 150 | Er zählte zu den streitbarsten Historikern der Nachkriegszeit: Hans Mommsen prägte die Forschung über die NS-Zeit. Jetzt ist er an seinem 85. Geburtstag in Bayern gestorben.]]> 151 | 152 | 153 | 154 | Mafia-Prozess in Italien: "Die Stadträte müssen unseren Befehlen folgen" 155 | http://www.spiegel.de/panorama/justiz/rom-mafia-capitale-um-massimo-carminati-vor-gericht-a-1061316.html#ref=rss 156 | In Italien hat ein Riesenprozess gegen die Hauptstadtmafia begonnen: 250 Zeugen sollen gehört werden, 46 Angeklagte stehen vor Gericht. Im Mittelpunkt steht Massimo "der Schwarze" Carminati, selbsternannter König von Rom. 157 | Panorama 158 | Thu, 05 Nov 2015 19:00:00 +0100 159 | http://www.spiegel.de/panorama/justiz/rom-mafia-capitale-um-massimo-carminati-vor-gericht-a-1061316.html 160 | In Italien hat ein Riesenprozess gegen die Hauptstadtmafia begonnen: 250 Zeugen sollen gehört werden, 46 Angeklagte stehen vor Gericht. Im Mittelpunkt steht Massimo "der Schwarze" Carminati, selbsternannter König von Rom.]]> 161 | 162 | 163 | 164 | Push-Nachrichten-App Notify: Facebooks erdrückende Umarmung 165 | http://www.spiegel.de/netzwelt/netzpolitik/facebook-notify-news-app-soll-push-nachrichten-schicken-a-1061368.html#ref=rss 166 | Medienberichten zufolge will Facebook eine Nachrichten-App auf den Markt bringen. Push-Meldungen diverser Medien sollen darin nach den Wünschen der Nutzer zusammengestellt werden. Das passt zu Facebooks Umarmungsstrategie gegenüber Medien. 167 | Netzwelt 168 | Thu, 05 Nov 2015 18:43:00 +0100 169 | http://www.spiegel.de/netzwelt/netzpolitik/facebook-notify-news-app-soll-push-nachrichten-schicken-a-1061368.html 170 | Medienberichten zufolge will Facebook eine Nachrichten-App auf den Markt bringen. Push-Meldungen diverser Medien sollen darin nach den Wünschen der Nutzer zusammengestellt werden. Das passt zu Facebooks Umarmungsstrategie gegenüber Medien. ]]> 171 | 172 | 173 | 174 | Emma Watson trifft Malala Yousafzai: Endlich Feministin 175 | http://www.spiegel.de/panorama/leute/emma-watson-und-malala-yousafzai-ueber-frauenrechte-a-1061346.html#ref=rss 176 | Malala Yousafzai und Emma Watson setzen sich beide für die Rechte der Frauen ein. Als Feministin wollte sich Yousafzai jedoch nie bezeichnen - bis sie von Watson überzeugt wurde. 177 | Panorama 178 | Thu, 05 Nov 2015 18:41:00 +0100 179 | http://www.spiegel.de/panorama/leute/emma-watson-und-malala-yousafzai-ueber-frauenrechte-a-1061346.html 180 | 181 | 182 | 183 | Nach Hackerangriff: Abgeordnete müssen jetzt Passwörter mit mindestens acht Zeichen verwenden 184 | http://www.spiegel.de/politik/deutschland/hackerangriff-bundestag-ruestet-ein-bisschen-auf-a-1061332.html#ref=rss 185 | Was hat der Bundestag aus dem schweren Hackerangriff gelernt? Das Parlament schränkt jetzt sogar das Internet für Abgeordnete ein - die Angreifer sind weiterhin unentdeckt. 
186 | Politik 187 | Thu, 05 Nov 2015 18:36:00 +0100 188 | http://www.spiegel.de/politik/deutschland/hackerangriff-bundestag-ruestet-ein-bisschen-auf-a-1061332.html 189 | Was hat der Bundestag aus dem schweren Hackerangriff gelernt? Das Parlament schränkt jetzt sogar das Internet für Abgeordnete ein - die Angreifer sind weiterhin unentdeckt. ]]> 190 | 191 | 192 | 193 | Kraftfahrt-Bundesamt: Machtlose Aufseher 194 | http://www.spiegel.de/wirtschaft/soziales/kraftfahrt-bundesamt-die-ohnmaechtigen-aufseher-a-1061338.html#ref=rss 195 | Die Abgasaffäre bei Volkswagen hat auch den Ruf des Kraftfahrt-Bundesamts schwer beschädigt. Doch die Vorwürfe zielen in die falsche Richtung. Denn die Ansagen kommen aus Berlin. 196 | Wirtschaft 197 | Thu, 05 Nov 2015 18:16:00 +0100 198 | http://www.spiegel.de/wirtschaft/soziales/kraftfahrt-bundesamt-die-ohnmaechtigen-aufseher-a-1061338.html 199 | 200 | 201 | 202 | Mutmaßlicher NSU-Helfer: Die Handy-Kontakte des André E. 203 | http://www.spiegel.de/panorama/nsu-prozess-die-handy-kontakte-des-andre-e-a-1020286.html#ref=rss 204 | Die große Aufmerksamkeit für Beate Zschäpe kann dem Mitangeklagten André E. nur recht sein: Er unterstützte den NSU laut Anklage über Jahre. Seine Handydaten zeigen, wie viele Kontakte er zu mutmaßlichen Kriminellen pflegte. 205 | Panorama 206 | Thu, 05 Nov 2015 18:04:00 +0100 207 | http://www.spiegel.de/panorama/nsu-prozess-die-handy-kontakte-des-andre-e-a-1020286.html 208 | Die große Aufmerksamkeit für Beate Zschäpe kann dem Mitangeklagten André E. nur recht sein: Er unterstützte den NSU laut Anklage über Jahre. Seine Handydaten zeigen, wie viele Kontakte er zu mutmaßlichen Kriminellen pflegte.]]> 209 | 210 | 211 | 212 | Gipfel in Berlin: Koalition einigt sich auf Registrierzentren für Flüchtlinge 213 | http://www.spiegel.de/politik/deutschland/fluechtlinge-koalition-einigt-sich-auf-registrierzentren-a-1061370.html#ref=rss 214 | Keine Transitzonen an der Grenze, dafür spezielle Aufnahme-Einrichtungen für Flüchtlinge aus sicheren Herkunftsländern: Auf diesen Kompromiss haben sich die Spitzen der Großen Koalition bei ihrem Flüchtlingsgipfel geeinigt. 215 | Politik 216 | Thu, 05 Nov 2015 17:59:00 +0100 217 | http://www.spiegel.de/politik/deutschland/fluechtlinge-koalition-einigt-sich-auf-registrierzentren-a-1061370.html 218 | Keine Transitzonen an der Grenze, dafür spezielle Aufnahme-Einrichtungen für Flüchtlinge aus sicheren Herkunftsländern: Auf diesen Kompromiss haben sich die Spitzen der Großen Koalition bei ihrem Flüchtlingsgipfel geeinigt.]]> 219 | 220 | 221 | 222 | Vorratsdatenspeicherung: Thüringen will im Bundesrat Vermittlungsausschuss anrufen 223 | http://www.spiegel.de/netzwelt/web/vorratsdatenspeicherung-thueringen-will-vermittlungsausschuss-anrufen-a-1061363.html#ref=rss 224 | Am Freitag ist die umstrittene Vorratsdatenspeicherung Thema im Bundesrat. Thüringens Justizminister hat bereits angekündigt, wegen Datenschutzbedenken den Vermittlungsausschuss anrufen zu wollen. Seine Chancen stehen aber schlecht. 
225 | Netzwelt 226 | Thu, 05 Nov 2015 17:49:00 +0100 227 | http://www.spiegel.de/netzwelt/web/vorratsdatenspeicherung-thueringen-will-vermittlungsausschuss-anrufen-a-1061363.html 228 | 229 | 230 | 231 | Teuerste H&M-Kollektion: Elitäres für die Massen 232 | http://www.spiegel.de/stil/balmain-entwirft-fuer-h-m-elitaeres-fuer-die-massen-a-1060771.html#ref=rss 233 | Es hat mit Karl Lagerfeld funktioniert und mit Stella McCartney: Top-Designer entwerfen für H&M, Kunden stürmen die Läden - und zahlen ein Vielfaches der üblichen Preise. Die neue Kollektion kommt von Olivier Rousteing und ist so teuer wie keine zuvor. Zu teuer? Stimmen Sie ab! 234 | Stil 235 | Thu, 05 Nov 2015 17:47:00 +0100 236 | http://www.spiegel.de/stil/balmain-entwirft-fuer-h-m-elitaeres-fuer-die-massen-a-1060771.html 237 | 238 | 239 | 240 | Flüchtlingsfamilien: Kinder ohne Papiere dürfen zur Schule 241 | http://www.spiegel.de/schulspiegel/kinder-ohne-aufenthaltsrecht-duerfen-zur-schule-a-1060431.html#ref=rss 242 | Alle Kinder haben ein Recht auf Bildung - auch wenn sie nicht alle Papiere haben, die Deutschland vorschreibt. Doch nehmen Schulen solche Kinder überhaupt auf? Und müssen sie das an die Behörden melden? Die Antworten auf die wichtigsten Fragen. 243 | SchulSPIEGEL 244 | Thu, 05 Nov 2015 17:41:00 +0100 245 | http://www.spiegel.de/schulspiegel/kinder-ohne-aufenthaltsrecht-duerfen-zur-schule-a-1060431.html 246 | Alle Kinder haben ein Recht auf Bildung - auch wenn sie nicht alle Papiere haben, die Deutschland vorschreibt. Doch nehmen Schulen solche Kinder überhaupt auf? Und müssen sie das an die Behörden melden? Die Antworten auf die wichtigsten Fragen.]]> 247 | 248 | 249 | 250 | Afghanistan: Ärzte ohne Grenzen wirft USA vorsätzlichen Angriff auf Klinik vor 251 | http://www.spiegel.de/politik/ausland/aerzte-ohne-grenzen-werfen-usa-vorsaetzlichen-angriff-auf-klinik-in-kunduz-vor-a-1061362.html#ref=rss 252 | Bei einem US-Luftangriff auf ein Krankenhaus in Kunduz wurden vor einem Monat 30 Menschen getötet. Die Hilfsorganisation Ärzte ohne Grenzen bezweifelt, dass die Armee aus Versehen handelte. 253 | Politik 254 | Thu, 05 Nov 2015 17:25:00 +0100 255 | http://www.spiegel.de/politik/ausland/aerzte-ohne-grenzen-werfen-usa-vorsaetzlichen-angriff-auf-klinik-in-kunduz-vor-a-1061362.html 256 | Bei einem US-Luftangriff auf ein Krankenhaus in Kunduz wurden vor einem Monat 30 Menschen getötet. Die Hilfsorganisation Ärzte ohne Grenzen bezweifelt, dass die Armee aus Versehen handelte. ]]> 257 | 258 | 259 | 260 | Elendspanorama "Vorbereitung auf das nächste Leben": Amerika ist erledigt 261 | http://www.spiegel.de/kultur/literatur/vorbereitung-auf-das-naechste-leben-von-atticus-lish-rezension-a-1060858.html#ref=rss 262 | Eine Geschichte vom unteren Rand der Gesellschaft: Atticus Lish gelang mit seinem Debütroman "Vorbereitung auf das nächste Leben" ein Überraschungshit in den USA, der das Pathos zurück in die Gegenwartsliteratur bringt. 263 | Kultur 264 | Thu, 05 Nov 2015 17:17:00 +0100 265 | http://www.spiegel.de/kultur/literatur/vorbereitung-auf-das-naechste-leben-von-atticus-lish-rezension-a-1060858.html 266 | 267 | 268 | 269 | Mehrtägiger Streik: Was Lufthansa-Passagiere jetzt wissen müssen 270 | http://www.spiegel.de/reise/aktuell/lufthansa-streik-was-passagiere-wissen-muessen-a-1061342.html#ref=rss 271 | Lufthansa-Kunden müssen sich auf einen mehrtägigen Streik mit vielen Flugausfällen einrichten. Die Flugbegleiter wollen aber nicht vor Freitagmittag mit ihrem Ausstand beginnen. 
Hier finden Sie Antworten auf die wichtigsten Fragen. 272 | Reise 273 | Thu, 05 Nov 2015 17:08:00 +0100 274 | http://www.spiegel.de/reise/aktuell/lufthansa-streik-was-passagiere-wissen-muessen-a-1061342.html 275 | Lufthansa-Kunden müssen sich auf einen mehrtägigen Streik mit vielen Flugausfällen einrichten. Die Flugbegleiter wollen aber nicht vor Freitagmittag mit ihrem Ausstand beginnen. Hier finden Sie Antworten auf die wichtigsten Fragen.]]> 276 | 277 | 278 | 279 | Verhandlungen geplatzt: Lufthansa-Streik startet am Freitag 280 | http://www.spiegel.de/wirtschaft/unternehmen/lufthansa-streik-am-freitag-a-1061357.html#ref=rss 281 | Auch der letzte Einigungsversuch ist gescheitert: Die Flugbegleiter der Lufthansa streiken ab Freitag - mit Rücksicht auf die Kunden aber erst ab 12 Uhr. 282 | Wirtschaft 283 | Thu, 05 Nov 2015 17:04:00 +0100 284 | http://www.spiegel.de/wirtschaft/unternehmen/lufthansa-streik-am-freitag-a-1061357.html 285 | 286 | 287 | 288 | 1000 Kilometer entfernt: US-Ermittler entdecken vermissten Jungen nach 13 Jahren 289 | http://www.spiegel.de/panorama/ohio-vermisster-junge-in-den-usa-nach-13-jahren-aufgetaucht-a-1061350.html#ref=rss 290 | Mit fünf Jahren verschwand ein Junge aus Alabama. Die Mutter vermutete, der Vater habe den Jungen entführt. Jahre später haben ihn nun Ermittler in Ohio entdeckt - durch einen Zufall. 291 | Panorama 292 | Thu, 05 Nov 2015 17:00:00 +0100 293 | http://www.spiegel.de/panorama/ohio-vermisster-junge-in-den-usa-nach-13-jahren-aufgetaucht-a-1061350.html 294 | 295 | 296 | 297 | Pressekompass: Sterbehilfevereine legal oder illegal? Das sagen die Medien 298 | http://www.spiegel.de/gesundheit/diagnose/pressekompass-zur-sterbehilfe-debatte-im-bundestag-a-1061351.html#ref=rss 299 | Totales Verbot oder liberale Freigabe - die vier Gesetzentwürfe zur Sterbehilfe könnten unterschiedlicher nicht sein. Wie entscheidet der Bundestag am Freitag? Der Pressekompass zeigt Meinungstrends der Medien. 300 | Gesundheit 301 | Thu, 05 Nov 2015 16:59:00 +0100 302 | http://www.spiegel.de/gesundheit/diagnose/pressekompass-zur-sterbehilfe-debatte-im-bundestag-a-1061351.html 303 | Totales Verbot oder liberale Freigabe - die vier Gesetzentwürfe zur Sterbehilfe könnten unterschiedlicher nicht sein. Wie entscheidet der Bundestag am Freitag? Der Pressekompass zeigt Meinungstrends der Medien.]]> 304 | 305 | 306 | 307 | 308 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | # This file is responsible for configuring your application 2 | # and its dependencies with the aid of the Mix.Config module. 3 | use Mix.Config 4 | 5 | # This configuration is loaded before any dependency and is restricted 6 | # to this project. If another project depends on this project, this 7 | # file won't be loaded nor affect the parent project. For this reason, 8 | # if you want to provide default values for your application for 9 | # third-party users, it should be done in your "mix.exs" file. 10 | 11 | # You can configure your application as: 12 | # 13 | # config :scrape, key: :value 14 | # 15 | # and access this configuration in your application as: 16 | # 17 | # Application.get_env(:scrape, :key) 18 | # 19 | # You can also configure a third-party app: 20 | # 21 | # config :logger, level: :info 22 | # 23 | 24 | # It is also possible to import configuration files, relative to this 25 | # directory. 
For example, you can emulate configuration per environment 26 | # by uncommenting the line below and defining dev.exs, test.exs and such. 27 | # Configuration from the imported file will override the ones defined 28 | # here (which is why it is important to import them last). 29 | # 30 | # import_config "#{Mix.env()}.exs" 31 | -------------------------------------------------------------------------------- /lib/scrape.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape do 2 | @moduledoc """ 3 | Elixir Toolkit for extracting meaningful structured data out of 4 | common web resources. 5 | 6 | This process is often called "web-scraping". Actually, the normalization 7 | and transformation of data into a well-known structured form is also 8 | known as "data engineering", which in turn is the prerequisite for most 9 | data-science/machine-learning/... algorithms in the wild. 10 | 11 | Currently Scrape supports 3 types of common web data: 12 | 13 | * Feeds: RSS or Atom XML feeds 14 | * Domains: "root" pages of a web presence 15 | * Articles: "content" pages of a web presence 16 | """ 17 | 18 | @doc """ 19 | Given a valid URL, return structured data of the content. 20 | 21 | This function is intended for "content" pages. 22 | """ 23 | 24 | @spec article(String.t()) :: {:ok, map()} | {:error, any()} 25 | @spec article(String.t(), [{atom(), any()}]) :: {:ok, map()} | {:error, any()} 26 | 27 | def article(url, opts \\ []) do 28 | Scrape.Flow.Article.from_url(url, opts) 29 | end 30 | 31 | @doc """ 32 | Same as `article/2` but will return the result directly or raise an 33 | error if the result is not `:ok`. 34 | """ 35 | 36 | def article!(url, opts \\ []) do 37 | {:ok, article} = Scrape.Flow.Article.from_url(url, opts) 38 | article 39 | end 40 | 41 | @doc """ 42 | Given a valid URL, return structured data of the domain. 43 | 44 | This function is intended for "root" pages of a web presence. The most 45 | important use case for Scrape is to detect possible feeds for the domain. 46 | """ 47 | 48 | @spec domain(String.t()) :: {:ok, map()} | {:error, any()} 49 | @spec domain(String.t(), [{atom(), any()}]) :: {:ok, map()} | {:error, any()} 50 | 51 | def domain(url, opts \\ []) do 52 | Scrape.Flow.Domain.from_url(url, opts) 53 | end 54 | 55 | @doc """ 56 | Same as `domain/2` but will return the result directly or raise an 57 | error if the result is not `:ok`. 58 | """ 59 | 60 | def domain!(url, opts \\ []) do 61 | {:ok, domain} = Scrape.Flow.Domain.from_url(url, opts) 62 | domain 63 | end 64 | 65 | @doc """ 66 | Given a valid URL, return structured data of the feed. 67 | """ 68 | 69 | @spec feed(String.t()) :: {:ok, map()} | {:error, any()} 70 | @spec feed(String.t(), [{atom(), any()}]) :: {:ok, map()} | {:error, any()} 71 | 72 | def feed(url, opts \\ []) do 73 | Scrape.Flow.Feed.from_url(url, opts) 74 | end 75 | 76 | @doc """ 77 | Same as `feed/2` but will return the result directly or raise an error 78 | if the result is not `:ok`.
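## Example

A hypothetical call (the URL is a placeholder and fetching it requires network access); the returned map contains the fields assembled by `Scrape.Flow.Feed`:

    Scrape.feed!("https://example.com/feed.xml")
    # => %{url: ..., title: ..., description: ..., website_url: ..., items: [...]}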
79 | """ 80 | 81 | def feed!(url, opts \\ []) do 82 | {:ok, feed} = Scrape.Flow.Feed.from_url(url, opts) 83 | feed 84 | end 85 | end 86 | -------------------------------------------------------------------------------- /lib/scrape/application.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Application do 2 | # See https://hexdocs.pm/elixir/Application.html 3 | # for more information on OTP Applications 4 | @moduledoc false 5 | 6 | use Application 7 | 8 | def start(_type, _args) do 9 | # List all child processes to be supervised 10 | children = [ 11 | # Starts a worker by calling: Scrape.Worker.start_link(arg) 12 | # {Scrape.Worker, arg} 13 | ] 14 | 15 | # See https://hexdocs.pm/elixir/Supervisor.html 16 | # for other strategies and supported options 17 | opts = [strategy: :one_for_one, name: Scrape.Supervisor] 18 | Supervisor.start_link(children, opts) 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/scrape/flow.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow do 2 | @moduledoc """ 3 | Logic Module for implementing linear data processing workflows. 4 | 5 | Uses a "token" approach to store/retrieve values and persists a pipeline 6 | state that can be halted at any time. In case that something goes wrong, 7 | the pipeline will be halted and an error object will be returned with the 8 | occured error. Therefore, the pipeline should never raise an actual exception. 9 | """ 10 | 11 | @typedoc """ 12 | Intermediate state object that holds everything relevant for the data 13 | processing work flow. `state` holds general processing information, `assigns` 14 | are the user-level data fields and `options` contains a keyword list for, 15 | well, configuration options. 16 | """ 17 | 18 | @type flow :: %__MODULE__{ 19 | state: %{ 20 | halted: boolean(), 21 | error: nil | any() 22 | }, 23 | assigns: map(), 24 | options: [{atom(), any()}] 25 | } 26 | 27 | defstruct(state: %{halted: false, error: nil}, assigns: %{}, options: []) 28 | 29 | @doc """ 30 | Initiate a new data processing flow with optional configuration. 31 | 32 | NOTE: the options are currently not used but will be in upcoming versions. 33 | 34 | ## Example 35 | iex> Flow.start() 36 | %Flow{state: %{halted: false, error: nil}, assigns: %{}, options: []} 37 | """ 38 | 39 | @spec start([{atom(), any()}]) :: flow 40 | 41 | def start(opts \\ []) do 42 | %__MODULE__{options: opts} 43 | end 44 | 45 | @doc """ 46 | Declare a new value in the data flow. 47 | 48 | Will do nothing if the flow got halted previously. If a function is given, 49 | and it raises an exception, the pipeline will catch the error and transform 50 | into a halted state. 51 | """ 52 | 53 | @spec assign(flow, [{atom(), any()}]) :: flow 54 | 55 | def assign(%__MODULE__{state: %{halted: true}} = flow, _) do 56 | flow 57 | end 58 | 59 | def assign(%__MODULE__{} = flow, [{k, v}]) when not is_function(v) do 60 | %{flow | assigns: Map.put(flow.assigns, k, v)} 61 | end 62 | 63 | def assign(%__MODULE__{} = flow, [{k, v}]) do 64 | try do 65 | %{flow | assigns: Map.put(flow.assigns, k, v.(flow.assigns))} 66 | rescue 67 | error -> %{flow | state: %{halted: true, error: {:assign, k, error}}} 68 | end 69 | end 70 | 71 | @doc """ 72 | Select keys from the flow assigns and return a map with the chosen fields. 73 | 74 | Will result in an error object if the flow got halted previously. 
75 | """ 76 | 77 | @spec finish(flow, [atom()]) :: {:ok, map()} | {:error, any()} 78 | 79 | def finish(_flow, keys \\ []) 80 | 81 | def finish(%__MODULE__{state: %{halted: true, error: error}}, _) do 82 | {:error, error} 83 | end 84 | 85 | def finish(%__MODULE__{assigns: assigns}, []), do: {:ok, assigns} 86 | 87 | def finish(%__MODULE__{assigns: assigns}, keys) do 88 | {:ok, Map.take(assigns, keys)} 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /lib/scrape/flow/article.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.Article do 2 | @moduledoc false 3 | 4 | alias Scrape.Flow 5 | alias Scrape.IR.HTML 6 | alias Scrape.IR.Text 7 | 8 | def from_url(url, opts \\ []) do 9 | Flow.start(opts) 10 | |> Flow.assign(url: url) 11 | |> Flow.assign(html: &Scrape.Source.HTTP.get!(&1[:url])) 12 | |> process_html() 13 | end 14 | 15 | def from_file(path, opts \\ []) do 16 | Flow.start(opts) 17 | |> Flow.assign(path: path) 18 | |> Flow.assign(html: &Scrape.Source.Disk.get!(&1[:path])) 19 | |> process_html() 20 | end 21 | 22 | def from_string(html, opts \\ []) do 23 | Flow.start(opts) 24 | |> Flow.assign(html: html) 25 | |> process_html() 26 | end 27 | 28 | defp process_html(%{assigns: %{html: nil}}), do: {:error, :html_invalid} 29 | 30 | defp process_html(%{assigns: %{html: ""}}), do: {:error, :html_invalid} 31 | 32 | defp process_html(flow) do 33 | flow 34 | |> Flow.assign(dom: &Floki.parse(&1[:html])) 35 | |> Flow.assign(title: &HTML.title(&1[:dom])) 36 | |> Flow.assign(image_url: &HTML.image_url(&1[:dom], &1[:url])) 37 | |> Flow.assign(readable_html: &HTML.simple(&1[:dom])) 38 | |> Flow.assign(text: fn %{html: html} -> HTML.content(html) || HTML.sentences(html) end) 39 | |> Flow.assign(language: &Text.detect_language(&1[:text])) 40 | |> Flow.assign(stems: &Text.semantic_keywords(&1[:text], 30, &1[:language])) 41 | |> Flow.assign(summary: &Text.extract_summary(&1[:text], &1[:stems], &1[:language])) 42 | |> Flow.finish([:url, :title, :text, :summary, :language, :stems, :image_url, :readable_html]) 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /lib/scrape/flow/domain.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.Domain do 2 | @moduledoc false 3 | 4 | alias Scrape.Flow 5 | alias Scrape.IR.HTML 6 | 7 | def from_url(url, opts \\ []) do 8 | Flow.start(opts) 9 | |> Flow.assign(url: url) 10 | |> Flow.assign(html: &Scrape.Source.HTTP.get!(&1[:url])) 11 | |> process_html() 12 | end 13 | 14 | def from_file(path, opts \\ []) do 15 | Flow.start(opts) 16 | |> Flow.assign(path: path) 17 | |> Flow.assign(html: &Scrape.Source.Disk.get!(&1[:path])) 18 | |> process_html() 19 | end 20 | 21 | def from_string(html, opts \\ []) do 22 | Flow.start(opts) 23 | |> Flow.assign(html: html) 24 | |> process_html() 25 | end 26 | 27 | defp process_html(%{assigns: %{html: nil}}), do: {:error, :html_invalid} 28 | 29 | defp process_html(%{assigns: %{html: ""}}), do: {:error, :html_invalid} 30 | 31 | defp process_html(flow) do 32 | flow 33 | |> Flow.assign(dom: &Floki.parse(&1[:html])) 34 | |> Flow.assign(title: &HTML.title(&1[:dom])) 35 | |> Flow.assign(description: &HTML.description(&1[:dom])) 36 | |> Flow.assign(icon_url: &HTML.icon_url(&1[:dom], &1[:url])) 37 | |> Flow.assign(feed_urls: &HTML.feed_urls(&1[:dom], &1[:url])) 38 | |> Flow.finish([:url, :title, :description, :icon_url, :feed_urls]) 39 | 
end 40 | end 41 | -------------------------------------------------------------------------------- /lib/scrape/flow/feed.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.Feed do 2 | @moduledoc false 3 | 4 | alias Scrape.Flow 5 | alias Scrape.IR.Feed 6 | 7 | def from_url(url, opts \\ []) do 8 | Flow.start(opts) 9 | |> Flow.assign(url: url) 10 | |> Flow.assign(xml: &Scrape.Source.HTTP.get!(&1[:url])) 11 | |> process_xml() 12 | end 13 | 14 | def from_file(path, opts \\ []) do 15 | Flow.start(opts) 16 | |> Flow.assign(path: path) 17 | |> Flow.assign(url: nil) 18 | |> Flow.assign(xml: &Scrape.Source.Disk.get!(&1[:path])) 19 | |> process_xml() 20 | end 21 | 22 | def from_string(xml, opts \\ []) do 23 | Flow.start(opts) 24 | |> Flow.assign(xml: xml) 25 | |> Flow.assign(url: nil) 26 | |> process_xml() 27 | end 28 | 29 | defp process_xml(%{assigns: %{xml: nil}}), do: {:error, :xml_invalid} 30 | 31 | defp process_xml(%{assigns: %{xml: ""}}), do: {:error, :xml_invalid} 32 | 33 | defp process_xml(flow) do 34 | flow 35 | |> Flow.assign(tree: &Scrape.Tools.Tree.from_xml_string(&1[:xml])) 36 | |> Flow.assign(title: &Feed.title(&1[:tree])) 37 | |> Flow.assign(description: &Feed.description(&1[:tree])) 38 | |> Flow.assign(website_url: &Feed.website_url(&1[:tree])) 39 | |> Flow.assign(items: &items/1) 40 | |> Flow.finish([:url, :title, :description, :website_url, :items]) 41 | end 42 | 43 | defp items(%{tree: tree, url: url}) do 44 | tree 45 | |> Feed.items() 46 | |> Enum.map(fn item -> Scrape.Flow.FeedItem.from_tree(item, url) end) 47 | |> Enum.filter(fn {status, _} -> status == :ok end) 48 | |> Enum.map(&elem(&1, 1)) 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /lib/scrape/flow/feed_item.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.FeedItem do 2 | @moduledoc false 3 | 4 | alias Scrape.Flow 5 | alias Scrape.IR.FeedItem, as: Item 6 | 7 | def from_tree(tree, url, opts \\ []) do 8 | Flow.start(opts) 9 | |> Flow.assign(tree: tree) 10 | |> Flow.assign(url: url) 11 | |> Flow.assign(title: &Item.title(&1[:tree])) 12 | |> Flow.assign(description: &Item.description(&1[:tree])) 13 | |> Flow.assign(article_url: &Item.article_url(&1[:tree], &1[:url])) 14 | |> Flow.assign(tags: &Item.tags(&1[:tree])) 15 | |> Flow.assign(author: &Item.author(&1[:tree])) 16 | |> Flow.assign(image_url: &Item.image_url(&1[:tree], &1[:url])) 17 | |> Flow.finish([:title, :description, :article_url, :tags, :author, :image_url]) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/scrape/ir/feed.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.Feed do 2 | @moduledoc """ 3 | Information Retrieval implementations to extract data from feeds (RSS or Atom). 4 | 5 | Makes intensive use of `Scrape.Tools.Tree` and its functions to operate on 6 | nested maps instead of raw XML documents. 7 | """ 8 | 9 | alias Scrape.Tools.Tree 10 | alias Scrape.Tools.URL 11 | 12 | @doc """ 13 | Extract the (best) title from the feed. 
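For orientation before the selector details below, a hedged sketch of how the feed flow composes end to end (the RSS snippet is invented; the field values follow from the extractors in this module):

    xml = "<rss><channel><title>Example</title><item><title>First</title></item></channel></rss>"

    {:ok, feed} = Scrape.Flow.Feed.from_string(xml)
    feed.title # => "Example"
    feed.items # => one map per <item>, each built via Scrape.Flow.FeedItem.from_tree/2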
14 | 15 | ## Example 16 | iex> Feed.title("<feed><title>abc</title></feed>") 17 | "abc" 18 | """ 19 | 20 | @spec title(String.t() | map()) :: nil | String.t() | map() 21 | 22 | def title(feed) when is_binary(feed) do 23 | feed |> Tree.from_xml_string() |> title() 24 | end 25 | 26 | def title(feed) when is_map(feed) do 27 | Tree.first(feed, ["rss.channel.title", "feed.title"]) 28 | end 29 | 30 | @doc """ 31 | Extract the (best) description from the feed. 32 | 33 | ## Example 34 | iex> Feed.description("<feed><subtitle>abc</subtitle></feed>") 35 | "abc" 36 | """ 37 | 38 | @spec description(String.t() | map()) :: nil | String.t() | map() 39 | 40 | def description(feed) when is_binary(feed) do 41 | feed |> Tree.from_xml_string() |> description() 42 | end 43 | 44 | def description(feed) when is_map(feed) do 45 | Tree.first(feed, [ 46 | "rss.channel.description", 47 | "rss.channel.subtitle", 48 | "feed.description", 49 | "feed.subtitle" 50 | ]) 51 | end 52 | 53 | @doc """ 54 | Extract the website_url from the feed. 55 | 56 | ## Example 57 | iex> Feed.website_url("<rss><channel><link>http://example.com</link></channel></rss>") 58 | "http://example.com" 59 | """ 60 | 61 | @spec website_url(String.t() | map()) :: nil | String.t() | map() 62 | 63 | def website_url(feed) when is_binary(feed) do 64 | feed |> Tree.from_xml_string() |> website_url() 65 | end 66 | 67 | def website_url(feed) when is_map(feed) do 68 | feed 69 | |> Tree.first(["rss.channel.link", "feed.link.href"]) 70 | |> normalize() 71 | end 72 | 73 | defp normalize(nil), do: nil 74 | defp normalize(""), do: nil 75 | defp normalize(url), do: url |> URL.base() 76 | 77 | @doc """ 78 | Returns the list of all feed items. 79 | 80 | ## Example 81 | iex> Feed.items("<rss><channel><item><title>abc</title></item></channel></rss>") 82 | [%{"title" => "abc"}] 83 | """ 84 | 85 | @spec items(String.t() | map()) :: nil | [map()] 86 | 87 | def items(feed) when is_binary(feed) do 88 | feed |> Tree.from_xml_string() |> items() 89 | end 90 | 91 | def items(feed) when is_map(feed) do 92 | Tree.find_all(feed, ["feed.entry", "rss.channel.item"]) 93 | end 94 | end 95 | -------------------------------------------------------------------------------- /lib/scrape/ir/feed_item.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.FeedItem do 2 | @moduledoc """ 3 | Similar to (and used by) `Scrape.IR.Feed`, but has specialized selectors 4 | to extract data from feed items/entries. 5 | """ 6 | 7 | alias Scrape.Tools.Tree 8 | alias Scrape.Tools.URL 9 | 10 | @doc """ 11 | Extract the (best) title from the feed item. 12 | 13 | ## Example 14 | iex> FeedItem.title("<title>abc</title>") 15 | "abc" 16 | """ 17 | 18 | @spec title(String.t() | map()) :: nil | String.t() 19 | 20 | def title(tree) when is_binary(tree) do 21 | tree |> Tree.from_xml_string() |> title() 22 | end 23 | 24 | def title(tree) when is_map(tree) do 25 | tree 26 | |> Tree.first(["title"]) 27 | |> normalize_to_string() 28 | end 29 | 30 | @doc """ 31 | Extract the (best) description from the feed item. 32 | 33 | ## Example 34 | iex> FeedItem.description("<description>abc</description>") 35 | "abc" 36 | """ 37 | 38 | @spec description(String.t() | map()) :: nil | String.t() 39 | 40 | def description(tree) when is_binary(tree) do 41 | tree |> Tree.from_xml_string() |> description() 42 | end 43 | 44 | def description(tree) when is_map(tree) do 45 | tree 46 | |> Tree.first(["description", "summary", "content"]) 47 | |> normalize_to_string() 48 | end 49 | 50 | @doc """ 51 | Extract the article_url from the feed item. 
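The selector lists passed to `Tree.first/2` are what let one accessor handle both RSS and Atom; a minimal illustration (XML snippets invented, assuming the `Tree` behavior shown in its own doctests):

    alias Scrape.Tools.Tree

    rss = Tree.from_xml_string("<rss><channel><title>a</title></channel></rss>")
    atom = Tree.from_xml_string("<feed><title>b</title></feed>")

    # Tree.first/2 tries the selectors in order and stops at the first hit
    Tree.first(rss, ["rss.channel.title", "feed.title"])  # => "a"
    Tree.first(atom, ["rss.channel.title", "feed.title"]) # => "b"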
52 | 53 | ## Example 54 | iex> FeedItem.article_url("<link href='http://example.com'/>") 55 | "http://example.com" 56 | 57 | iex> FeedItem.article_url("<link href='/url'/>", "http://example.com") 58 | "http://example.com/url" 59 | """ 60 | 61 | @spec article_url(String.t() | map(), nil | String.t()) :: nil | String.t() 62 | 63 | def article_url(tree, url \\ "") 64 | 65 | def article_url(tree, url) when is_binary(tree) do 66 | tree |> Tree.from_xml_string() |> article_url(url) 67 | end 68 | 69 | def article_url(tree, url) when is_map(tree) do 70 | tree 71 | |> Tree.first(["link.href", "link"]) 72 | |> normalize_to_string() 73 | |> normalize_url(url) 74 | end 75 | 76 | @doc """ 77 | Extract the possible tags from the feed item. 78 | 79 | ## Example 80 | iex> FeedItem.tags("<category>abc</category>") 81 | ["abc"] 82 | 83 | iex> FeedItem.tags("") 84 | [] 85 | """ 86 | 87 | @spec tags(String.t() | map()) :: [String.t()] 88 | 89 | def tags(tree) when is_binary(tree) do 90 | tree |> Tree.from_xml_string() |> tags() 91 | end 92 | 93 | def tags(tree) when is_map(tree) do 94 | tree 95 | |> Tree.find("category") 96 | |> List.wrap() 97 | |> Enum.map(&normalize_to_string/1) 98 | |> Enum.reject(&is_nil/1) 99 | |> Enum.map(&Scrape.IR.Text.clean/1) 100 | |> Enum.map(&String.downcase/1) 101 | end 102 | 103 | @doc """ 104 | Extract the author from the feed item. 105 | 106 | ## Example 107 | iex> FeedItem.author("<author>abc</author>") 108 | "abc" 109 | """ 110 | 111 | @spec author(String.t() | map()) :: nil | String.t() 112 | 113 | def author(tree) when is_binary(tree) do 114 | tree |> Tree.from_xml_string() |> author() 115 | end 116 | 117 | def author(tree) when is_map(tree) do 118 | tree 119 | |> Tree.first(["~creator", "author.name", "author"]) 120 | |> normalize_to_string() 121 | end 122 | 123 | @doc """ 124 | Extract the image_url from the feed item. 125 | 126 | ## Example 127 | iex> FeedItem.image_url("<enclosure url='abc' />") 128 | "abc" 129 | """ 130 | 131 | @spec image_url(String.t() | map(), nil | String.t()) :: nil | String.t() 132 | 133 | def image_url(tree, url \\ "") 134 | 135 | def image_url(tree, url) when is_binary(tree) do 136 | tree |> Tree.from_xml_string() |> image_url(url) 137 | end 138 | 139 | def image_url(tree, url) when is_map(tree) do 140 | tree 141 | |> Tree.first(["enclosure.url", "media.content"]) 142 | |> normalize_to_string() 143 | |> inline_image(tree) 144 | |> normalize_url(url) 145 | end 146 | 147 | defp inline_image(nil, %{"content" => content}) do 148 | rx = ~r/\ssrc=["']*(([^'"\s]+)\.(jpe?g)|(png))["'\s]/i 149 | 150 | case Regex.run(rx, content, capture: :all_but_first) do 151 | [match] when is_binary(match) -> match 152 | [match | _] when is_binary(match) -> match 153 | _ -> nil 154 | end 155 | end 156 | 157 | defp inline_image(img, _), do: img 158 | 159 | # ensure that a value is either a string or nil, but nothing else 160 | defp normalize_to_string(value) when is_binary(value), do: value 161 | defp normalize_to_string(_), do: nil 162 | 163 | # merge a relative url into an absolute url if possible 164 | defp normalize_url(link, url) when is_binary(url), do: URL.merge(link, url) 165 | defp normalize_url(link, _), do: link 166 | end 167 | -------------------------------------------------------------------------------- /lib/scrape/ir/html.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.HTML do 2 | @moduledoc """ 3 | Information Retrieval functions for extracting data out of HTML documents. 4 | 5 | Makes extensive use of `Scrape.Tools.DOM` under the hood, so a customized 6 | jQuery-like approach can be taken. 
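Before the individual extractors, a hedged taste of that jQuery-like style (markup invented; the suffix stripping is performed by `strip_suffix/1` defined below):

    dom = Scrape.Tools.DOM.from_string("<title>Breaking News | Example Portal</title>")
    Scrape.IR.HTML.title(dom)
    # => "Breaking News", the "| Example Portal" branding suffix is cut off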
7 | """ 8 | 9 | alias Scrape.Tools.DOM 10 | alias Scrape.Tools.URL 11 | 12 | @doc """ 13 | Extract the best possible title from an HTML document (string or DOM) or nil. 14 | 15 | ## Examples 16 | iex> HTML.title("") 17 | nil 18 | 19 | iex> HTML.title("<title>abc</title>") 20 | "abc" 21 | """ 22 | 23 | @spec title(DOM.dom()) :: nil | String.t() 24 | 25 | @title_queries [ 26 | {"meta[property='og:title']", "content"}, 27 | {"meta[property='twitter:title']", "content"}, 28 | {"h1"}, 29 | {"title"} 30 | ] 31 | 32 | def title(dom) do 33 | case DOM.first(dom, @title_queries) do 34 | nil -> nil 35 | match -> strip_suffix(match) 36 | end 37 | end 38 | 39 | defp strip_suffix(value) do 40 | rx = ~r/\s[|-].{1}.+$/ 41 | 42 | case String.match?(value, rx) do 43 | true -> value |> String.split(rx) |> List.first() 44 | false -> value 45 | end 46 | end 47 | 48 | @doc """ 49 | Extract the best possible description from an HTML document or nil. 50 | 51 | ## Examples 52 | iex> HTML.description("") 53 | nil 54 | 55 | iex> HTML.description("<meta name='description' content='abc' />") 56 | "abc" 57 | """ 58 | 59 | @spec description(DOM.dom() | String.t()) :: nil | String.t() 60 | 61 | @description_queries [ 62 | {"meta[property='og:description']", "content"}, 63 | {"meta[name='twitter:description']", "content"}, 64 | {"meta[name='description']", "content"} 65 | ] 66 | 67 | def description(dom) do 68 | DOM.first(dom, @description_queries) 69 | end 70 | 71 | @doc """ 72 | Attempts to find the best image_url for the website or nil. 73 | 74 | If a root url is given, will transform relative images to absolute urls. 75 | 76 | ## Examples 77 | iex> HTML.image_url("") 78 | nil 79 | 80 | iex> HTML.image_url("<meta property='og:image' content='img.jpg' />") 81 | "img.jpg" 82 | """ 83 | @spec image_url(DOM.dom()) :: nil | String.t() 84 | @spec image_url(DOM.dom(), String.t()) :: nil | String.t() 85 | 86 | @image_url_queries [ 87 | {"meta[property='og:image']", "content"}, 88 | {"meta[name='twitter:image']", "content"} 89 | ] 90 | 91 | def image_url(dom, url \\ "") do 92 | case DOM.first(dom, @image_url_queries) do 93 | nil -> nil 94 | match -> URL.merge(match, url) 95 | end 96 | end 97 | 98 | @doc """ 99 | Attempts to find something resembling a favicon url or nil. 100 | 101 | If a root url is given, will transform relative images to absolute urls. 102 | 103 | ## Examples 104 | iex> HTML.icon_url("") 105 | nil 106 | 107 | iex> HTML.icon_url("<link rel='icon' href='img.jpg' />") 108 | "img.jpg" 109 | """ 110 | 111 | @spec icon_url(DOM.dom()) :: nil | String.t() 112 | @spec icon_url(DOM.dom(), String.t()) :: nil | String.t() 113 | 114 | @icon_url_queries [ 115 | {"link[rel='apple-touch-icon']", "href"}, 116 | {"link[rel='apple-touch-icon-precomposed']", "href"}, 117 | {"link[rel='shortcut icon']", "href"}, 118 | {"link[rel='icon']", "href"} 119 | ] 120 | 121 | def icon_url(dom, url \\ "") do 122 | case DOM.first(dom, @icon_url_queries) do 123 | nil -> nil 124 | match -> URL.merge(match, url) 125 | end 126 | end 127 | 128 | @doc """ 129 | Attempts to fetch all possible feed_urls from the given HTML document. 
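Two sources feed into this list, as the private helpers below show: `<link>` meta tags and a regex sweep over inline hrefs. A hedged sketch (markup invented):

    html =
      "<head><link rel='alternate' type='application/rss+xml' href='/feed.rss'/></head>" <>
      "<body><a href='/blog/feed.xml'>RSS</a></body>"

    html
    |> Floki.parse()
    |> HTML.feed_urls("https://example.com")
    # => candidates from both sources, absolutized and deduplicated, roughly:
    #    ["https://example.com/feed.rss", "https://example.com/blog/feed.xml"]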
130 | 131 | ## Examples 132 | iex> HTML.feed_urls("") 133 | [] 134 | 135 | iex> HTML.feed_urls("<link rel='alternate' href='/feed.rss' />") 136 | ["/feed.rss"] 137 | """ 138 | 139 | @spec feed_urls(DOM.dom()) :: [String.t()] 140 | @spec feed_urls(DOM.dom(), String.t()) :: [String.t()] 141 | 142 | def feed_urls(dom, url \\ "") do 143 | list = feed_meta_tag(dom) ++ feed_inline(dom) 144 | 145 | list 146 | |> Enum.filter(&URL.is_http?(&1)) 147 | |> Enum.map(&URL.merge(&1, url)) 148 | |> Enum.uniq() 149 | end 150 | 151 | defp feed_meta_tag(dom) do 152 | selector = """ 153 | link[type='application/rss+xml'], 154 | link[type='application/atom+xml'], 155 | link[rel='alternate'] 156 | """ 157 | 158 | DOM.attrs(dom, selector, "href") 159 | end 160 | 161 | defp feed_inline(dom) do 162 | rx = ~r{href=['"]([^'"]*(rss|atom|feed|xml)[^'"]*)['"]} 163 | str = Floki.raw_html(dom) 164 | matches = Regex.scan(rx, str, capture: :all_but_first) 165 | Enum.map(matches, &List.first/1) 166 | end 167 | 168 | @doc """ 169 | Try to extract the semantically relevant part from a given document. 170 | 171 | 172 | Uses the [Readability](https://hex.pm/packages/readability) algorithm, which 173 | might fail sometimes. Ideally, it returns a single string containing full 174 | sentences. Remember that this method uses a few heuristics that *somehow* 175 | work together nicely in many cases, but nothing more. 176 | """ 177 | 178 | def simple(dom) do 179 | try do 180 | dom 181 | |> Floki.raw_html() 182 | |> Readability.article() 183 | |> Readability.readable_html() 184 | |> String.replace(~r/<a[^>]*>(.*?)<\/a>/, "\\1") 185 | rescue 186 | _ -> nil 187 | end 188 | end 189 | 190 | @doc """ 191 | Try to extract the relevant text content from a given document. 192 | 193 | Uses the [Readability](https://hex.pm/packages/readability) algorithm, which 194 | might fail sometimes. Ideally, it returns a single string containing full 195 | sentences. Remember that this method uses a few heuristics that *somehow* 196 | work together nicely in many cases, but nothing more. 197 | """ 198 | 199 | @spec content(DOM.dom()) :: nil | String.t() 200 | 201 | def content(dom) do 202 | try do 203 | dom 204 | |> Readability.article() 205 | |> Floki.filter_out("figure") 206 | |> Readability.readable_text() 207 | |> String.replace(~r/\s+/, " ") 208 | |> String.replace(~r/(\s\S+[a-zäöüß]+)([A-ZÄÖÜ]\S+\s)/u, "\\1. \\2") 209 | rescue 210 | _ -> nil 211 | end 212 | end 213 | 214 | @doc """ 215 | Convenient fallback function if `content/1` didn't work. Uses `paragraphs/1` 216 | under the hood. 217 | """ 218 | 219 | @spec sentences(DOM.dom()) :: nil | String.t() 220 | 221 | def sentences(dom) do 222 | case paragraphs(dom) do 223 | [] -> nil 224 | list -> Enum.join(list, ".\n\n") 225 | end 226 | end 227 | 228 | @doc """ 229 | Attempt to find the most meaningful content snippets in the HTML document. 230 | 231 | Can be used as a fallback algorithm if `content/1` did return nil but *some* 232 | text corpus is needed to work with. 233 | 234 | A text paragraph is relevant if it has a minimum amount of characters and 235 | contains any indicators of a sentence-like structure. 236 | Very naive approach, but works surprisingly well so far. 
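To make the fallback chain concrete: the article flow tries `content/1` first and degrades to `sentences/1`, which builds on `paragraphs/1`. A hedged sketch (markup invented):

    html = "<body><div>Too short.</div><div>This paragraph is long enough to matter. It has sentence structure, too.</div></body>"

    HTML.paragraphs(html)
    # => ["This paragraph is long enough to matter. It has sentence structure, too."]
    # the first div fails the >30 chars check, the second contains ". " and passes

    # which is exactly how the article flow wires it up:
    text = HTML.content(html) || HTML.sentences(html)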
237 | """ 238 | 239 | @spec paragraphs(DOM.dom()) :: [String.t()] 240 | 241 | def paragraphs(dom) do 242 | dom 243 | |> Floki.find("article, p, div, body") 244 | |> Enum.map(&Floki.text(&1, deep: false)) 245 | |> Enum.map(&Scrape.IR.Text.normalize_whitespace/1) 246 | |> Enum.filter(&paragraph_is_relevant?/1) 247 | end 248 | 249 | defp paragraph_is_relevant?(paragraph) do 250 | String.length(paragraph) > 30 && 251 | String.contains?(paragraph, [". ", "? ", "! ", "\" ", "\", ", ": "]) 252 | end 253 | end 254 | -------------------------------------------------------------------------------- /lib/scrape/ir/text.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.Text do 2 | @moduledoc """ 3 | Collection of text mining algorithms, like summarization, classification and 4 | clustering. 5 | 6 | Details are hidden within the algorithms, so a clean interface can be provided. 7 | """ 8 | 9 | alias Scrape.IR.Text.TFIDF 10 | alias Scrape.Tools.Word 11 | 12 | @doc false 13 | def generate_summary(text) do 14 | # TODO: my markov chain implementation belongs here. 15 | text 16 | end 17 | 18 | @doc """ 19 | Dissect a text into sentences, weight their stemmed keywords against each other and 20 | return the 3 semantically most important sentences. 21 | """ 22 | 23 | def extract_summary(text, start_words, language \\ :en) do 24 | text 25 | |> TFIDF.generate_database(language) 26 | |> TFIDF.query(start_words) 27 | end 28 | 29 | @doc """ 30 | Find out in which natural language the given text is written. 31 | 32 | Currently only German and (fallback) English are valid results. Uses the external 33 | library [Paasaa](https://hex.pm/packages/paasaa). 34 | 35 | ## Example 36 | iex> Scrape.IR.Text.detect_language("the quick brown fox jumps over...") 37 | :en 38 | 39 | iex> Scrape.IR.Text.detect_language("Es ist ein schönes Wetter heute...") 40 | :de 41 | """ 42 | 43 | @spec detect_language(String.t()) :: :de | :en 44 | 45 | def detect_language(text) do 46 | case Paasaa.detect(text) do 47 | "deu" -> :de 48 | _ -> :en 49 | end 50 | end 51 | 52 | @doc """ 53 | Remove all occurrences of javascript from an HTML snippet. 54 | 55 | Uses a regex (!) 56 | 57 | ## Example 58 | iex> Scrape.IR.Text.without_js("a<script>b</script>c") 59 | "ac" 60 | """ 61 | 62 | @spec without_js(String.t()) :: String.t() 63 | 64 | def without_js(text) do 65 | rx = ~r/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/i 66 | String.replace(text, rx, "") 67 | end 68 | 69 | @doc """ 70 | Strip all HTML tags from a text. 71 | 72 | ## Example 73 | iex> Scrape.IR.Text.without_html("
<div>stuff</div>
") 74 | "stuff" 75 | """ 76 | 77 | @spec without_html(String.t()) :: String.t() 78 | 79 | def without_html(text) do 80 | text 81 | |> Floki.parse() 82 | |> Floki.text() 83 | end 84 | 85 | @doc """ 86 | A text paragraph shall not include any whitespace except single spaces 87 | between words. 88 | 89 | ## Example 90 | iex> Scrape.IR.Text.normalize_whitespace("\\r\\thello world\\r ") 91 | "hello world" 92 | """ 93 | 94 | @spec normalize_whitespace(String.t()) :: String.t() 95 | 96 | def normalize_whitespace(text) do 97 | text 98 | |> String.replace(~r/\s+/, " ") 99 | |> String.replace(~r/\s+/, " ") 100 | |> String.trim() 101 | end 102 | 103 | @doc """ 104 | Removes all junk from a given text, like javascript, html or mixed whitespace. 105 | 106 | ## Example 107 | iex> Scrape.IR.Text.clean("\\t hello, \\rworld!") 108 | "hello, world!" 109 | """ 110 | def clean(text) do 111 | text 112 | |> without_js() 113 | |> without_html() 114 | |> normalize_whitespace() 115 | end 116 | 117 | @doc """ 118 | Dissect a text into word tokens. 119 | 120 | The resulting list is a list of downcased words with all non-word-characters 121 | stripped. 122 | 123 | ## Examples 124 | iex> Scrape.IR.Text.tokenize("Hello, world!") 125 | ["hello", "world"] 126 | """ 127 | 128 | @spec tokenize(String.t()) :: [String.t()] 129 | 130 | def tokenize(text) do 131 | text 132 | |> String.replace(~r/[^\w\s]/u, " ") 133 | |> normalize_whitespace() 134 | |> String.downcase() 135 | |> String.split() 136 | end 137 | 138 | @doc """ 139 | Dissect a text into word tokens. 140 | 141 | The resulting list is a list of downcased words with all non-word-characters 142 | stripped, but common phrase delimiters still included. 143 | 144 | ## Examples 145 | iex> Scrape.IR.Text.tokenize_preserve_delimiters("Hello, world!") 146 | ["hello", ",", "world", "!"] 147 | """ 148 | 149 | @spec tokenize_preserve_delimiters(String.t()) :: [String.t()] 150 | 151 | def tokenize_preserve_delimiters(text) do 152 | text 153 | |> String.replace(~r/([,\.\!\?])/u, " \\1 ") 154 | |> String.replace(~r/[^\w\s,\.\!\?]/u, " ") 155 | |> normalize_whitespace() 156 | |> String.downcase() 157 | |> String.split() 158 | end 159 | 160 | @doc """ 161 | Dissect a text into word tokens similar to `tokenize/1` but strips words 162 | that carry no semantic value. 163 | 164 | ## Examples 165 | iex> Scrape.IR.Text.semantic_tokenize("A beautiful day!", :en) 166 | ["beautiful", "day"] 167 | """ 168 | 169 | @spec semantic_tokenize(String.t(), :de | :en) :: [String.t()] 170 | 171 | def semantic_tokenize(text, language \\ :en) do 172 | text 173 | |> tokenize() 174 | |> Enum.filter(fn word -> Word.is_meaningful?(word, language) end) 175 | end 176 | 177 | @doc """ 178 | Similar to `semantic_tokenize/2`, but also determines the n (default: 20) 179 | most relevant **stemmed** tokens from the list. 
180 | """ 181 | 182 | def semantic_keywords(text, n \\ 20, language \\ :en) do 183 | text 184 | |> semantic_tokenize(language) 185 | |> Enum.map(&Word.stem(&1, language)) 186 | |> Enum.reduce(%{}, &aggregate_word_scores/2) 187 | |> Map.to_list() 188 | |> Enum.sort_by(fn {_word, score} -> score end, &>=/2) 189 | |> Enum.take(n) 190 | |> Enum.map(&elem(&1, 0)) 191 | end 192 | 193 | defp aggregate_word_scores(word, acc) do 194 | existing = Map.get(acc, word, 0) 195 | Map.put(acc, word, existing + 1) 196 | end 197 | end 198 | -------------------------------------------------------------------------------- /lib/scrape/ir/text/rake.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.Text.RAKE do 2 | alias Scrape.IR.Text 3 | alias Scrape.Tools.Word 4 | 5 | def sample_text() do 6 | """ 7 | Compatibility of systems of linear constraints over the set of natural numbers 8 | 9 | Criteria of compatibility of a system of linear Diophantine equations, strict inequations, 10 | and nonstrict inequations are considered. Upper bounds for components of a minimal set of 11 | solutions and algorithms of construction of minimal generating sets of solutions for all 12 | types of systems are given. These criteria and the corresponding algorithms for constructing 13 | a minimal supporting set of solutions can be used in solving all the considered types of 14 | systems and systems of mixed types. 15 | """ 16 | end 17 | 18 | def execute(text, language \\ :en) do 19 | text 20 | |> Text.tokenize_preserve_delimiters() 21 | |> calculate_candidates(language) 22 | |> calculate_keyword_scores() 23 | end 24 | 25 | defp calculate_candidates(tokens, language) do 26 | calculate_candidates(tokens, language, [], []) 27 | end 28 | 29 | defp calculate_candidates([], _language, candidates, current_candidate) do 30 | candidates 31 | |> Kernel.++([current_candidate]) 32 | |> List.flatten() 33 | |> Enum.filter(&(String.length(String.trim(&1)) > 0)) 34 | end 35 | 36 | defp calculate_candidates([token | tokens], language, candidates, current_candidate) do 37 | if Word.is_stopword?(token, language) || token in [",", ".", "?", "!"] do 38 | calculate_candidates( 39 | tokens, 40 | language, 41 | candidates ++ [current_candidate |> Enum.join(" ")], 42 | [] 43 | ) 44 | else 45 | calculate_candidates(tokens, language, candidates, current_candidate ++ [token]) 46 | end 47 | end 48 | 49 | defp calculate_keyword_scores(candidates) do 50 | words = candidates |> Enum.map(&String.split(&1, " ")) |> List.flatten() |> Enum.uniq() 51 | len = length(words) 52 | 53 | word_index = 54 | 0..len |> Stream.zip(words) |> Enum.map(fn {k, v} -> {v, k} end) |> Enum.into(%{}) 55 | 56 | table = :ets.new(:co_occurence_matrix, [:set]) 57 | 58 | for candidate <- candidates do 59 | chunks = String.split(candidate, " ") 60 | 61 | Enum.each(chunks, fn chunk -> 62 | i = word_index[chunk] 63 | value = matrix_value(table, i, i) 64 | :ets.insert(table, {{i, i}, value + 1}) 65 | end) 66 | 67 | if length(chunks) > 1 do 68 | Enum.each(permutations(chunks), fn words -> 69 | i1 = word_index[Enum.at(words, 0)] 70 | i2 = word_index[Enum.at(words, 1)] 71 | value = matrix_value(table, i1, i2) 72 | :ets.insert(table, {{i1, i2}, value + 1}) 73 | end) 74 | end 75 | end 76 | 77 | word_scores = 78 | words 79 | |> Enum.map(fn word -> {word, matrix_row(table, word_index[word], len)} end) 80 | |> Enum.into(%{}) 81 | 82 | candidates 83 | |> Enum.uniq() 84 | |> Enum.map(fn candidate -> 85 | chunks = candidate |> String.split(" ") 86 | 
score = chunks |> Enum.map(&word_scores[&1]) |> Enum.sum() 87 | {candidate, score} 88 | end) 89 | |> Enum.sort_by(fn {_candidate, score} -> score end, &>=/2) 90 | |> Enum.take(length(words) |> Integer.floor_div(3)) 91 | |> Enum.map(fn {candidate, _score} -> candidate end) 92 | end 93 | 94 | defp matrix_value(table, i1, i2) do 95 | case :ets.lookup(table, {i1, i2}) do 96 | [] -> 0 97 | [{_, value}] -> value 98 | end 99 | end 100 | 101 | defp matrix_row(table, index, max) do 102 | 0..max 103 | |> Enum.map(fn i -> matrix_value(table, i, index) end) 104 | |> Enum.sum() 105 | end 106 | 107 | defp permutations(list), do: for(x <- list, y <- list, x != y, do: [x, y]) 108 | end 109 | -------------------------------------------------------------------------------- /lib/scrape/ir/text/tfidf.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.Text.TFIDF do 2 | @moduledoc false 3 | 4 | defstruct [:text, :language, :corpus, :idf] 5 | 6 | def generate_database(text, language) do 7 | %__MODULE__{text: text, language: language} 8 | |> create_corpus() 9 | |> tokenize_sentences() 10 | |> calculate_idf_scores() 11 | end 12 | 13 | def query(%__MODULE__{corpus: corpus} = data, words) do 14 | idf = 15 | words 16 | |> Enum.map(fn word -> {word, calculate_inverse_document_frequency(data, word)} end) 17 | |> Enum.into(%{}) 18 | 19 | find_id = fn excludes -> find_best_sentence_id(corpus, idf, words, excludes) end 20 | s1_id = find_id.([]) 21 | s1_sentence = corpus[s1_id] 22 | 23 | s2_id = find_id.(s1_sentence.words) 24 | s2_sentence = corpus[s2_id] 25 | 26 | s3_id = find_id.(List.flatten([s1_sentence.words, s2_sentence.words])) 27 | s3_sentence = corpus[s3_id] 28 | 29 | [s1_sentence, s2_sentence, s3_sentence] 30 | |> Enum.map(fn %{sentence: sentence} -> sentence end) 31 | |> Enum.map(fn sentence -> sentence <> "." end) 32 | |> Enum.join(" ") 33 | end 34 | 35 | defp find_best_sentence_id(corpus, idf, words, blacklist) do 36 | for {id, %{tf: tf}} <- corpus do 37 | score = 38 | tf 39 | |> Enum.filter(fn {word, _} -> word in words end) 40 | |> Enum.filter(fn {word, _} -> word not in blacklist end) 41 | |> Enum.map(fn {word, value} -> value * idf[word] end) 42 | |> Enum.sum() 43 | 44 | {id, score} 45 | end 46 | |> Enum.sort_by(fn {_id, score} -> score end, &>=/2) 47 | |> List.first() 48 | |> elem(0) 49 | end 50 | 51 | defp create_corpus(%__MODULE__{text: text} = data) do 52 | corpus = 53 | text 54 | |> String.replace(~r/(\s\S+[a-zäöüß]+)([A-ZÄÖÜ]\S+\s)/u, "\\1. 
\\2") 55 | |> String.split(~r/[\?!\.\s]\s/) 56 | |> Enum.map(&String.trim/1) 57 | |> Enum.map(&String.replace(&1, ~r/\.+$/, "")) 58 | |> Enum.reject(fn sentence -> String.length(sentence) < 3 end) 59 | |> Enum.uniq() 60 | |> Enum.with_index() 61 | |> Enum.map(fn {sentence, i} -> {i, %{sentence: sentence}} end) 62 | |> Enum.into(%{}) 63 | 64 | %{data | corpus: corpus} 65 | end 66 | 67 | defp tokenize_sentences(%__MODULE__{corpus: corpus, language: language} = data) do 68 | updated_corpus = 69 | for {id, %{sentence: sentence} = document} <- corpus do 70 | words = tokenize(sentence, language) 71 | 72 | updated_document = 73 | document 74 | |> Map.put(:words, words) 75 | |> Map.put(:tf, calculate_term_frequency(words)) 76 | 77 | {id, updated_document} 78 | end 79 | |> Enum.into(%{}) 80 | 81 | %{data | corpus: updated_corpus} 82 | end 83 | 84 | defp calculate_idf_scores(%__MODULE__{corpus: corpus} = data) do 85 | idf = 86 | corpus 87 | |> Enum.map(fn {_id, %{words: words}} -> words end) 88 | |> List.flatten() 89 | |> Enum.uniq() 90 | |> Enum.map(fn word -> {word, calculate_inverse_document_frequency(data, word)} end) 91 | |> Enum.into(%{}) 92 | 93 | %{data | idf: idf} 94 | end 95 | 96 | defp calculate_term_frequency(list) do 97 | len = length(list) 98 | 99 | list 100 | |> Enum.group_by(& &1) 101 | |> Enum.map(fn {word, occurences} -> {word, length(occurences) / len} end) 102 | |> Enum.into(%{}) 103 | end 104 | 105 | defp calculate_inverse_document_frequency(%__MODULE__{corpus: corpus}, word) do 106 | num_docs = corpus |> Map.keys() |> length 107 | 108 | num_hits = 109 | corpus |> Map.values() |> Enum.filter(fn %{words: words} -> word in words end) |> length 110 | 111 | if num_hits == 0, do: 0, else: :math.log(num_docs / num_hits) 112 | end 113 | 114 | defp tokenize(str, language) do 115 | str 116 | |> String.replace(~r/[^\w\s]/u, "") 117 | |> Scrape.IR.Text.semantic_tokenize(language) 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /lib/scrape/options.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Options do 2 | @moduledoc false 3 | 4 | @defaults num_stems: 30 5 | 6 | def merge(opts \\ []) do 7 | Keyword.merge(@defaults, opts) 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /lib/scrape/source/disk.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Source.Disk do 2 | @moduledoc """ 3 | Abstraction over the native `File` functions. Currently without additional logic. 4 | """ 5 | 6 | @doc """ 7 | Same as `File.read/1`. 8 | """ 9 | def get(path) do 10 | File.read(path) 11 | end 12 | 13 | @doc """ 14 | Same as `File.read!/1`. 15 | """ 16 | def get!(path) do 17 | File.read!(path) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/scrape/source/http.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Source.HTTP do 2 | alias Scrape.Source.HTTP.Charset 3 | alias Scrape.Source.HTTP.Get 4 | alias Scrape.Source.HTTP.Transcode 5 | 6 | @doc """ 7 | Perform a HTTP GET request against the given url. 8 | 9 | This function is optimized for *text*-based data, not binary like images. 10 | It will try to normalize the response into valid utf-8 and transcode if needed. 11 | 12 | Everything that is not a status code 200 with valid encoding will result in 13 | some error object. 
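Callers should be prepared for all three result shapes of `get/1`; a hedged sketch (URL invented):

    case Scrape.Source.HTTP.get("http://example.com/feed.xml") do
      {:ok, body} -> body                   # status 200, body already transcoded to utf-8 if needed
      {:http_error, body} -> {:skip, body}  # any non-200 response
      {:error, reason} -> {:retry, reason}  # transport failure straight from HTTPoison
    end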
14 | 15 | ## Examples 16 | iex> HTTP.get("http://example.com") 17 | {:ok, "some response"} 18 | """ 19 | 20 | @spec get(String.t()) :: {:ok, String.t()} | {:error, any()} | {:http_error, any()} 21 | 22 | def get(url) do 23 | url |> Get.execute() |> evaluate() 24 | end 25 | 26 | @doc """ 27 | Same as `get/1`, but will raise if the result is not `:ok`. 28 | """ 29 | 30 | @spec get!(String.t()) :: String.t() 31 | 32 | def get!(url) do 33 | {:ok, data} = get(url) 34 | data 35 | end 36 | 37 | defp evaluate({:error, _} = response), do: response 38 | 39 | defp evaluate({:ok, %{status_code: 200, headers: headers, body: body}}) do 40 | case Charset.from_headers(headers) do 41 | nil -> {:ok, body} 42 | charset -> {:ok, Transcode.execute(charset, body)} 43 | end 44 | end 45 | 46 | defp evaluate({:ok, %{body: body}}), do: {:http_error, body} 47 | 48 | defp evaluate(response), do: response 49 | end 50 | -------------------------------------------------------------------------------- /lib/scrape/source/http/charset.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Source.HTTP.Charset do 2 | @moduledoc false 3 | 4 | def from_headers(headers) do 5 | header = 6 | headers 7 | |> Enum.filter(fn {k, _} -> k == "Content-Type" end) 8 | |> first 9 | 10 | if header do 11 | {_name, content} = header 12 | 13 | ~r/charset=(ISO-8859-[1-9])/i 14 | |> Regex.run(content, capture: :all_but_first) 15 | |> first 16 | else 17 | nil 18 | end 19 | end 20 | 21 | defp first([h | _]), do: h 22 | defp first(_), do: nil 23 | end 24 | -------------------------------------------------------------------------------- /lib/scrape/source/http/get.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Source.HTTP.Get do 2 | @moduledoc false 3 | 4 | @opts [ 5 | follow_redirect: true, 6 | timeout: 33_000, 7 | recv_timeout: 30_000, 8 | ssl: [{:versions, [:"tlsv1.2"]}] 9 | ] 10 | 11 | @headers [ 12 | "user-agent": 13 | "Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36", 14 | accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" 15 | ] 16 | 17 | def execute(url, http_headers \\ @headers, http_opts \\ @opts) do 18 | HTTPoison.get(url, http_headers, http_opts) 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/scrape/source/http/transcode.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Source.HTTP.Transcode do 2 | @moduledoc false 3 | 4 | def execute(charset, text) do 5 | encoding = charset_to_encoding(charset) 6 | {_status, result} = Codepagex.to_string(text, encoding) 7 | result 8 | end 9 | 10 | defp charset_to_encoding(charset) do 11 | charset 12 | |> String.replace("-", "_") 13 | |> String.downcase() 14 | |> to_charlist 15 | |> List.to_atom() 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/scrape/tools/dom.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.DOM do 2 | @moduledoc """ 3 | Utility module for selecting/extracting data from a "DOM" (HTML/XML tree-like 4 | structure). Can find text values and attribute values, inspired by jQuery and 5 | implemented with Floki. 6 | """ 7 | 8 | @typedoc """ 9 | DOM tree representation, same as Floki's html_tree. 10 | 11 | Can be created via `from_string/1`. 
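For orientation, a hedged mini-session with the helpers defined below (markup invented):

    dom = DOM.from_string("<html><head><meta name='a' content='b'/></head><body><h1>hi</h1></body></html>")

    DOM.text(dom, "h1")              # => "hi"
    DOM.attr(dom, "meta", "content") # => "b"
    DOM.first(dom, [{"h2"}, {"h1"}]) # => "hi", the first query that yields a value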
12 | """ 13 | 14 | @type dom :: String.t() | tuple() | [any()] 15 | 16 | @doc """ 17 | Create a DOM from a given (HTML/XML) string. 18 | 19 | ## Examples 20 | iex> DOM.from_string("") 21 | [] 22 | 23 | iex> DOM.from_string("<html></html>") 24 | {"html", [], []} 25 | """ 26 | 27 | @spec from_string(String.t()) :: dom 28 | 29 | def from_string(string) do 30 | Floki.parse(string) 31 | end 32 | 33 | @doc """ 34 | Builds a (HTML/XML) string from a DOM structure. 35 | 36 | ## Examples 37 | iex> DOM.to_string([]) 38 | "" 39 | 40 | iex> DOM.to_string({"html", [], []}) 41 | "<html></html>" 42 | """ 43 | 44 | @spec to_string(dom) :: String.t() 45 | 46 | def to_string(dom) do 47 | case dom do 48 | dom when is_tuple(dom) or is_list(dom) -> Floki.raw_html(dom) 49 | _ -> "" 50 | end 51 | end 52 | 53 | @doc """ 54 | Get the text value of a DOM node (including nested nodes). 55 | 56 | If many nodes match the selector, the first one is used. 57 | 58 | ## Examples 59 | iex> "<div>abc</div>" |> DOM.from_string() |> DOM.text("p") 60 | nil 61 | 62 | iex> "<div>abc</div>" |> DOM.from_string() |> DOM.text("div") 63 | "abc" 64 | """ 65 | 66 | @spec text(dom, String.t()) :: nil | String.t() 67 | 68 | def text(dom, selector) do 69 | dom 70 | |> Floki.find(selector) 71 | |> List.first() 72 | |> get_text() 73 | |> unwrap_string() 74 | end 75 | 76 | @doc """ 77 | Similar to `text/2` but iterates over all matching nodes. 78 | 79 | Always returns a list result, but with nil values filtered. 80 | 81 | ## Examples 82 | iex> "<div>abc</div>" |> DOM.from_string() |> DOM.texts("p") 83 | [] 84 | 85 | iex> "<div>abc</div>" |> DOM.from_string() |> DOM.texts("div") 86 | ["abc"] 87 | 88 | iex> "<div><p>a</p><p>b</p></div>" |> DOM.from_string() |> DOM.texts("p") 89 | ["a", "b"] 90 | """ 91 | 92 | @spec texts(dom, String.t()) :: [String.t()] 93 | 94 | def texts(dom, selector) do 95 | dom 96 | |> Floki.find(selector) 97 | |> Enum.map(&get_text/1) 98 | |> Enum.map(&unwrap_string/1) 99 | |> Enum.reject(&is_nil/1) 100 | |> List.wrap() 101 | end 102 | 103 | @doc """ 104 | Similar to `text/2` but returns a chosen attribute value instead of the 105 | node's text value (or nil). 106 | 107 | ## Examples 108 | iex> "<meta name='a' content='b'/>" |> DOM.from_string |> DOM.attr("meta", "unknown") 109 | nil 110 | 111 | iex> "<meta name='a' content='b'/>" |> DOM.from_string |> DOM.attr("meta", "content") 112 | "b" 113 | 114 | iex> "<meta name='a' content='b'/>" |> DOM.from_string |> DOM.attr("meta[name=a]", "content") 115 | "b" 116 | """ 117 | 118 | @spec attr(dom, String.t(), String.t()) :: nil | String.t() 119 | 120 | def attr(dom, selector, name) do 121 | dom 122 | |> Floki.find(selector) 123 | |> List.first() 124 | |> get_attr(name) 125 | |> unwrap_string() 126 | end 127 | 128 | @doc """ 129 | Similar to `attr/3` but returns a list of all matching results. 130 | 131 | ## Examples 132 | iex> "<div><p class='a'>b</p><p class='c'>b</p></div>" |> DOM.from_string() |> DOM.attrs("div", "class") 133 | [] 134 | 135 | iex> "<div><p class='a'>b</p><p class='c'>b</p></div>" |> DOM.from_string() |> DOM.attrs("p", "id") 136 | [] 137 | 138 | iex> "<div><p class='a'>b</p><p class='c'>b</p></div>" |> DOM.from_string() |> DOM.attrs("p", "class") 139 | ["a", "c"] 140 | """ 141 | 142 | @spec attrs(dom, String.t(), String.t()) :: [String.t()] 143 | 144 | def attrs(dom, selector, name) do 145 | dom 146 | |> Floki.find(selector) 147 | |> Enum.map(&get_attr(&1, name)) 148 | |> Enum.map(&unwrap_string/1) 149 | |> Enum.reject(&is_nil/1) 150 | end 151 | 152 | @doc """ 153 | Cascading query helper, applies either `text/2` or `attr/3` until something 154 | returns a non-nil result or all queries are tried. 155 | 156 | ## Examples 157 | iex> DOM.first([], []) 158 | nil 159 | 160 | iex> DOM.first([], [{"b"}, {"i"}, {"div", "class"}]) 161 | nil 162 | 163 | iex> "<div id='1'>abc</div>" |> DOM.from_string() |> DOM.first([{"i"}, {"div", "id"}]) 164 | "1" 165 | 166 | iex> "<b>abc</b>" |> DOM.from_string() |> DOM.first([{"i"}, {"b"}]) 167 | "abc" 168 | """ 169 | 170 | @spec first(dom, [{String.t()} | {String.t(), String.t()}]) :: nil | String.t() 171 | 172 | def first(_dom, []), do: nil 173 | 174 | def first(dom, [{selector} | queries]) do 175 | case text(dom, selector) do 176 | nil -> first(dom, queries) 177 | string -> string 178 | end 179 | end 180 | 181 | def first(dom, [{selector, name} | queries]) do 182 | case attr(dom, selector, name) do 183 | nil -> first(dom, queries) 184 | string -> string 185 | end 186 | end 187 | 188 | defp get_text(nil), do: "" 189 | defp get_text(value), do: Floki.text(value) 190 | 191 | defp get_attr(nil, _name), do: nil 192 | defp get_attr(elem, name), do: elem |> Floki.attribute(name) |> List.first() 193 | 194 | defp unwrap_string(value) when not is_binary(value), do: nil 195 | defp unwrap_string(""), do: nil 196 | defp unwrap_string(value), do: value 197 | end 198 | -------------------------------------------------------------------------------- /lib/scrape/tools/tree.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.Tree do 2 | @moduledoc """ 3 | Utility module for interacting with nested Map structures, here called "tree". 4 | """ 5 | 6 | @doc """ 7 | Transform a given xml string into a tree. 8 | 9 | The string must be utf-8 encoded. It will be sanitized via Floki and the xml 10 | declaration header will be stripped. 11 | 12 | ## Examples 13 | iex> Tree.from_xml_string("<feed><item>abc</item></feed>") 14 | %{"feed" => %{"item" => "abc"}} 15 | """ 16 | 17 | @spec from_xml_string(String.t()) :: map() 18 | 19 | def from_xml_string(nil), do: %{} 20 | 21 | def from_xml_string(xml) do 22 | xml 23 | |> String.replace(~r/<\?xml.*?>/i, "") 24 | |> Floki.parse() 25 | |> Floki.raw_html() 26 | |> String.trim() 27 | |> try_build_tree() 28 | |> try_normalize() 29 | end 30 | 31 | defp try_build_tree(""), do: %{} 32 | 33 | defp try_build_tree(xml) do 34 | try do 35 | # XMap.from_xml(xml) 36 | XmlToMap.naive_map(xml) 37 | rescue 38 | _ -> %{} 39 | end 40 | end 41 | 42 | defp try_normalize(map) do 43 | case Morphix.compactiform(map) do 44 | {:ok, tree} -> tree 45 | _ -> map 46 | end 47 | end 48 | 49 | @doc """ 50 | Attempts all queries until one returns a non-nil result, otherwise returns nil. 51 | 52 | ## Examples 53 | iex> Tree.first(%{"hello" => "world"}, ["unknown"]) 54 | nil 55 | 56 | iex> Tree.first(%{"hello" => "world"}, ["unknown", "hello"]) 57 | "world" 58 | """ 59 | 60 | @spec first(map(), [String.t()]) :: nil | any() 61 | 62 | def first(_tree, []), do: nil 63 | 64 | def first(tree, [selector | queries]) do 65 | case find(tree, selector) do 66 | nil -> first(tree, queries) 67 | [] -> first(tree, queries) 68 | [match] -> match 69 | [match | _] -> match 70 | match -> match 71 | end 72 | end 73 | 74 | @doc """ 75 | Applies `find/2` to all given selectors and combines the result. 
76 | 77 | ## Examples 78 | iex> Tree.find_all(%{"a" => "b", "c" => "d"}, ["a", "c"]) 79 | ["b", "d"] 80 | 81 | iex> Tree.find_all(%{"a" => "b", "c" => "d"}, ["a", "z"]) 82 | ["b"] 83 | 84 | iex> Tree.find_all(%{"a" => "b", "c" => "d"}, ["x", "y"]) 85 | [] 86 | """ 87 | 88 | @spec find_all(map(), [String.t()]) :: [any()] 89 | 90 | def find_all(_tree, []), do: [] 91 | 92 | def find_all(tree, selectors) do 93 | selectors 94 | |> Enum.map(&find(tree, &1)) 95 | |> normalize() 96 | end 97 | 98 | @doc """ 99 | Attempts to get a nested value from the tree using a string selector syntax. 100 | 101 | Returns nil if nothing matches the selector or all matching results. 102 | 103 | ## Examples 104 | iex> Tree.find(%{"a" => %{"b" => "c"}}, "a") 105 | %{"b" => "c"} 106 | 107 | iex> Tree.find(%{"a" => %{"b" => "c"}}, "a.b") 108 | "c" 109 | 110 | iex> Tree.find(%{"a" => %{"b" => "c"}}, "unknown") 111 | nil 112 | 113 | iex> Tree.find(%{"a" => [%{"b" => "c"}]}, "a.b") 114 | ["c"] 115 | 116 | iex> Tree.find(%{"a" => [%{"b" => [%{"c" => "d"}]}]}, "a.b.c") 117 | ["d"] 118 | 119 | iex> Tree.find(%{"a" => [%{"b" => "c"}, %{"b" => "c"}]}, "a.b") 120 | ["c", "c"] 121 | 122 | iex> Tree.find(%{"a" => [%{"b" => [%{"c" => "d"}]}]}, "a.*.c") 123 | ["d"] 124 | 125 | iex> Tree.find(%{"hello" => "world"}, "~ell") 126 | ["world"] 127 | """ 128 | 129 | @spec find(map(), String.t()) :: any() 130 | 131 | def find(tree, selector) when is_map(tree) and is_binary(selector) do 132 | tree 133 | |> pick(String.split(selector, ".")) 134 | |> normalize() 135 | end 136 | 137 | defp pick(nil, _), do: nil 138 | defp pick(n, []), do: n 139 | defp pick(n, keys) when is_list(n), do: Enum.map(n, &pick(&1, keys)) 140 | defp pick(n, _) when not is_map(n), do: nil 141 | defp pick(n, ["*" | t]), do: n |> Map.values() |> Enum.map(&pick(&1, t)) 142 | 143 | defp pick(n, ["~" <> pattern = _h | t]) do 144 | n 145 | |> Map.keys() 146 | |> Enum.filter(&String.contains?(&1, pattern)) 147 | |> Enum.map(&Map.get(n, &1)) 148 | |> Enum.map(&pick(&1, t)) 149 | end 150 | 151 | defp pick(n, [h | t]) do 152 | case Map.get(n, h) do 153 | nil -> nil 154 | sub -> pick(sub, t) 155 | end 156 | end 157 | 158 | defp normalize(value) when not is_list(value), do: value 159 | 160 | defp normalize(value) when is_list(value) do 161 | value 162 | |> List.flatten() 163 | |> Enum.reject(&is_nil/1) 164 | end 165 | end 166 | -------------------------------------------------------------------------------- /lib/scrape/tools/url.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.URL do 2 | @moduledoc """ 3 | Simple utility functions to extract information from URLs. 4 | """ 5 | 6 | @doc """ 7 | Rebase an URL to another root URL, useful for turning relative URLs into 8 | absolute ones. 9 | 10 | ## Example 11 | iex> URL.merge("/path", "http://example.com") 12 | "http://example.com/path" 13 | """ 14 | 15 | @spec merge(nil | String.t(), String.t()) :: nil | String.t() 16 | 17 | def merge(nil, _), do: nil 18 | 19 | def merge("", _), do: nil 20 | 21 | def merge(url, nil), do: url 22 | 23 | def merge(url, ""), do: url 24 | 25 | def merge(url, root_url) do 26 | root_url |> URI.merge(url) |> URI.to_string() 27 | end 28 | 29 | @doc """ 30 | Checks if a given string actually represents an URL. 
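These helpers are the glue that turns scraped hrefs into absolute URLs elsewhere in the codebase; a short hedged sketch:

    URL.is_http?("/feed.rss")                       # => true, absolute paths count as candidates
    URL.merge("/feed.rss", "https://example.com/x") # => "https://example.com/feed.rss"
    URL.base("https://example.com/deep/path?q=1")   # => "https://example.com"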
31 | 32 | ## Example 33 | iex> URL.is_http?("http://example.com") 34 | true 35 | 36 | iex> URL.is_http?("example") 37 | false 38 | """ 39 | 40 | @spec is_http?(String.t()) :: boolean() 41 | 42 | def is_http?(url) do 43 | ["http", "/"] 44 | |> Enum.any?(&String.starts_with?(url, &1)) 45 | end 46 | 47 | @doc """ 48 | Transforms a given url into its basic form, only including protocol scheme 49 | and host, without any other parts like path, query or hash. 50 | 51 | ## Examples 52 | iex> URL.base("https://example.com/path?param=1#search") 53 | "https://example.com" 54 | 55 | iex> URL.base("//example.com") 56 | "http://example.com" 57 | """ 58 | 59 | @spec base(String.t()) :: String.t() 60 | 61 | def base(url) do 62 | uri = URI.parse(url) 63 | scheme = uri.scheme || "http" 64 | host = uri.host 65 | "#{scheme}://#{host}" 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/scrape/tools/word.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.Word do 2 | @moduledoc """ 3 | Algorithms to extract information from single words. 4 | """ 5 | 6 | @stemmer_codes %{ 7 | :de => :german2, 8 | :en => :english 9 | } 10 | 11 | @doc """ 12 | Extract the stem of a given word. 13 | 14 | Uses the snowball algorithm under the hood via the library 15 | [Stemex](https://hex.pm/packages/stemex), which in turn uses NIFs for raw 16 | speed. Currently only German and English are supported. 17 | 18 | ## Example 19 | iex> Word.stem("beautiful", :en) 20 | "beauti" 21 | 22 | iex> Word.stem("derbsten", :de) 23 | "derb" 24 | """ 25 | 26 | @spec stem(String.t(), :de | :en) :: String.t() 27 | 28 | def stem(word, language \\ :en) 29 | def stem(nil, _), do: nil 30 | def stem("", _), do: "" 31 | 32 | def stem(word, language) do 33 | try do 34 | apply(Stemex, @stemmer_codes[language], [word]) 35 | rescue 36 | _ -> word 37 | end 38 | end 39 | 40 | @doc """ 41 | Check if a given word is a stopword against the provided language lists. 42 | 43 | Note: the provided language lists are all-downcased words. 44 | 45 | ## Examples 46 | iex> Word.is_stopword?("when", :en) 47 | true 48 | 49 | iex> Word.is_stopword?("linux", :en) 50 | false 51 | 52 | iex> Word.is_stopword?("ein", :de) 53 | true 54 | 55 | iex> Word.is_stopword?("elixir", :de) 56 | false 57 | """ 58 | 59 | @spec is_stopword?(String.t(), :de | :en) :: boolean() 60 | 61 | defdelegate is_stopword?(word, language \\ :en), 62 | to: Scrape.Tools.Word.IsStopword, 63 | as: :execute 64 | 65 | @doc """ 66 | Determine if a given word might be relevant for analytical purposes. 67 | 68 | Uses a simple heuristic and checks for stopword matches. 
69 | 70 | ## Examples 71 | iex> Word.is_meaningful?("a", :en) 72 | false 73 | 74 | iex> Word.is_meaningful?("apple", :en) 75 | true 76 | """ 77 | 78 | @spec is_meaningful?(String.t(), :de | :en) :: boolean() 79 | 80 | def is_meaningful?(word, language \\ :en) do 81 | String.length(word) > 2 and String.match?(word, ~r/^[\p{L}\p{M}\w]+$/u) and 82 | not is_stopword?(word, language) 83 | end 84 | end 85 | -------------------------------------------------------------------------------- /lib/scrape/tools/word/is_stopword.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.Word.IsStopword do 2 | @moduledoc false 3 | 4 | def execute(word, language \\ :en) 5 | 6 | for file <- File.ls!(Path.join([__DIR__, "stopwords"])) do 7 | language = file |> Path.basename(".txt") |> String.to_atom() 8 | 9 | for line <- File.stream!(Path.join([__DIR__, "stopwords", file]), [], :line) do 10 | word = String.trim(line) 11 | def execute(unquote(word), unquote(language)), do: true 12 | end 13 | end 14 | 15 | def execute(_, _), do: false 16 | end 17 | -------------------------------------------------------------------------------- /lib/scrape/tools/word/stopwords/de.txt: -------------------------------------------------------------------------------- 1 | ab 2 | aber 3 | abgerufen 4 | abgerufene 5 | abgerufener 6 | abgerufenes 7 | acht 8 | alle 9 | allein 10 | allem 11 | allen 12 | aller 13 | allerdings 14 | allerlei 15 | alles 16 | allgemein 17 | allmählich 18 | allzu 19 | als 20 | alsbald 21 | ander 22 | andere 23 | anderem 24 | anderen 25 | anderer 26 | andererseits 27 | anderes 28 | anderm 29 | andern 30 | andernfalls 31 | anders 32 | anerkannt 33 | anerkannte 34 | anerkannter 35 | anerkanntes 36 | anfangen 37 | anfing 38 | angefangen 39 | angesetze 40 | angesetzt 41 | angesetzten 42 | angesetzter 43 | ansetzen 44 | anstatt 45 | arbeiten 46 | auch 47 | auf 48 | aufgehört 49 | aufgrund 50 | aufhören 51 | aufhörte 52 | aufzusuchen 53 | aus 54 | ausdrücken 55 | ausdrückt 56 | ausdrückte 57 | ausgenommen 58 | ausser 59 | ausserdem 60 | author 61 | autor 62 | außen 63 | außer 64 | außerdem 65 | außerhalb 66 | bald 67 | bearbeite 68 | bearbeiten 69 | bearbeitete 70 | bearbeiteten 71 | bedarf 72 | bedurfte 73 | bedürfen 74 | befragen 75 | befragte 76 | befragten 77 | befragter 78 | begann 79 | beginnen 80 | begonnen 81 | behalten 82 | behielt 83 | bei 84 | beide 85 | beiden 86 | beiderlei 87 | beides 88 | beim 89 | beinahe 90 | beitragen 91 | beitrugen 92 | bekannt 93 | bekannte 94 | bekannter 95 | bekennen 96 | benutzt 97 | bereits 98 | berichten 99 | berichtet 100 | berichtete 101 | berichteten 102 | besonders 103 | besser 104 | bestehen 105 | besteht 106 | beträchtlich 107 | bevor 108 | bezüglich 109 | bietet 110 | bin 111 | bis 112 | bisher 113 | bislang 114 | bist 115 | bleiben 116 | blieb 117 | bloss 118 | bloß 119 | brachte 120 | brachten 121 | brauchen 122 | braucht 123 | bringen 124 | bräuchte 125 | bsp. 126 | bzw 127 | böden 128 | ca. 
129 | da 130 | dabei 131 | dadurch 132 | dafür 133 | dagegen 134 | daher 135 | dahin 136 | damals 137 | damit 138 | danach 139 | daneben 140 | dank 141 | danke 142 | danken 143 | dann 144 | dannen 145 | daran 146 | darauf 147 | daraus 148 | darf 149 | darfst 150 | darin 151 | darum 152 | darunter 153 | darüber 154 | darüberhinaus 155 | das 156 | dass 157 | dasselbe 158 | davon 159 | davor 160 | dazu 161 | daß 162 | dein 163 | deine 164 | deinem 165 | deinen 166 | deiner 167 | deines 168 | dem 169 | demnach 170 | demselben 171 | den 172 | denen 173 | denn 174 | dennoch 175 | denselben 176 | der 177 | derart 178 | derartig 179 | derem 180 | deren 181 | derer 182 | derjenige 183 | derjenigen 184 | derselbe 185 | derselben 186 | derzeit 187 | des 188 | deshalb 189 | desselben 190 | dessen 191 | desto 192 | deswegen 193 | dich 194 | die 195 | diejenige 196 | dies 197 | diese 198 | dieselbe 199 | dieselben 200 | diesem 201 | diesen 202 | dieser 203 | dieses 204 | diesseits 205 | dinge 206 | dir 207 | direkt 208 | direkte 209 | direkten 210 | direkter 211 | doch 212 | doppelt 213 | dort 214 | dorther 215 | dorthin 216 | drauf 217 | drei 218 | dreißig 219 | drin 220 | dritte 221 | drunter 222 | drüber 223 | du 224 | dunklen 225 | durch 226 | durchaus 227 | durfte 228 | durften 229 | dürfen 230 | dürfte 231 | eben 232 | ebenfalls 233 | ebenso 234 | ehe 235 | eher 236 | eigenen 237 | eigenes 238 | eigentlich 239 | ein 240 | einbaün 241 | eine 242 | einem 243 | einen 244 | einer 245 | einerseits 246 | eines 247 | einfach 248 | einführen 249 | einführte 250 | einführten 251 | eingesetzt 252 | einig 253 | einige 254 | einigem 255 | einigen 256 | einiger 257 | einigermaßen 258 | einiges 259 | einmal 260 | eins 261 | einseitig 262 | einseitige 263 | einseitigen 264 | einseitiger 265 | einst 266 | einstmals 267 | einzig 268 | ende 269 | entsprechend 270 | entweder 271 | er 272 | ergänze 273 | ergänzen 274 | ergänzte 275 | ergänzten 276 | erhalten 277 | erhielt 278 | erhielten 279 | erhält 280 | erneut 281 | erst 282 | erste 283 | ersten 284 | erster 285 | eröffne 286 | eröffnen 287 | eröffnet 288 | eröffnete 289 | eröffnetes 290 | es 291 | etliche 292 | etwa 293 | etwas 294 | euch 295 | euer 296 | eure 297 | eurem 298 | euren 299 | eurer 300 | eures 301 | fall 302 | falls 303 | fand 304 | fast 305 | ferner 306 | finden 307 | findest 308 | findet 309 | folgende 310 | folgenden 311 | folgender 312 | folgendes 313 | folglich 314 | fordern 315 | fordert 316 | forderte 317 | forderten 318 | fortsetzen 319 | fortsetzt 320 | fortsetzte 321 | fortsetzten 322 | fragte 323 | frau 324 | frei 325 | freie 326 | freier 327 | freies 328 | fuer 329 | fünf 330 | für 331 | gab 332 | ganz 333 | ganze 334 | ganzem 335 | ganzen 336 | ganzer 337 | ganzes 338 | gar 339 | gbr 340 | geb 341 | geben 342 | geblieben 343 | gebracht 344 | gedurft 345 | geehrt 346 | geehrte 347 | geehrten 348 | geehrter 349 | gefallen 350 | gefiel 351 | gefälligst 352 | gefällt 353 | gegeben 354 | gegen 355 | gehabt 356 | gehen 357 | geht 358 | gekommen 359 | gekonnt 360 | gemacht 361 | gemocht 362 | gemäss 363 | genommen 364 | genug 365 | gern 366 | gesagt 367 | gesehen 368 | gestern 369 | gestrige 370 | getan 371 | geteilt 372 | geteilte 373 | getragen 374 | gewesen 375 | gewissermaßen 376 | gewollt 377 | geworden 378 | ggf 379 | gib 380 | gibt 381 | gleich 382 | gleichwohl 383 | gleichzeitig 384 | glücklicherweise 385 | gmbh 386 | gratulieren 387 | gratuliert 388 | gratulierte 389 | gute 390 | guten 391 | gängig 392 | gängige 393 | gängigen 394 | 
gängiger 395 | gängiges 396 | gänzlich 397 | hab 398 | habe 399 | haben 400 | haette 401 | halb 402 | hallo 403 | hast 404 | hat 405 | hatte 406 | hatten 407 | hattest 408 | hattet 409 | hen 410 | heraus 411 | heute 412 | heutige 413 | hier 414 | hiermit 415 | hiesige 416 | hin 417 | hinein 418 | hinten 419 | hinter 420 | hinterher 421 | hoch 422 | hundert 423 | hätt 424 | hätte 425 | hätten 426 | höchstens 427 | ich 428 | igitt 429 | ihm 430 | ihn 431 | ihnen 432 | ihr 433 | ihre 434 | ihrem 435 | ihren 436 | ihrer 437 | ihres 438 | immer 439 | immerhin 440 | indem 441 | indessen 442 | info 443 | infolge 444 | innen 445 | innerhalb 446 | ins 447 | insofern 448 | inzwischen 449 | irgend 450 | irgendeine 451 | irgendwas 452 | irgendwen 453 | irgendwer 454 | irgendwie 455 | irgendwo 456 | ist 457 | ja 458 | je 459 | jede 460 | jedem 461 | jeden 462 | jedenfalls 463 | jeder 464 | jederlei 465 | jedes 466 | jedoch 467 | jemand 468 | jene 469 | jenem 470 | jenen 471 | jener 472 | jenes 473 | jenseits 474 | jetzt 475 | jährig 476 | jährige 477 | jährigen 478 | jähriges 479 | kam 480 | kann 481 | kannst 482 | kaum 483 | kein 484 | keine 485 | keinem 486 | keinen 487 | keiner 488 | keinerlei 489 | keines 490 | keineswegs 491 | klar 492 | klare 493 | klaren 494 | klares 495 | klein 496 | kleinen 497 | kleiner 498 | kleines 499 | koennen 500 | koennt 501 | koennte 502 | koennten 503 | komme 504 | kommen 505 | kommt 506 | konkret 507 | konkrete 508 | konkreten 509 | konkreter 510 | konkretes 511 | konnte 512 | konnten 513 | könn 514 | können 515 | könnt 516 | könnte 517 | könnten 518 | künftig 519 | lag 520 | lagen 521 | langsam 522 | lassen 523 | laut 524 | lediglich 525 | leer 526 | legen 527 | legte 528 | legten 529 | leicht 530 | leider 531 | lesen 532 | letze 533 | letzten 534 | letztendlich 535 | letztens 536 | letztes 537 | letztlich 538 | lichten 539 | liegt 540 | liest 541 | links 542 | längst 543 | längstens 544 | mache 545 | machen 546 | machst 547 | macht 548 | machte 549 | machten 550 | mag 551 | magst 552 | mal 553 | man 554 | manche 555 | manchem 556 | manchen 557 | mancher 558 | mancherorts 559 | manches 560 | manchmal 561 | mann 562 | margin 563 | mehr 564 | mehrere 565 | mein 566 | meine 567 | meinem 568 | meinen 569 | meiner 570 | meines 571 | meist 572 | meiste 573 | meisten 574 | meta 575 | mich 576 | mindestens 577 | mir 578 | mit 579 | mithin 580 | mochte 581 | morgen 582 | morgige 583 | muessen 584 | muesst 585 | muesste 586 | muss 587 | musst 588 | musste 589 | mussten 590 | muß 591 | mußt 592 | möchte 593 | möchten 594 | möchtest 595 | mögen 596 | möglich 597 | mögliche 598 | möglichen 599 | möglicher 600 | möglicherweise 601 | müssen 602 | müsste 603 | müssten 604 | müßt 605 | müßte 606 | nach 607 | nachdem 608 | nacher 609 | nachhinein 610 | nacht 611 | nahm 612 | natürlich 613 | neben 614 | nebenan 615 | nehmen 616 | nein 617 | neu 618 | neue 619 | neuem 620 | neuen 621 | neuer 622 | neues 623 | neun 624 | nicht 625 | nichts 626 | nie 627 | niemals 628 | niemand 629 | nimm 630 | nimmer 631 | nimmt 632 | nirgends 633 | nirgendwo 634 | noch 635 | nun 636 | nur 637 | nutzen 638 | nutzt 639 | nutzung 640 | nächste 641 | nämlich 642 | nötigenfalls 643 | nützt 644 | ob 645 | oben 646 | oberhalb 647 | obgleich 648 | obschon 649 | obwohl 650 | oder 651 | oft 652 | ohne 653 | pfui 654 | plötzlich 655 | pro 656 | reagiere 657 | reagieren 658 | reagiert 659 | reagierte 660 | rechts 661 | regelmäßig 662 | rief 663 | rund 664 | sage 665 | sagen 666 | sagt 667 | sagte 668 | sagten 669 
| sagtest 670 | sang 671 | sangen 672 | schlechter 673 | schließlich 674 | schnell 675 | schon 676 | schreibe 677 | schreiben 678 | schreibens 679 | schreiber 680 | schwierig 681 | schätzen 682 | schätzt 683 | schätzte 684 | schätzten 685 | sechs 686 | sect 687 | sehe 688 | sehen 689 | sehr 690 | sehrwohl 691 | seht 692 | sei 693 | seid 694 | sein 695 | seine 696 | seinem 697 | seinen 698 | seiner 699 | seines 700 | seit 701 | seitdem 702 | seite 703 | seiten 704 | seither 705 | selber 706 | selbst 707 | senke 708 | senken 709 | senkt 710 | senkte 711 | senkten 712 | setzen 713 | setzt 714 | setzte 715 | setzten 716 | sich 717 | sicher 718 | sicherlich 719 | sie 720 | sieben 721 | siebte 722 | siehe 723 | sieht 724 | sind 725 | singen 726 | singt 727 | sobald 728 | sodaß 729 | soeben 730 | sofern 731 | sofort 732 | sog 733 | sogar 734 | solange 735 | solc 736 | solch 737 | solche 738 | solchem 739 | solchen 740 | solcher 741 | solches 742 | soll 743 | sollen 744 | sollst 745 | sollt 746 | sollte 747 | sollten 748 | solltest 749 | somit 750 | sondern 751 | sonst 752 | sonstwo 753 | sooft 754 | soviel 755 | soweit 756 | sowie 757 | sowohl 758 | spielen 759 | später 760 | startet 761 | startete 762 | starteten 763 | statt 764 | stattdessen 765 | steht 766 | steige 767 | steigen 768 | steigt 769 | stets 770 | stieg 771 | stiegen 772 | suchen 773 | sämtliche 774 | tages 775 | tat 776 | tatsächlich 777 | tatsächlichen 778 | tatsächlicher 779 | tatsächliches 780 | tausend 781 | teile 782 | teilen 783 | teilte 784 | teilten 785 | titel 786 | total 787 | trage 788 | tragen 789 | trotzdem 790 | trug 791 | trägt 792 | tun 793 | tust 794 | tut 795 | txt 796 | tät 797 | ueber 798 | um 799 | umso 800 | unbedingt 801 | und 802 | ungefähr 803 | unmöglich 804 | unmögliche 805 | unmöglichen 806 | unmöglicher 807 | unnötig 808 | uns 809 | unse 810 | unsem 811 | unsen 812 | unser 813 | unsere 814 | unserem 815 | unseren 816 | unserer 817 | unseres 818 | unserm 819 | unses 820 | unten 821 | unter 822 | unterbrach 823 | unterbrechen 824 | unterhalb 825 | unwichtig 826 | usw 827 | vergangen 828 | vergangene 829 | vergangener 830 | vergangenes 831 | vermag 832 | vermutlich 833 | vermögen 834 | verrate 835 | verraten 836 | verriet 837 | verrieten 838 | version 839 | versorge 840 | versorgen 841 | versorgt 842 | versorgte 843 | versorgten 844 | versorgtes 845 | veröffentlichen 846 | veröffentlicher 847 | veröffentlicht 848 | veröffentlichte 849 | veröffentlichten 850 | veröffentlichtes 851 | viel 852 | viele 853 | vielen 854 | vieler 855 | vieles 856 | vielleicht 857 | vielmals 858 | vier 859 | vollständig 860 | vom 861 | von 862 | vor 863 | voran 864 | vorbei 865 | vorgestern 866 | vorher 867 | vorne 868 | vorüber 869 | völlig 870 | wachen 871 | waere 872 | wann 873 | war 874 | waren 875 | warst 876 | warum 877 | weder 878 | weg 879 | wegen 880 | weil 881 | weiter 882 | weitere 883 | weiterem 884 | weiteren 885 | weiterer 886 | weiteres 887 | weiterhin 888 | weiß 889 | welche 890 | welchem 891 | welchen 892 | welcher 893 | welches 894 | wem 895 | wen 896 | wenig 897 | wenige 898 | weniger 899 | wenigstens 900 | wenn 901 | wenngleich 902 | wer 903 | werde 904 | werden 905 | werdet 906 | weshalb 907 | wessen 908 | wichtig 909 | wie 910 | wieder 911 | wieso 912 | wieviel 913 | wiewohl 914 | will 915 | willst 916 | wir 917 | wird 918 | wirklich 919 | wirst 920 | wo 921 | wodurch 922 | wogegen 923 | woher 924 | wohin 925 | wohingegen 926 | wohl 927 | wohlweislich 928 | wolle 929 | wollen 930 | wollt 931 | wollte 932 | 
wollten 933 | wolltest 934 | wolltet 935 | womit 936 | woraufhin 937 | woraus 938 | worin 939 | wurde 940 | wurden 941 | während 942 | währenddessen 943 | wär 944 | wäre 945 | wären 946 | würde 947 | würden 948 | z.B. 949 | zahlreich 950 | zehn 951 | zeitweise 952 | ziehen 953 | zieht 954 | zog 955 | zogen 956 | zu 957 | zudem 958 | zuerst 959 | zufolge 960 | zugleich 961 | zuletzt 962 | zum 963 | zumal 964 | zur 965 | zurück 966 | zusammen 967 | zuviel 968 | zwanzig 969 | zwar 970 | zwei 971 | zwischen 972 | zwölf 973 | ähnlich 974 | übel 975 | über 976 | überall 977 | überallhin 978 | überdies 979 | übermorgen 980 | übrig 981 | übrigens -------------------------------------------------------------------------------- /lib/scrape/tools/word/stopwords/en.txt: -------------------------------------------------------------------------------- 1 | 'll 2 | a 3 | able 4 | about 5 | above 6 | abst 7 | accordance 8 | according 9 | accordingly 10 | across 11 | act 12 | actually 13 | added 14 | adj 15 | affected 16 | affecting 17 | affects 18 | after 19 | afterwards 20 | again 21 | against 22 | ah 23 | all 24 | almost 25 | alone 26 | along 27 | already 28 | also 29 | although 30 | always 31 | am 32 | among 33 | amongst 34 | an 35 | and 36 | announce 37 | another 38 | any 39 | anybody 40 | anyhow 41 | anymore 42 | anyone 43 | anything 44 | anyway 45 | anyways 46 | anywhere 47 | apparently 48 | approximately 49 | are 50 | aren 51 | arent 52 | arise 53 | around 54 | as 55 | aside 56 | ask 57 | asking 58 | at 59 | auth 60 | available 61 | away 62 | awfully 63 | b 64 | back 65 | be 66 | became 67 | because 68 | become 69 | becomes 70 | becoming 71 | been 72 | before 73 | beforehand 74 | begin 75 | beginning 76 | beginnings 77 | begins 78 | behind 79 | being 80 | believe 81 | below 82 | beside 83 | besides 84 | between 85 | beyond 86 | biol 87 | both 88 | brief 89 | briefly 90 | but 91 | by 92 | c 93 | ca 94 | came 95 | can 96 | can't 97 | cannot 98 | cause 99 | causes 100 | certain 101 | certainly 102 | co 103 | com 104 | come 105 | comes 106 | contain 107 | containing 108 | contains 109 | could 110 | couldnt 111 | d 112 | date 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | don't 121 | done 122 | down 123 | downwards 124 | due 125 | during 126 | e 127 | each 128 | ed 129 | edu 130 | effect 131 | eg 132 | eight 133 | eighty 134 | either 135 | else 136 | elsewhere 137 | end 138 | ending 139 | enough 140 | especially 141 | et 142 | et-al 143 | etc 144 | even 145 | ever 146 | every 147 | everybody 148 | everyone 149 | everything 150 | everywhere 151 | ex 152 | except 153 | f 154 | far 155 | few 156 | ff 157 | fifth 158 | first 159 | five 160 | fix 161 | followed 162 | following 163 | follows 164 | for 165 | former 166 | formerly 167 | forth 168 | found 169 | four 170 | from 171 | further 172 | furthermore 173 | g 174 | gave 175 | get 176 | gets 177 | getting 178 | give 179 | given 180 | gives 181 | giving 182 | go 183 | goes 184 | gone 185 | got 186 | gotten 187 | h 188 | had 189 | happens 190 | hardly 191 | has 192 | hasn't 193 | have 194 | haven't 195 | having 196 | he 197 | hed 198 | hence 199 | her 200 | here 201 | hereafter 202 | hereby 203 | herein 204 | heres 205 | hereupon 206 | hers 207 | herself 208 | hes 209 | hi 210 | hid 211 | him 212 | himself 213 | his 214 | hither 215 | home 216 | how 217 | howbeit 218 | however 219 | hundred 220 | i 221 | i'll 222 | i've 223 | id 224 | ie 225 | if 226 | im 227 | immediate 228 | immediately 229 | importance 230 | 
important 231 | in 232 | inc 233 | indeed 234 | index 235 | information 236 | instead 237 | into 238 | invention 239 | inward 240 | is 241 | isn't 242 | it 243 | it'll 244 | itd 245 | its 246 | itself 247 | j 248 | just 249 | k 250 | keep 251 | keeps 252 | kept 253 | kg 254 | km 255 | know 256 | known 257 | knows 258 | l 259 | largely 260 | last 261 | lately 262 | later 263 | latter 264 | latterly 265 | least 266 | less 267 | lest 268 | let 269 | lets 270 | like 271 | liked 272 | likely 273 | line 274 | little 275 | look 276 | looking 277 | looks 278 | ltd 279 | m 280 | made 281 | mainly 282 | make 283 | makes 284 | many 285 | may 286 | maybe 287 | me 288 | mean 289 | means 290 | meantime 291 | meanwhile 292 | merely 293 | mg 294 | might 295 | million 296 | miss 297 | ml 298 | more 299 | moreover 300 | most 301 | mostly 302 | mr 303 | mrs 304 | much 305 | mug 306 | must 307 | my 308 | myself 309 | n 310 | na 311 | name 312 | namely 313 | nay 314 | nd 315 | near 316 | nearly 317 | necessarily 318 | necessary 319 | need 320 | needs 321 | neither 322 | never 323 | nevertheless 324 | new 325 | next 326 | nine 327 | ninety 328 | no 329 | nobody 330 | non 331 | none 332 | nonetheless 333 | noone 334 | nor 335 | normally 336 | nos 337 | not 338 | noted 339 | nothing 340 | now 341 | nowhere 342 | o 343 | obtain 344 | obtained 345 | obviously 346 | of 347 | off 348 | often 349 | oh 350 | ok 351 | okay 352 | old 353 | omitted 354 | on 355 | once 356 | one 357 | ones 358 | only 359 | onto 360 | or 361 | ord 362 | other 363 | others 364 | otherwise 365 | ought 366 | our 367 | ours 368 | ourselves 369 | out 370 | outside 371 | over 372 | overall 373 | owing 374 | own 375 | p 376 | page 377 | pages 378 | part 379 | particular 380 | particularly 381 | past 382 | per 383 | perhaps 384 | placed 385 | please 386 | plus 387 | poorly 388 | possible 389 | possibly 390 | potentially 391 | pp 392 | predominantly 393 | present 394 | previously 395 | primarily 396 | probably 397 | promptly 398 | proud 399 | provides 400 | put 401 | q 402 | que 403 | quickly 404 | quite 405 | qv 406 | r 407 | ran 408 | rather 409 | rd 410 | re 411 | readily 412 | really 413 | recent 414 | recently 415 | ref 416 | refs 417 | regarding 418 | regardless 419 | regards 420 | related 421 | relatively 422 | research 423 | respectively 424 | resulted 425 | resulting 426 | results 427 | right 428 | run 429 | s 430 | said 431 | same 432 | saw 433 | say 434 | saying 435 | says 436 | sec 437 | section 438 | see 439 | seeing 440 | seem 441 | seemed 442 | seeming 443 | seems 444 | seen 445 | self 446 | selves 447 | sent 448 | seven 449 | several 450 | shall 451 | she 452 | she'll 453 | shed 454 | shes 455 | should 456 | shouldn't 457 | show 458 | showed 459 | shown 460 | showns 461 | shows 462 | significant 463 | significantly 464 | similar 465 | similarly 466 | since 467 | six 468 | slightly 469 | so 470 | some 471 | somebody 472 | somehow 473 | someone 474 | somethan 475 | something 476 | sometime 477 | sometimes 478 | somewhat 479 | somewhere 480 | soon 481 | sorry 482 | specifically 483 | specified 484 | specify 485 | specifying 486 | still 487 | stop 488 | strongly 489 | sub 490 | substantially 491 | successfully 492 | such 493 | sufficiently 494 | suggest 495 | sup 496 | sure 497 | than 498 | that 499 | that's 500 | the 501 | their 502 | theirs 503 | them 504 | themselves 505 | then 506 | there 507 | there's 508 | these 509 | they 510 | they'd 511 | they'll 512 | they're 513 | they've 514 | this 515 | those 516 | through 517 | to 518 | too 
519 | under 520 | until 521 | up 522 | very 523 | was 524 | wasn't 525 | we 526 | we'd 527 | we'll 528 | we're 529 | we've 530 | were 531 | weren't 532 | what 533 | what's 534 | when 535 | when's 536 | where 537 | where's 538 | which 539 | while 540 | who 541 | who's 542 | whom 543 | why 544 | why's 545 | with 546 | won't 547 | would 548 | wouldn't 549 | you 550 | you'd 551 | you'll 552 | you're 553 | you've 554 | your 555 | yours 556 | yourself 557 | yourselves -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.MixProject do 2 | use Mix.Project 3 | 4 | def project do 5 | [ 6 | app: :scrape, 7 | version: "3.1.0", 8 | elixir: "~> 1.10", 9 | description: description(), 10 | package: package(), 11 | start_permanent: Mix.env() == :prod, 12 | deps: deps() 13 | ] 14 | end 15 | 16 | # Run "mix help compile.app" to learn about applications. 17 | def application do 18 | [ 19 | extra_applications: [:logger], 20 | mod: {Scrape.Application, []} 21 | ] 22 | end 23 | 24 | defp description do 25 | """ 26 | Scrape any website, article or RSS/Atom feed with ease! 27 | """ 28 | end 29 | 30 | defp package do 31 | [ 32 | files: ["lib", "mix.exs", "README.md", "LICENSE.txt"], 33 | maintainers: ["Maximilian Stroh"], 34 | licenses: ["LGPLv3"], 35 | links: %{"GitHub" => "https://github.com/Anonyfox/elixir-scrape"} 36 | ] 37 | end 38 | 39 | # Run "mix help deps" to learn about dependencies. 40 | defp deps do 41 | [ 42 | # enable development with `mix test.watch --stale` 43 | {:mix_test_watch, "~> 0.8", only: :dev, runtime: false}, 44 | # documentation generation 45 | {:ex_doc, "~> 0.20.2", only: :dev, runtime: false}, 46 | # language detection 47 | {:paasaa, "~> 0.3.1"}, 48 | # snowball stemmer for multiple languages with a NIF 49 | {:stemex, "~> 0.1.1"}, 50 | # HTML/XML parser with CSS3 selectors 51 | {:floki, "~> 0.21.0"}, 52 | # clone of arc90's readability algorithm 53 | {:readability, "~> 0.10.0"}, 54 | # iconv written in pure elixir 55 | {:codepagex, "~> 0.1.4"}, 56 | # http client 57 | {:httpoison, "~> 0.13.0"}, 58 | # xml to map 59 | {:elixir_xml_to_map, "~> 0.1.2"}, 60 | # map transformation functions 61 | {:morphix, "~> 0.8.0"} 62 | ] 63 | end 64 | end 65 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "certifi": {:hex, :certifi, "2.5.1", "867ce347f7c7d78563450a18a6a28a8090331e77fa02380b4a21962a65d36ee5", [:rebar3], [{:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm", "805abd97539caf89ec6d4732c91e62ba9da0cda51ac462380bbd28ee697a8c42"}, 3 | "codepagex": {:hex, :codepagex, "0.1.4", "dae3bc57e9334c324914b32ed61c0a30929fac3e73dc71fc611ed7eeb2dcb867", [:mix], [], "hexpm", "21710d98fb2bc03a4d44365b66aba569c3a9267437cfafd09ca27ed92a99c75e"}, 4 | "combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm"}, 5 | "dogma": {:hex, :dogma, "0.1.13", "7b6c6ad2b3ee6501eda3bd39e197dd5198be8d520d1c175c7f713803683cf27a", [:mix], [{:poison, ">= 2.0.0", [hex: :poison, repo: "hexpm", optional: false]}], "hexpm"}, 6 | "earmark": {:hex, :earmark, "1.3.2", "b840562ea3d67795ffbb5bd88940b1bed0ed9fa32834915125ea7d02e35888a5", [:mix], [], "hexpm", "e3be2bc3ae67781db529b80aa7e7c49904a988596e2dbff897425b48b3581161"}, 7 | "elixir_xml_to_map": {:hex, 
:elixir_xml_to_map, "0.1.2", "e3d1bd2f6562711117ae209657f385a1c1c34c8c720c748eeba2e22815797071", [:mix], [{:erlsom, "~>1.4", [hex: :erlsom, repo: "hexpm", optional: false]}], "hexpm", "a134d24496ebb25e1ab7027bba18a3be1f91f44aa3e6701bdc6ea5807d98ef0a"}, 8 | "erlsom": {:hex, :erlsom, "1.5.0", "c5a5cdd0ee0e8dca62bcc4b13ff08da24fdefc16ccd8b25282a2fda2ba1be24a", [:rebar3], [], "hexpm", "55a9dbf9cfa77fcfc108bd8e2c4f9f784dea228a8f4b06ea10b684944946955a"}, 9 | "ex_doc": {:hex, :ex_doc, "0.20.2", "1bd0dfb0304bade58beb77f20f21ee3558cc3c753743ae0ddbb0fd7ba2912331", [:mix], [{:earmark, "~> 1.3", [hex: :earmark, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.10", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm", "8e24fc8ff9a50b9f557ff020d6c91a03cded7e59ac3e0eec8a27e771430c7d27"}, 10 | "exjsx": {:hex, :exjsx, "4.0.0", "60548841e0212df401e38e63c0078ec57b33e7ea49b032c796ccad8cde794b5c", [:mix], [{:jsx, "~> 2.8.0", [hex: :jsx, repo: "hexpm", optional: false]}], "hexpm", "32e95820a97cffea67830e91514a2ad53b888850442d6d395f53a1ac60c82e07"}, 11 | "file_system": {:hex, :file_system, "0.2.7", "e6f7f155970975789f26e77b8b8d8ab084c59844d8ecfaf58cbda31c494d14aa", [:mix], [], "hexpm", "b4cfa2d69c7f0b18fd06db222b2398abeef743a72504e6bd7df9c52f171b047f"}, 12 | "floki": {:hex, :floki, "0.21.0", "0c0191a6dbc559300bac232f716c55fb5738d45ae846b3141b19e5f5741c1907", [:mix], [{:html_entities, "~> 0.4.0", [hex: :html_entities, repo: "hexpm", optional: false]}, {:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm", "4558100b255f5143d42432e75ceb731d04dbe824d1cf57c38e7e0f3c644ca0cd"}, 13 | "gettext": {:hex, :gettext, "0.16.1", "e2130b25eebcbe02bb343b119a07ae2c7e28bd4b146c4a154da2ffb2b3507af2", [:mix], [], "hexpm"}, 14 | "hackney": {:hex, :hackney, "1.15.1", "9f8f471c844b8ce395f7b6d8398139e26ddca9ebc171a8b91342ee15a19963f4", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm", "c2790c9f0f7205f4a362512192dee8179097394400e745e4d20bab7226a8eaad"}, 15 | "html5ever": {:hex, :html5ever, "0.7.0", "9f63ec1c783b2dc9f326840fcc993c01e926dbdef4e51ba1bbe5355993c258b4", [:mix], [{:rustler, "~> 0.18.0", [hex: :rustler, repo: "hexpm", optional: false]}], "hexpm"}, 16 | "html_entities": {:hex, :html_entities, "0.4.0", "f2fee876858cf6aaa9db608820a3209e45a087c5177332799592142b50e89a6b", [:mix], [], "hexpm", "3e3d7156a272950373ce5a4018b1490bea26676f8d6a7d409f6fac8568b8cb9a"}, 17 | "httpoison": {:hex, :httpoison, "0.13.0", "bfaf44d9f133a6599886720f3937a7699466d23bb0cd7a88b6ba011f53c6f562", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "4846958172d6401c4f34ecc5c2c4607b5b0d90b8eec8f6df137ca4907942ed0f"}, 18 | "idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "4bdd305eb64e18b0273864920695cb18d7a2021f31a11b9c5fbcd9a253f936e2"}, 19 | "jsx": {:hex, :jsx, "2.8.3", "a05252d381885240744d955fbe3cf810504eb2567164824e19303ea59eef62cf", [:mix, :rebar3], [], "hexpm", "fc3499fed7a726995aa659143a248534adc754ebd16ccd437cd93b649a95091f"}, 20 | "makeup": {:hex, :makeup, "0.8.0", 
"9cf32aea71c7fe0a4b2e9246c2c4978f9070257e5c9ce6d4a28ec450a839b55f", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "5fbc8e549aa9afeea2847c0769e3970537ed302f93a23ac612602e805d9d1e7f"}, 21 | "makeup_elixir": {:hex, :makeup_elixir, "0.13.0", "be7a477997dcac2e48a9d695ec730b2d22418292675c75aa2d34ba0909dcdeda", [:mix], [{:makeup, "~> 0.8", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "adf0218695e22caeda2820eaba703fa46c91820d53813a2223413da3ef4ba515"}, 22 | "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"}, 23 | "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"}, 24 | "mix_test_watch": {:hex, :mix_test_watch, "0.9.0", "c72132a6071261893518fa08e121e911c9358713f62794a90c95db59042af375", [:mix], [{:file_system, "~> 0.2.1 or ~> 0.3", [hex: :file_system, repo: "hexpm", optional: false]}], "hexpm", "817dec4a7f6edf260258002f99ac8ffaf7a8f395b27bf2d13ec24018beecec8a"}, 25 | "mochiweb": {:hex, :mochiweb, "2.18.0", "eb55f1db3e6e960fac4e6db4e2db9ec3602cc9f30b86cd1481d56545c3145d2e", [:rebar3], [], "hexpm", "b93e2b1e564bdbadfecc297277f9e6d0902da645b417d6c9210f6038ac63489a"}, 26 | "morphix": {:hex, :morphix, "0.8.0", "69ea4b2bc89eed7a85d5f3af7176862e1fd6e64af7f788a9d976cf599f0695af", [:mix], [], "hexpm", "307683e71d74af44da4af07ec7cc978d242f3395a159b64515f093d44280169f"}, 27 | "nimble_parsec": {:hex, :nimble_parsec, "0.5.0", "90e2eca3d0266e5c53f8fbe0079694740b9c91b6747f2b7e3c5d21966bba8300", [:mix], [], "hexpm", "5c040b8469c1ff1b10093d3186e2e10dbe483cd73d79ec017993fb3985b8a9b3"}, 28 | "paasaa": {:hex, :paasaa, "0.3.1", "94e1c4fc83bdd7b8c06fd90f965ff90a6198cbcf6ddf27b64de62f5dbcb2ccf7", [:mix], [{:exjsx, "~> 4.0", [hex: :exjsx, repo: "hexpm", optional: false]}], "hexpm", "5e02b49d9a968f6ccffa130c9a9f977a8ec3403b7a26069547bfda9daa557d10"}, 29 | "parallel": {:hex, :parallel, "0.0.3", "d1c9a03f0fd6c85ba174938b9823db51e01a68f9f0e76e3f3e11989cbeb607e7", [:mix], [], "hexpm"}, 30 | "parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm", "17ef63abde837ad30680ea7f857dd9e7ced9476cdd7b0394432af4bfc241b960"}, 31 | "poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm"}, 32 | "readability": {:hex, :readability, "0.10.0", "934212018e70346a982927ee4b32d3ddb3d5feba7bf7ab04f57da66ced5ab7a2", [:mix], [{:floki, "~> 0.20", [hex: :floki, repo: "hexpm", optional: false]}, {:httpoison, "~> 0.13.0", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm", "b0edfbd1878cbb27a81d7d3678670cdfb2d2b8fef6a9ca9cbc4013eb640082cd"}, 33 | "rustler": {:hex, :rustler, "0.18.0", "db4bd0c613d83a1badc31be90ddada6f9821de29e4afd15c53a5da61882e4f2d", [:mix], [], "hexpm"}, 34 | "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.4", "f0eafff810d2041e93f915ef59899c923f4568f4585904d010387ed74988e77b", [:make, :mix, :rebar3], [], "hexpm", "603561dc0fd62f4f2ea9b890f4e20e1a0d388746d6e20557cafb1b16950de88c"}, 35 | "stemex": {:hex, :stemex, "0.1.1", "726d693b67c4ee82398ca6f1bfbacc8d7aad20861a0371e44e9c6f9dee1e042d", [:mix], [], "hexpm", "219b8e81fedba5a9bb978b8f7eaf230e77f2702d58e409adcca998fde1788521"}, 36 | "timex": {:hex, :timex, "3.4.2", 
"d74649c93ad0e12ce5b17cf5e11fbd1fb1b24a3d114643e86dba194b64439547", [:mix], [{:combine, "~> 0.10", [hex: :combine, repo: "hexpm", optional: false]}, {:gettext, "~> 0.10", [hex: :gettext, repo: "hexpm", optional: false]}, {:tzdata, "~> 0.1.8 or ~> 0.5", [hex: :tzdata, repo: "hexpm", optional: false]}], "hexpm"}, 37 | "tzdata": {:hex, :tzdata, "0.5.19", "7962a3997bf06303b7d1772988ede22260f3dae1bf897408ebdac2b4435f4e6a", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"}, 38 | "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm", "1d1848c40487cdb0b30e8ed975e34e025860c02e419cb615d255849f3427439d"}, 39 | } 40 | -------------------------------------------------------------------------------- /test/flow/article_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.ArticleTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Flow.Article 5 | 6 | describe "Article#from_url" do 7 | end 8 | 9 | describe "Article#from_file" do 10 | test "works when a valid article file exists" do 11 | {:ok, data} = Article.from_file("cache/article/nytimes.html") 12 | assert data.title =~ "Highest Minimum Wage" 13 | assert data.summary =~ "raising the minimum wage" 14 | end 15 | 16 | test "refuses when no file exists" do 17 | {:error, error} = Article.from_file("missing") 18 | 19 | assert error == 20 | {:assign, :html, 21 | %File.Error{action: "read file", path: "missing", reason: :enoent}} 22 | end 23 | end 24 | 25 | describe "Article#from_string" do 26 | test "works when a valid string is given" do 27 | html = File.read!("cache/article/nytimes.html") 28 | {:ok, data} = Article.from_string(html) 29 | assert data.title =~ "Highest Minimum Wage" 30 | assert data.summary =~ "raising the minimum wage" 31 | end 32 | 33 | test "refuses when nil is given" do 34 | {:error, error} = Article.from_string(nil) 35 | assert error == :html_invalid 36 | end 37 | 38 | test "refuses when empty string is given" do 39 | {:error, error} = Article.from_string("") 40 | assert error == :html_invalid 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /test/flow/domain_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.DomainTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Flow.Domain 5 | 6 | describe "Domain#from_url" do 7 | end 8 | 9 | describe "Domain#from_file" do 10 | test "works when a valid domain file exists" do 11 | {:ok, data} = Domain.from_file("cache/domain/venturebeat.html") 12 | assert data.title =~ "Fortnite teams up with Avengers" 13 | assert length(data.feed_urls) == 3 14 | end 15 | 16 | test "refuses when no file exists" do 17 | {:error, error} = Domain.from_file("missing") 18 | 19 | assert error == 20 | {:assign, :html, 21 | %File.Error{action: "read file", path: "missing", reason: :enoent}} 22 | end 23 | end 24 | 25 | describe "Domain#from_string" do 26 | test "works when a valid string is given" do 27 | html = File.read!("cache/domain/venturebeat.html") 28 | {:ok, data} = Domain.from_string(html) 29 | assert data.title =~ "Fortnite teams up with Avengers" 30 | assert length(data.feed_urls) == 3 31 | end 32 | 33 | test "refuses when nil is given" do 34 | {:error, error} = Domain.from_string(nil) 35 | assert error == :html_invalid 36 | end 37 | 38 | test "refuses when empty string is given" do 39 | {:error, 
error} = Domain.from_string("") 40 | assert error == :html_invalid 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /test/flow/feed_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Flow.FeedTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Flow.Feed 5 | 6 | describe "Feed#from_url" do 7 | end 8 | 9 | describe "Feed#from_string" do 10 | test "works when a valid string is given" do 11 | xml = File.read!("cache/feed/latimes.xml") 12 | {:ok, data} = Feed.from_string(xml) 13 | assert data.title =~ "latimes.com - Los Angeles Times" 14 | assert data.website_url == "http://www.latimes.com" 15 | 16 | item = data[:items] |> List.first() 17 | assert item.title =~ "guitar" 18 | end 19 | 20 | test "refuses when nil is given" do 21 | {:error, error} = Feed.from_string(nil) 22 | assert error == :xml_invalid 23 | end 24 | 25 | test "refuses when empty string is given" do 26 | {:error, error} = Feed.from_string("") 27 | assert error == :xml_invalid 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /test/ir/feed_item_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.FeedItemTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.IR.Feed 5 | alias Scrape.IR.FeedItem 6 | 7 | doctest FeedItem 8 | 9 | describe "FeedItem#title/1" do 10 | test "can extract from xml string of type atom" do 11 | xml = "<entry><title>abc</title></entry>" 12 | assert FeedItem.title(xml) == "abc" 13 | end 14 | 15 | test "can extract from xml string of type rss" do 16 | xml = "<item><title>abc</title></item>" 17 | assert FeedItem.title(xml) == "abc" 18 | end 19 | 20 | test "can extract from german atom feed" do 21 | xml = File.read!("cache/feed/heise.xml") 22 | item = xml |> Feed.items() |> List.first() 23 | assert FeedItem.title(item) =~ "Fachkräftemangel" 24 | end 25 | 26 | test "can extract from german rss feed" do 27 | xml = File.read!("cache/feed/spiegel.xml") 28 | item = xml |> Feed.items() |> List.first() 29 | assert FeedItem.title(item) =~ "Schwertransporter" 30 | end 31 | 32 | test "can extract from english atom feed" do 33 | xml = File.read!("cache/feed/elixir-lang.xml") 34 | item = xml |> Feed.items() |> List.first() 35 | assert FeedItem.title(item) == "Elixir v1.0 released" 36 | end 37 | 38 | test "can extract from english rss feed" do 39 | xml = File.read!("cache/feed/latimes.xml") 40 | item = xml |> Feed.items() |> List.first() 41 | assert FeedItem.title(item) =~ "Essential tracks" 42 | end 43 | end 44 | 45 | describe "FeedItem#description/1" do 46 | test "can extract from xml string of type atom" do 47 | xml = "<entry><summary>abc</summary></entry>" 48 | assert FeedItem.description(xml) == "abc" 49 | end 50 | 51 | test "can extract from xml string of type rss" do 52 | xml = "<item><description>abc</description></item>" 53 | assert FeedItem.description(xml) == "abc" 54 | end 55 | 56 | test "can extract from german atom feed" do 57 | xml = File.read!("cache/feed/heise.xml") 58 | item = xml |> Feed.items() |> List.first() 59 | assert FeedItem.description(item) =~ "730.000 Mitarbeiter" 60 | end 61 | 62 | test "can extract from german rss feed" do 63 | xml = File.read!("cache/feed/spiegel.xml") 64 | item = xml |> Feed.items() |> List.first() 65 | assert FeedItem.description(item) =~ "Schweres Unglück in der Oberpfalz" 66 | end 67 | 68 | test "can extract from english atom feed" do 69 | xml = File.read!("cache/feed/elixir-lang.xml") 70 | item = xml |> Feed.items() |> List.first() 71 | assert FeedItem.description(item) =~ 
"Elixir v1.0 is finally out" 72 | end 73 | 74 | test "can extract from english rss feed" do 75 | xml = File.read!("cache/feed/latimes.xml") 76 | item = xml |> Feed.items() |> List.first() 77 | assert FeedItem.description(item) =~ "high-energy party music" 78 | end 79 | end 80 | 81 | describe "FeedItem#website_url/1" do 82 | test "can extract from xml string of type atom" do 83 | xml = "" 84 | assert FeedItem.article_url(xml) == "http://example.com" 85 | end 86 | 87 | test "can extract from xml string of type rss" do 88 | xml = "http://example.com" 89 | assert FeedItem.article_url(xml) == "http://example.com" 90 | end 91 | 92 | test "can extract from german atom feed" do 93 | xml = File.read!("cache/feed/heise.xml") 94 | item = xml |> Feed.items() |> List.first() 95 | assert FeedItem.article_url(item) =~ "https://www.heise.de/newsticker" 96 | end 97 | 98 | test "can extract from german rss feed" do 99 | xml = File.read!("cache/feed/spiegel.xml") 100 | item = xml |> Feed.items() |> List.first() 101 | assert FeedItem.article_url(item) =~ "http://www.spiegel.de/panorama" 102 | end 103 | 104 | test "can extract from english atom feed" do 105 | xml = File.read!("cache/feed/elixir-lang.xml") 106 | item = xml |> Feed.items() |> List.first() 107 | assert FeedItem.article_url(item) =~ "http://elixir-lang.org/blog" 108 | end 109 | 110 | test "can extract from english rss feed" do 111 | xml = File.read!("cache/feed/latimes.xml") 112 | item = xml |> Feed.items() |> List.first() 113 | assert FeedItem.article_url(item) =~ "http://www.latimes.com/la-et-ms" 114 | end 115 | end 116 | 117 | describe "FeedItem#tags/1" do 118 | test "can extract from xml string of type atom" do 119 | xml = "abc" 120 | assert FeedItem.tags(xml) == ["abc"] 121 | end 122 | 123 | test "can extract from xml string of type rss" do 124 | xml = "abc" 125 | assert FeedItem.tags(xml) == ["abc"] 126 | end 127 | 128 | test "can extract from german atom feed" do 129 | xml = File.read!("cache/feed/heise.xml") 130 | item = xml |> Feed.items() |> List.first() 131 | assert FeedItem.tags(item) == [] 132 | end 133 | 134 | test "can extract from german rss feed" do 135 | xml = File.read!("cache/feed/spiegel.xml") 136 | item = xml |> Feed.items() |> List.first() 137 | assert FeedItem.tags(item) == ["panorama"] 138 | end 139 | 140 | test "can extract from english atom feed" do 141 | xml = File.read!("cache/feed/elixir-lang.xml") 142 | item = xml |> Feed.items() |> List.first() 143 | assert FeedItem.tags(item) == [] 144 | end 145 | 146 | test "can extract from english rss feed" do 147 | xml = File.read!("cache/feed/latimes.xml") 148 | item = xml |> Feed.items() |> List.first() 149 | assert FeedItem.tags(item) == [] 150 | end 151 | end 152 | 153 | describe "FeedItem#author/1" do 154 | test "can extract from xml string of type atom" do 155 | xml = "abc" 156 | assert FeedItem.author(xml) == "abc" 157 | end 158 | 159 | test "can extract from xml string of type rss" do 160 | xml = "abc" 161 | assert FeedItem.author(xml) == "abc" 162 | end 163 | 164 | test "can extract from german atom feed" do 165 | xml = File.read!("cache/feed/heise.xml") 166 | item = xml |> Feed.items() |> List.first() 167 | assert FeedItem.author(item) == nil 168 | end 169 | 170 | test "can extract from german rss feed" do 171 | xml = File.read!("cache/feed/spiegel.xml") 172 | item = xml |> Feed.items() |> List.first() 173 | assert FeedItem.author(item) == nil 174 | end 175 | 176 | test "can extract from english atom feed" do 177 | xml = File.read!("cache/feed/elixir-lang.xml") 178 | 
item = xml |> Feed.items() |> List.first() 179 | assert FeedItem.author(item) == "José Valim" 180 | end 181 | 182 | test "can extract from english rss feed" do 183 | xml = File.read!("cache/feed/latimes.xml") 184 | item = xml |> Feed.items() |> List.first() 185 | assert FeedItem.author(item) == "Randall Roberts" 186 | end 187 | end 188 | 189 | describe "FeedItem#image_url/1" do 190 | test "can extract from xml string of type atom" do 191 | xml = "<entry><media:thumbnail url='abc' /></entry>" 192 | assert FeedItem.image_url(xml) == "abc" 193 | end 194 | 195 | test "can extract from xml string of type rss" do 196 | xml = "<item><enclosure url='abc' /></item>" 197 | assert FeedItem.image_url(xml) == "abc" 198 | end 199 | 200 | test "can extract from german atom feed" do 201 | xml = File.read!("cache/feed/heise.xml") 202 | item = xml |> Feed.items() |> List.first() 203 | assert FeedItem.image_url(item) =~ "https://www.heise.de/scale/geometry/" 204 | end 205 | 206 | test "can extract from german rss feed" do 207 | xml = File.read!("cache/feed/spiegel.xml") 208 | item = xml |> Feed.items() |> List.first() 209 | assert FeedItem.image_url(item) == nil 210 | end 211 | 212 | test "can extract from english atom feed" do 213 | xml = File.read!("cache/feed/elixir-lang.xml") 214 | item = xml |> Feed.items() |> List.first() 215 | assert FeedItem.image_url(item) == nil 216 | end 217 | 218 | test "can extract from english rss feed" do 219 | xml = File.read!("cache/feed/latimes.xml") 220 | item = xml |> Feed.items() |> List.first() 221 | assert FeedItem.image_url(item) == nil 222 | end 223 | end 224 | end 225 | -------------------------------------------------------------------------------- /test/ir/feed_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.FeedTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.IR.Feed 5 | 6 | doctest Feed 7 | 8 | describe "Feed#title/1" do 9 | test "can extract from xml string of type atom" do 10 | assert Feed.title("<feed><title>abc</title></feed>") == "abc" 11 | end 12 | 13 | test "can extract from xml string of type rss" do 14 | assert Feed.title("<rss><channel><title>abc</title></channel></rss>") == "abc" 15 | end 16 | 17 | test "can extract from german atom feed" do 18 | html = File.read!("cache/feed/heise.xml") 19 | assert Feed.title(html) == "heise online News" 20 | end 21 | 22 | test "can extract from german rss feed" do 23 | html = File.read!("cache/feed/spiegel.xml") 24 | assert Feed.title(html) == "SPIEGEL ONLINE - Schlagzeilen" 25 | end 26 | 27 | test "can extract from english atom feed" do 28 | html = File.read!("cache/feed/elixir-lang.xml") 29 | assert Feed.title(html) == "Elixir Lang" 30 | end 31 | 32 | test "can extract from english rss feed" do 33 | html = File.read!("cache/feed/latimes.xml") 34 | assert Feed.title(html) == "latimes.com - Los Angeles Times" 35 | end 36 | end 37 | 38 | describe "Feed#description/1" do 39 | test "can extract from xml string of type atom" do 40 | xml = "<feed><subtitle>abc</subtitle></feed>" 41 | assert Feed.description(xml) == "abc" 42 | end 43 | 44 | test "can extract from xml string of type rss" do 45 | xml = "<rss><channel><description>abc</description></channel></rss>" 46 | assert Feed.description(xml) == "abc" 47 | end 48 | 49 | test "can extract from german atom feed" do 50 | xml = File.read!("cache/feed/heise.xml") 51 | assert Feed.description(xml) == "Nachrichten nicht nur aus der Welt der Computer" 52 | end 53 | 54 | test "can extract from german rss feed" do 55 | xml = File.read!("cache/feed/spiegel.xml") 56 | assert Feed.description(xml) =~ "Alles Wichtige aus" 57 | end 58 | 59 | test "can extract from english atom feed" do 60 | xml = File.read!("cache/feed/elixir-lang.xml") 61 | assert 
Feed.description(xml) == nil 62 | end 63 | 64 | test "can extract from english rss feed" do 65 | xml = File.read!("cache/feed/latimes.xml") 66 | assert Feed.description(xml) =~ "source of breaking news" 67 | end 68 | end 69 | 70 | describe "Feed#website_url/1" do 71 | test "can extract from xml string of type atom" do 72 | xml = "<feed><link href='http://example.com' /></feed>" 73 | assert Feed.website_url(xml) == "http://example.com" 74 | end 75 | 76 | test "can extract from xml string of type rss" do 77 | xml = "<rss><channel><link>http://example.com</link></channel></rss>" 78 | assert Feed.website_url(xml) == "http://example.com" 79 | end 80 | 81 | test "can extract from german atom feed" do 82 | xml = File.read!("cache/feed/heise.xml") 83 | assert Feed.website_url(xml) == "https://www.heise.de" 84 | end 85 | 86 | test "can extract from german rss feed" do 87 | xml = File.read!("cache/feed/spiegel.xml") 88 | assert Feed.website_url(xml) == "http://www.spiegel.de" 89 | end 90 | 91 | test "can extract from english atom feed" do 92 | xml = File.read!("cache/feed/elixir-lang.xml") 93 | assert Feed.website_url(xml) == "http://elixir-lang.org" 94 | end 95 | 96 | test "can extract from english rss feed" do 97 | xml = File.read!("cache/feed/latimes.xml") 98 | assert Feed.website_url(xml) == "http://www.latimes.com" 99 | end 100 | end 101 | 102 | describe "Feed#items/1" do 103 | test "can extract from xml string of type atom" do 104 | xml = "<feed><entry><title>abc</title></entry></feed>" 105 | assert Feed.items(xml) == [%{"title" => "abc"}] 106 | end 107 | 108 | test "can extract from xml string of type rss" do 109 | xml = "<rss><channel><item><title>abc</title></item></channel></rss>" 110 | assert Feed.items(xml) == [%{"title" => "abc"}] 111 | end 112 | 113 | test "can extract from german atom feed" do 114 | xml = File.read!("cache/feed/heise.xml") 115 | item = xml |> Feed.items() |> List.first() 116 | assert item["title"] =~ "Fachkräftemangel" 117 | end 118 | 119 | test "can extract from german rss feed" do 120 | xml = File.read!("cache/feed/spiegel.xml") 121 | item = xml |> Feed.items() |> List.first() 122 | assert item["title"] =~ "Schwertransporter" 123 | end 124 | 125 | test "can extract from english atom feed" do 126 | xml = File.read!("cache/feed/elixir-lang.xml") 127 | item = xml |> Feed.items() |> List.first() 128 | assert item["title"] =~ "v1.0 released" 129 | end 130 | 131 | test "can extract from english rss feed" do 132 | xml = File.read!("cache/feed/latimes.xml") 133 | item = xml |> Feed.items() |> List.first() 134 | assert item["title"] =~ "Instrumental guitar music" 135 | end 136 | end 137 | end 138 | -------------------------------------------------------------------------------- /test/ir/html_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.HTMLTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.IR.HTML 5 | 6 | doctest HTML 7 | 8 | describe "HTML#title/1" do 9 | test "can extract title from html string" do 10 | assert HTML.title("<title>abc</title>") == "abc" 11 | end 12 | 13 | test "can extract title from html website" do 14 | html = File.read!("cache/domain/venturebeat.html") 15 | assert HTML.title(html) =~ "Fortnite teams up with Avengers" 16 | end 17 | 18 | test "can extract title from german html article" do 19 | html = File.read!("cache/article/spiegel.html") 20 | assert HTML.title(html) =~ "Forscher über schwarzes Loch" 21 | end 22 | 23 | test "can extract title from english html article" do 24 | html = File.read!("cache/article/nytimes.html") 25 | assert HTML.title(html) =~ "Americans Are Seeing" 26 | end 27 | end 28 | 29 | describe "HTML#image_url/2" do 30 | test "can extract image_url from html string" do 31 
| url = "http://example.com" 32 | html = ~s() 33 | assert HTML.image_url(html, url) == "http://example.com/img.jpg" 34 | assert HTML.image_url(html) == "img.jpg" 35 | end 36 | end 37 | 38 | describe "DOM#icon_url/2" do 39 | test "can extract image_url from html string" do 40 | url = "http://example.com" 41 | html = ~s() 42 | assert HTML.icon_url(html, url) == "http://example.com/img.jpg" 43 | assert HTML.icon_url(html) == "img.jpg" 44 | end 45 | end 46 | 47 | describe "DOM#description/1" do 48 | test "can extract description from html string" do 49 | html = "" 50 | assert HTML.description(html) == "interesting!" 51 | end 52 | end 53 | 54 | describe "DOM#content/1" do 55 | test "can extract text from english html string" do 56 | html = File.read!("cache/article/nytimes.html") 57 | assert HTML.content(html) =~ "Minimum Wage Increases Have Trade-Offs." 58 | end 59 | 60 | test "can extract text from german html string" do 61 | html = File.read!("cache/article/spiegel.html") 62 | assert HTML.content(html) =~ "Im Interview erklärt er die Faszination schwarzer Löcher" 63 | end 64 | end 65 | 66 | describe "DOM#paragraphs/1" do 67 | test "can extract text from english html string" do 68 | html = File.read!("cache/article/nytimes.html") 69 | assert HTML.paragraphs(html) |> List.first() =~ "It hasn’t budged since." 70 | end 71 | 72 | test "can extract text from german html string" do 73 | html = File.read!("cache/article/spiegel.html") 74 | assert HTML.paragraphs(html) |> List.first() =~ "Volltreffer gelandet" 75 | end 76 | end 77 | end 78 | -------------------------------------------------------------------------------- /test/ir/text_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.IR.TextTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.IR.Text 5 | 6 | doctest Text 7 | 8 | # test "greets the world" do 9 | # example = "lorem ipsum..." 
10 | # assert Text.extract_summary(example, ["lorem"]) == ["lorem ipsum"] 11 | # assert Text.generate_summary(example) == example 12 | # end 13 | 14 | test "can detect language of text" do 15 | assert Text.detect_language("the quick brown fox jumps over...") == :en 16 | assert Text.detect_language("Es ist ein schönes Wetter heute...") == :de 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /test/scrape_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ScrapeTest do 2 | use ExUnit.Case 3 | # doctest Scrape 4 | 5 | # test "greets the world" do 6 | # assert Scrape.hello() == :world 7 | # end 8 | end 9 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | -------------------------------------------------------------------------------- /test/tools/dom_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.DomTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Tools.DOM 5 | 6 | doctest DOM 7 | 8 | describe "DOM#from_string/1" do 9 | test "works with nil" do 10 | assert DOM.from_string(nil) == [] 11 | end 12 | 13 | test "works with empty string" do 14 | assert DOM.from_string("") == [] 15 | end 16 | 17 | test "works with html string with one root node" do 18 | assert DOM.from_string("<html></html>") == {"html", [], []} 19 | end 20 | 21 | test "works with html string with two root nodes" do 22 | assert DOM.from_string("<head></head><body></body>") == [{"head", [], []}, {"body", [], []}] 23 | end 24 | end 25 | 26 | describe "DOM#to_string/1" do 27 | test "works with nil" do 28 | assert DOM.to_string(nil) == "" 29 | end 30 | 31 | test "works with empty dom" do 32 | assert DOM.to_string([]) == "" 33 | end 34 | 35 | test "works with dom with one root node" do 36 | assert DOM.to_string({"html", [], []}) == "<html></html>" 37 | end 38 | 39 | test "works with dom with two root nodes" do 40 | dom = [{"head", [], []}, {"body", [], []}] 41 | assert DOM.to_string(dom) == "<head></head><body></body>" 42 | end 43 | end 44 | 45 | describe "DOM#text/2" do 46 | test "returns nil if nothing is found" do 47 | dom = DOM.from_string("<html><p>hello world</p></html>") 48 | assert DOM.text(dom, "div") == nil 49 | end 50 | 51 | test "returns string if something is found" do 52 | dom = DOM.from_string("<html><p>hello world</p></html>") 53 | assert DOM.text(dom, "p") == "hello world" 54 | end 55 | 56 | test "returns first string if many matches are found" do 57 | dom = DOM.from_string("<html><p>hello</p><p>world</p></html>") 58 | assert DOM.text(dom, "p") == "hello" 59 | end 60 | end 61 | 62 | describe "DOM#texts/2" do 63 | test "returns empty list if nothing is found" do 64 | dom = DOM.from_string("<html><p>hello world</p></html>") 65 | assert DOM.texts(dom, "div") == [] 66 | end 67 | 68 | test "returns string list if one match is found" do 69 | dom = DOM.from_string("<html><p>hello world</p></html>") 70 | assert DOM.texts(dom, "p") == ["hello world"] 71 | end 72 | 73 | test "returns string list if many matches are found" do 74 | dom = DOM.from_string("<html><p>hello</p><p>world</p></html>") 75 | assert DOM.texts(dom, "p") == ["hello", "world"] 76 | end 77 | end 78 | 79 | describe "DOM#attr/3" do 80 | test "returns nil if nothing is found" do 81 | dom = DOM.from_string("<meta name='a' content='b' />") 82 | assert DOM.attr(dom, "unknown", "unknown") == nil 83 | assert DOM.attr(dom, "meta", "unknown") == nil 84 | assert DOM.attr(dom, "meta[name='unknown']", "unknown") == nil 85 | assert DOM.attr(dom, "unknown", "content") == nil 86 | end 87 | 88 | test "returns string if something is found" do 89 | dom = DOM.from_string("<meta name='a' content='b' />") 90 | assert DOM.attr(dom, "meta[name='a']", "content") == "b" 91 | end 92 | 93 | test "returns first string if many matches are found" do 94 | dom = DOM.from_string("<meta name='a' content='b' /><meta name='a' content='c' />") 95 | assert DOM.attr(dom, "meta[name='a']", "content") == "b" 96 | end 97 | end 98 | 99 | describe "DOM#attrs/3" do 100 | test "returns empty list if nothing is found" do 101 | dom = DOM.from_string("<meta name='a' content='b' />") 102 | assert DOM.attrs(dom, "unknown", "unknown") == [] 103 | end 104 | 105 | test "returns string list if one match is found" do 106 | dom = DOM.from_string("<meta name='a' content='b' />") 107 | assert DOM.attrs(dom, "meta[name=a]", "content") == ["b"] 108 | end 109 | 110 | test "returns string list if many matches are found" do 111 | dom = DOM.from_string("<meta name='a' content='b' /><meta name='a' content='c' />") 112 | assert DOM.attrs(dom, "meta[name=a]", "content") == ["b", "c"] 113 | end 114 | end 115 | end 116 | -------------------------------------------------------------------------------- /test/tools/tree_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.TreeTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Tools.Tree 5 | 6 | doctest Tree 7 | 8 | describe "Tree#from_xml_string/1" do 9 | test "works with nil" do 10 | assert Tree.from_xml_string(nil) == %{} 11 | end 12 | 13 | test "works with empty string" do 14 | assert Tree.from_xml_string("") == %{} 15 | end 16 | 17 | test "works with xml string" do 18 | assert Tree.from_xml_string("<node>abc</node>") == %{"node" => "abc"} 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /test/tools/url_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.URLTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Tools.URL 5 | 6 | doctest URL 7 | 8 | describe "URL.merge/2" do 9 | test "can merge relative paths" do 10 | root_url = "http://example.com" 11 | assert URL.merge("/path", root_url) == "http://example.com/path" 12 | assert URL.merge("/path", root_url <> "/something") == "http://example.com/path" 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /test/tools/word_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrape.Tools.WordTest do 2 | use ExUnit.Case 3 | 4 | alias Scrape.Tools.Word 5 | 6 | doctest Word 7 | 8 | describe "Word.stem/2" do 9 | test "can stem english words" do 10 | assert Word.stem("beautiful", :en) == "beauti" 11 | end 12 | 13 | test "can stem german words" do 14 | assert Word.stem("derbsten", :de) == "derb" 15 | end 16 | end 17 | 18 | describe "Word.is_stopword?/2" do 19 | test "can check english words" do 20 | assert Word.is_stopword?("a", :en) == true 21 | assert Word.is_stopword?("apple", :en) == false 22 | end 23 | 24 | test "can check german words" do 25 | assert Word.is_stopword?("eine", :de) == true 26 | assert Word.is_stopword?("vitamin", :de) == false 27 | end 28 | end 29 | end 30 | 
--------------------------------------------------------------------------------
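Usage sketch (editor's illustration, not a file from this repository): the test suite above pins down the public API, so a minimal IEx session against the cached fixtures might look like the following. Module names, argument shapes, and expected values are taken directly from the tests; nothing beyond them is assumed.

# Parse a feed from a raw XML string (see test/flow/feed_test.exs):
xml = File.read!("cache/feed/latimes.xml")
{:ok, feed} = Scrape.Flow.Feed.from_string(xml)
feed.title        # => "latimes.com - Los Angeles Times"
feed.website_url  # => "http://www.latimes.com"

# Extract an article from a cached HTML file (see test/flow/article_test.exs):
{:ok, article} = Scrape.Flow.Article.from_file("cache/article/nytimes.html")
article.title     # contains "Highest Minimum Wage"
article.summary   # contains "raising the minimum wage"

# Invalid input yields tagged error tuples instead of raising:
{:error, :html_invalid} = Scrape.Flow.Article.from_string("")
{:error, :xml_invalid} = Scrape.Flow.Feed.from_string(nil)

# Low-level helpers live under Scrape.Tools (see test/tools/):
Scrape.Tools.Word.stem("beautiful", :en)               # => "beauti"
Scrape.Tools.URL.merge("/path", "http://example.com")  # => "http://example.com/path"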