├── .gitignore ├── LICENSE ├── README.md ├── crawlers ├── Crawling-Google │ ├── README.md │ ├── google_2000 │ │ ├── __init__.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── google_2000_spider.py │ ├── googlebyear.py │ ├── output.txt │ ├── pictures │ │ ├── click-Documents-subtab.png │ │ ├── click-network-tab.png │ │ ├── copy-user-agent.png │ │ ├── correct-lxml-download.png │ │ ├── correct-user-agent.png │ │ ├── custom-date-range-Google-Search.png │ │ ├── finding-Google-results.png │ │ ├── highlight-request-headers.png │ │ ├── highlight-search-document.png │ │ ├── right-click-copy-xpath.png │ │ ├── right-click-open-devtools.png │ │ └── use-magnifying-glass.png │ ├── scrapy.cfg │ ├── simpledownload.py │ └── simpleselect.py └── Way-Back │ ├── README.md │ ├── pictures │ ├── clinton-culkin-2002-02-09.png │ ├── nytimes-2000-11-19.png │ ├── reuters-2000.png │ ├── reuters-is-hiring.png │ ├── reuters-products-2000.png │ └── way-back-makes-it-easy.png │ └── waybacktrack.py ├── dataset ├── README.md ├── crawl_extract.py ├── entertainment.msn.com.7z ├── news.bbc.co.uk.7z ├── news.yahoo.com.7z ├── thenation.com.7z ├── www.cnn.com.7z ├── www.esquire.com.7z ├── www.forbes.com.7z ├── www.foxnews.com.7z ├── www.latimes.com.7z └── www.nymag.com.7z ├── testing ├── README.md ├── avgs.pkl ├── dreampie-eatiht-session.html ├── dreampie-wcbe-session.html ├── eatiht_praf_output.pkl ├── eatiht_results.pkl ├── praf.py ├── trimmed.pkl ├── wbce-tests.7z ├── wbce-tests │ ├── BodyTextExtractor2Filter │ │ ├── avgs.pkl │ │ ├── results.pkl │ │ └── trimmed.pkl │ ├── BodyTextExtractorFilter │ │ ├── avgs.pkl │ │ ├── results.pkl │ │ └── trimmed.pkl │ ├── ContentCodeBlurringFilter │ │ ├── avgs.pkl │ │ ├── results.pkl │ │ └── trimmed.pkl │ ├── DocumentSlopeCurveFilter │ │ ├── avgs.pkl │ │ ├── results.pkl │ │ └── trimmed.pkl │ ├── FeatureExtractorDomFilter │ │ ├── avgs.pkl │ │ ├── results.pkl │ │ └── trimmed.pkl │ ├── FeatureExtractorSplitFilter │ │ ├── avgs.pkl │ │ ├── results.pkl │ │ └── trimmed.pkl │ ├── GeneralCCB │ │ ├── avgs.pkl │ │ ├── results.pkl │ │ └── trimmed.pkl │ ├── KFeatureExtractorDomFilter │ │ ├── avgs.pkl │ │ ├── results.pkl │ │ └── trimmed.pkl │ ├── LinkQuotaFilter │ │ ├── avgs.pkl │ │ ├── results.pkl │ │ └── trimmed.pkl │ ├── eatiht_results.png │ ├── extractor_avgs.pkl │ ├── wbce_results.png │ ├── wbce_results2of3.png │ └── wbce_results3of3.png └── wbce_process_results.py └── timelines └── 1.1.2015-1.7.2015 /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 
55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. 
If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. 
Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | crawl to the future 2 | =================== 3 | 4 | Structure of this project 5 | ------------------------- 6 | 7 | 1. [Datasets](dataset) - covers years 2000, 2005, 2010, 2015 - read more about this below 8 | * news.bbc.co.uk 9 | * cnn.com 10 | * news.yahoo.com 11 | * thenation.com 12 | * latimes.com 13 | * entertainment.msn.com 14 | * foxnews.com 15 | * forbes.com 16 | * nymag.com 17 | * esquire.com 18 | 19 | 2. 
[Crawlers](crawlers)
20 |   * [**WayBack Machine**](crawlers/Way-Back) - only possible source and candidate for dataset
21 |     * [The Archival Year](crawlers/Way-Back#the-archival-year) - First specification for a dataset built from WayBack archives
22 |     * [Intro to WayBackTrack](crawlers/Way-Back#intro-to-waybacktrack) - A tool for downloading archived html's from any given year
23 |   * [Crawling Google](crawlers/Crawling-Google) - not a candidate source for dataset
24 |     * [Part I](crawlers/Crawling-Google/README.md#part-i) - Google's query string
25 |     * [Part II](crawlers/Crawling-Google/README.md#part-ii) - Google Search By Year
26 |
27 | ---
28 |
29 | ###Update 1/23/2015
30 |
31 | This update is long overdue. Without saying too much about how I've lost all faith in so-called "triple-padded ergonomic office chairs", I'm glad to say
32 | that there are 1000+ sites and 1000+ text files (containing the extracted articles) from 10 domains, originally published in the years 2000, 2005, 2010, and 2015.
33 |
34 | I unfortunately didn't have time to document the process, but if I were to summarize what I did, it would be this:
35 |
36 | 1. Run all sites through a content-extractor (*cough* [eatiht](http://github.com/rodricios/eatiht) *cough*)
37 | 2. Hand-verify the resulting text files to make sure they contain: article, title, author, and date
38 |
39 | A disclaimer: this is a **dataset**, not a testset, yet.
40 |
41 | I've done only one comprehensive scan of each HTML-file/text-file pair. If you do plan on using this dataset, please be aware that you may
42 | and likely will find a duplicate.
43 |
44 | That said, I'll have a *testset* uploaded once I've been able to do another comprehensive scan (in progress) in order to weed out any discrepancies, duplicates,
45 | etc.
46 |
47 | Cheers
48 |
49 | ###Update 1/12/2015
50 |
51 | Woo, finally got to manually separating the majority of the files into either of two groups: "has article" or "doesn't have article".
52 |
53 | It's pretty easy to visualize what I mean. Sites that have no content can be described as such: landing pages, directory pages
54 | (very popular in the early 2000's from what I can tell), or just about anything which does not have a central piece or group of text.
55 |
56 | Sites that have an *article* are simply those sites under a domain, particularly the newspaper domains I've targeted, that have
57 | an article waiting to be read.
58 |
59 | You'll find that in this latest commit, I've tried my best to remove those sites that have no content (lack an article) by hand. What I've included,
60 | though, is the preliminary automatically-extracted content. These extractions are within a text file under the same name as the original
61 | HTML file it was extracted from.
62 |
63 | The following datasets are closest to being primed for testing:
64 |
65 | * [www.cnn.com](dataset/www.cnn.com)
66 | * [news.bbc.co.uk](dataset/news.bbc.co.uk)
67 | * [news.yahoo.com](dataset/news.yahoo.com)
68 | * [www.latimes.com](dataset/www.latimes.com)
69 | * [entertainment.msn.com](dataset/entertainment.msn.com)
70 | * [www.forbes.com](dataset/www.forbes.com)
71 |
72 | The following datasets are not, for reasons including but not limited to: unavailable/unreachable archive servers, too many non-content
73 | sites, and too many sites I'm unsure about (somewhere in between article and non-article). The biggest reason of them all is that
74 | sites from the year 2000 and, to a lesser degree, 2005 are rare to come by, but this was expected.
75 |
76 | Anyway, the following datasets need more work:
77 |
78 | * [www.nytimes.com](dataset/www.nytimes.com)
79 | * [www.foxnews.com](dataset/www.foxnews.com)
80 |
81 | I've dropped the following datasets for the same reasons mentioned above:
82 |
83 | * reuters.com
84 | * bbc.com
85 | * yahoo.com
86 | * msn.com
87 |
88 | I'm also in the process of finding new, heavily-archived domains. If anyone has a tip on which domains to try out, please send me an
89 | email: rodrigopala91@gmail.com
90 |
91 | The last bit of work is hand-correcting the extracted text files and making sure they include the title, the author,
92 | and the article itself - every single sentence :|
93 |
94 | I hope to update once more this week with datasets that can be considered "golden".
95 |
96 | Update 1/8/2015
97 | ---------------
98 |
99 | I've been able to gather enough HTML files to begin the process of extracting the "content."
100 | This process consists of using automatic methods of extraction, followed by manual curation -
101 | making sure the title or *by* line is present in the final content, for example.
102 |
103 | This repo will likely not get updated with the raw, freshly-downloaded dumps as I've been doing. Instead,
104 | I will filter out many HTMLs currently in the dataset before updating the repo. Soon after that, I'll add
105 | a new directory containing the extracted-text files, and finally we'll be that much closer to starting
106 | our experiments :)
107 |
108 |
109 | Where am I? What year is this?
110 | ------------------------------
111 |
112 | This repository is home to a subtask within a larger project that [Tim Weninger](http://www3.nd.edu/~tweninge/) and I have undertaken.
113 |
114 | The closest thing to a writeup about the above-referenced "larger project" is
115 | [this](https://github.com/rodricios/eatiht#123114).
116 |
117 | ####"larger project" tl;dr
118 |
119 | We're attempting to [backtest](http://en.wikipedia.org/wiki/Backtesting)
120 | content-extractors from the last 15 years against a dataset of archived
121 | websites (i.e. websites that have existed in the last 15 years).
122 |
123 | ####crawl-to-the-future tl;dr
124 |
125 | I'm documenting and prototyping methods for building up a dataset of
126 | **immediately-at-our-disposal** HTML files.
127 |
128 | ---
129 |
130 | The small task of crawling back to the future
131 | ---------------------------------------------
132 |
133 | The subtask itself is determining whether or not it is possible to do the following*:
134 |
135 | 1. download an appropriate test set of websites from 2000, 2005, 2010 and 2015
136 | (addressed in this repo)
137 |
138 | 2. randomly select about 200 pages from 20 different test sets - this would be our attempt
139 | at creating a silver or gold standard (probably will be addressed in this repo)
140 |
141 | 3. have the content manually extracted - yes, this means having to **hand extract** the content
142 |     * refer to:
143 |         * [CleanEval's homepage](http://cleaneval.sigwac.org.uk/); [formal paper here](http://cleaneval.sigwac.org.uk/lrec08-cleaneval.pdf)
144 |         * [Serge Sharoff's 2006 paper on creating general-purpose corpora of websites](http://www.comp.leeds.ac.uk/ssharoff/publications/wacky-paper.pdf)
145 |         * Please [email me](mailto:rodrigopala91@gmail.com) if you know of similar papers
146 |
147 | 4.
write up specifications for the above processes (the various READMEs
148 | in this repo will hopefully address this part)
149 |
150 | *note: these steps were derived from emails with Tim*
151 |
152 | Again, the goal is to figure out if we can, in a timely manner, define a process for
153 | creating a dataset. This dataset, or the process itself, can then be used for further
154 | research in the area of data and text extraction. As for the origins of the name
155 | "crawl to the future," *crawling* will be a significant process in acquiring our dataset.
156 |
157 | ---
158 |
159 | Timeline
160 | --------
161 |
162 | Refer to this [folder](https://github.com/rodricios/crawl-to-the-future/tree/master/timelines)
163 | for a week-by-week plan of action
164 |
165 |
166 | ---
167 |
168 | ###Where's the work?
169 |
170 | Because I'm approaching this project in a very active manner - rapid
171 | README updates that are riddled with typos, quick prototyping of scripts
172 | used to crawl and acquire web pages - you'll likely see me working in a
173 | single subdirectory for at most a day. Please refer to this section to see where
174 | I'm writing.
175 |
176 | ~~Currently, I'm updating the section describing how to quickly prototype a
177 | simple crawling script for Google. Unfortunately, attempting to build a dataset
178 | of archival/historical web pages using Google's custom date range filter was
179 | unrewarding, to say the least. Read my notes [here](crawlers/Crawling-Google#bitter-sweet-conclusion).~~
180 |
181 | ~~This leaves me with one last option for building a dataset: [Way Back Machine](https://archive.org/web/).~~
182 |
183 | ~~There seems to be more [online discussion](http://superuser.com/questions/828907/how-to-download-a-website-from-the-archive-org-wayback-machine)
184 | behind downloading entire sites using Way Back archives, so hopefully this last attempt will be fruitful :)~~
185 |
186 | ~~I'll be writing/updating on this [page](crawlers/Way-Back).~~
187 |
188 | I've written [waybacktrack.py](crawlers/Way-Back/waybacktrack.py) as a tool
189 | for extracting .html's from WayBack Machine's archives. It's pretty buggy,
190 | so use at your discretion!
191 |
192 | ---
193 |
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/README.md:
--------------------------------------------------------------------------------
1 | Crawling Google
2 | ===============
3 |
4 | T. of C.
5 | --------
6 |
7 | * [Part I](#part-i)
8 |     1. [Intro](#intro)
9 |     2. [Brainstorming](#brainstorming)
10 |     3. [Decision?](#decision)
11 |     4. [Crawling into Google's Backend](#crawling-into-googles-backend)
12 |     5. [Interpreting Google's query string](#interpreting-googles-query-string)
13 |     6. [Final query string](#final-query-string)
14 |     7. [One last thing, Pagination](#one-last-thing-pagination)
15 |     8. [Pagination](#pagination)
16 |
17 | * [Part II](#part-ii)
18 |     1. [Motivation](#motivation)
19 |     2. [The Easy Way is Usually Best](#the-easy-way-is-usually-best)
20 |         1. [Download](#download-search-results)
21 |         2. [Select](#select-search-results)
22 |         3. [Store](#store-website-links)
23 |     3. [Google By Year(s)](#google-by-year)
24 |
25 | * [Conclusion](#bitter-sweet-conclusion)
26 |
Part I
27 | ======
28 |
29 |
30 | Intro
31 | -----
32 |
33 | *Crawling Google*'s README is split up into more than one part. Part I is all about simplifying Google's
34 | query string. Why is this important?
Because we are going to use that to our advantage when it comes to building a substantial dataset, from which we can "randomly" pick out web pages.
35 |
36 | Part II will be us exploring Scrapy and using the url+query string we acquired in Part I.
37 |
38 | ###The simple query
39 |
40 | The first thing I did was go to Google and enter the query "New York Times".
41 |
42 | Once the results came back, I clicked on the *search tools* tab
43 |
44 | and changed "Any Time" to a "Custom Range" from Jan 1, 2000 to Jan 1, 2001.
45 |
46 | ---
47 |
48 | Brainstorming
49 | -------------
50 |
51 | In an email, Tim Weninger asked this:
52 |
53 | > Can you think of a way to get 10 random nytimes articles from each time period, 10 reuters articles, 10 cnn articles, etc for 10-20 sources?
54 |
55 |
56 | From the results page one gets from the simple Google query above, I can say that we did get 10 nytimes articles.
57 |
58 | But the crux in the above excerpt is **_random_**. Random, to a statistician, is a very important thing, idea, principle, study, ~~religion~~.
59 |
60 | So I'm going to ignore the "random" part for now and work towards building a data set of old websites.
61 |
62 | Can I build a repository of .htmls from NYTimes during Jan. 1 of 2000 to Jan. 1 of 2001?
63 |
64 | The first obvious approach is... yes: left click on each one of Google's results, right
65 | click anywhere on the page that's not a link, and left click on "Save As".
66 |
67 | At this point, a window pops up asking you to save, but it gives you two options:
68 |
69 | 1. Webpage, HTML Only
70 |
71 | or
72 |
73 | 2. Webpage, Complete
74 |
75 | Here's where Tim and I - though really just I, since he's got a few other papers to spearhead -
76 | can start noting down that this may very well be a specification for what type of page we'd like to
77 | have in our dataset.
78 |
79 | ---
80 |
81 | ###Decision?
82 |
83 | After looking at both *Complete* and *HTML Only* instances, it's easy to say that we should specify "HTML Only",
84 | because for whatever reason, "Complete" looks, frankly, *incomplete*. On a sidenote, programmers should not
85 | give names to filetypes, only to functions and classes.
86 |
87 | I've included both types in the folder within the [**datasets** directory structure](https://github.com/rodricios/crawl-to-the-future/tree/master/dataset#dataset-aka-websites-from-2000-2005-2010-and-2015)
88 |
89 | And I repeated this method for the rest of the pages on the first page of Google's results. See the current hand-picked [dataset here](https://github.com/rodricios/crawl-to-the-future/tree/master/dataset/NYTimes/byhand)
90 |
91 | But seriously though, hand picking as an option has yet to go up against web-crawling. Moving on.
92 |
93 | Crawling into Google's Backend
94 | ------------------------------
95 |
96 | Want to be a **power crawler**?
97 |
98 | Do everything in the [intro](#intro). Then there's one more step.
99 |
100 | Go to the url bar and copy and paste the contents into some text document.
101 |
102 | This is what it should look like:
103 |
104 |     https://www.google.com/search?q=new+york+times&safe=off&client=opera&hs=SBu&biw=1309&bih=810&source=lnt&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&tbm=
105 |
106 | Yeah, it's a giant mess.
107 |
108 | Let's try to make sense of Google's query string:
109 |
110 | ```
111 | ?q=new+york+times ----> query is "new york times"
112 |
113 | ?     ----> the "?" is a "separator"
114 | q     ----> the "q" is a "field"; everything after the "=" is the value
115 | =     ----> "=" separates a field from a value
116 | new   ----> "new"
117 | +     ----> " " (whitespace)
118 | york  ----> "york"
119 | +     ----> " "
120 | times ----> "times"
121 | ```
122 |
123 | What you've just seen was a dissection of a pretty standard
124 | query string, usually preceded by a "?" (this is considered
125 | a "separator") or by other query strings. The structure "field=value"
126 | is what's commonly referred to as the field, value pair.
127 |
128 | If a value contains spaces, we replace the spaces with "+".
129 | In essence, everything after the "?" separator is one or more
130 | field-value pairs[1]
131 |
132 | [1](http://en.wikipedia.org/wiki/Query_string#Web_forms)
133 |
134 | ---
135 |
136 | ###Interpreting Google's query string
137 |
138 | Now we'll skip most of the query string until we reach something that's date related,
139 | because what we really want to focus on is the field, value pair
140 | that controls the custom range dates.
141 |
142 | Again, we're looking for something that has the following encoded in the query string:
143 |
144 | * jan 1 2000 or
145 | * 1 1 2000
146 | * 01 01 2000
147 | * etc.
148 |
149 | Skipping all the way to the end of the query string, we see...:
150 |
151 |     &tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&tbm=
152 |
153 | The %3A, %2C - basically every "%" followed by a number and a letter - are
154 | url-encoded characters. Read about them [here](http://www.degraeve.com/reference/urlencoding.php)
155 |
156 | But if we swap those characters in the last query string with their url-decoded
157 | counterparts, we'll actually start to see the parameters for the dates:
158 |
159 |     tbs=cdr:1,cd_min:1/1/2000,cd_max:1/1/2001&tbm=
160 |
161 | Ok, there we go, now we have something we can kind of make out:
162 |
163 | "cdr" stands for "custom date range", and that's set to '1', meaning
164 | 'true' - we have chosen that option.
165 |
166 | "cd_min" and "cd_max" are easy enough to de-abbreviate: "custom date min" and
167 | "custom date max", the two endpoints of our range. That's the bit of query that's
168 | necessary in order to specify our date-ranged search :)
169 |
170 | You can skip to the [final url+query string](#final-query-string) that we'll use in some crawling
171 | agent.
172 |
173 | What about the stuff in the query string that we skipped?
174 |
175 |
176 |     &safe=off&client=opera&hs=SBu&biw=1309&bih=810&source=lnt&
177 |
178 | In case I haven't said this already, the "&" just means "and", as in
179 | 'field1=True and field2=False' etc.
180 |
181 | So "&safe=off" is saying, I presume, that "safe search" is off. And our browser
182 | client is "opera"; and hs=SBu, I don't know what that means; biw=1309, what?
183 |
184 | bih=810? source=lnt? Yeah, thanks, Google, you make it really easy for us to
185 | interpret your auto-generated query string parameters.
186 |
187 | ###Final query string
188 |
189 | So to finish, we'll decide on a query string to use in our crawler.
190 |
191 | We are going to need:
192 |
193 |     https://www.google.com/search?q=new+york+times&
194 |
195 | and,
196 |
197 |     tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&tbm=
198 |
199 | Let's skip all the stuff before the custom date range field to get this:
200 |
201 |     https://www.google.com/search?q=new+york+times&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&tbm=
202 |
203 | Woohoo!
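Incidentally, you can double-check that tbs decoding by letting Python undo the %-escapes for you. A quick sketch (Python 2, using the standard library's urllib.unquote, which performs exactly this substitution):

```python
from urllib import unquote

# the encoded tbs value we fished out of Google's query string
encoded = 'cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001'

# unquote swaps every %XX escape for its literal character:
# %3A -> ':', %2C -> ',', %2F -> '/'
print unquote(encoded)
# cdr:1,cd_min:1/1/2000,cd_max:1/1/2001
```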
If you copy and paste that final url+query string into your browser (I'm using Opera), you should
204 | get a typical Google Results page. Success!
205 |
206 | ###One last thing, Pagination
207 |
208 | If we want to satisfy the [*random 10 pages*](#brainstorming) mentioned
209 | in a quote in the Brainstorming section, we have to have a pretty large
210 | data set. Not too large, but I've seen numbers getting thrown around in the
211 | hundreds-to-thousands range. Let's refer to CleanEval and see what they
212 | specified. Be right back!
213 |
214 | Note: the next section basically talks about how CleanEval wasn't
215 | very specific in terms of data set acquisition or data set size
216 | requirements, unless I've missed it.
217 |
218 | [Skip to the Pagination](#pagination)
219 |
220 | ---
221 |
222 | Alright, interesting... Here's what I found in the CleanEval paper:
223 |
224 | > The corpora were collected from URLs returned by making queries to Google,
225 | which consisted of four words frequent in an individual language.
226 | We have previously established that if mid-frequency words like
227 | picture, extent, raised and events are all used in a query, retrieved
228 | pages are likely to contain extended stretches of text (Sharoff, 2006)
229 |
230 | So the CleanEval specification says that they simply searched and downloaded
231 | the resulting html pages. Fair enough. And they did so with a specific
232 | set of key terms.
233 |
234 | As was mentioned in [this part](https://github.com/rodricios/crawl-to-the-future/tree/master/dataset)
235 | of the project, we have a specific list of online newspapers to fetch
236 | from. I guess in principle, it doesn't really matter where we fetch from,
237 | as long as we have **many** webpages to be able to pick from at random,
238 | and they cover the appropriate eras (2000, 2005, 2010, 2015).
239 |
240 | ---
241 |
242 | ###Pagination
243 |
244 | We want to get not just the first 20 or so results from Google -
245 | as in, we want more than just the first page of results.
246 |
247 | The way to figure out how to get more than one page of results
248 | is by figuring out the "paging" parameter within the query string.
249 |
250 | But Rodrigo, you already went through the entire query string!
251 |
252 | Nu uh! Go to your browser and enter this url (if you haven't already):
253 |
254 |     https://www.google.com/search?q=new+york+times&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&tbm=
255 |
256 | Again, that's the short url w/ query string arguments that set the
257 | search date range from Jan 1, 2000 to Jan 1, 2001.
258 |
259 | Once you've gotten back the first page of Google results, scroll down
260 | and click the button that takes you to the next page of results.
261 |
262 | This may feel strange to some of you ;)
263 |
264 | Now let's look at that new url:
265 |
266 |     https://www.google.com/search?q=new+york+times&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&tbm=#q=new+york+times&safe=off&tbs=cdr:1,cd_min:1/1/2000,cd_max:1/1/2001&start=10
267 |
268 | Did it not just get unnecessarily long again?
269 |
270 | Let's skip a lot of the "interpreting" stuff, and see if we can fish out
271 | a parameter with "2" in it, for "page 2"
272 |
273 | ...
274 |
275 | You'll likely find nothing. Why? Because search engines use a different way
276 | to provide "pagination." Read this [Elasticsearch page](http://www.elasticsearch.org/guide/en/elasticsearch/guide/current/pagination.html)
277 | to get a feel for pagination. Great tool btw!
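To make that "from/size" idea concrete before we hunt for Google's equivalent, here's a tiny sketch in plain Python (the names are mine for illustration, not any search engine's parameters): paging is just an offset computed from the page number and the page size.

```python
# A minimal sketch of offset-based ("from/size") pagination.
# 'page_size' and 'offset_for_page' are illustrative names only.
page_size = 10

def offset_for_page(page):
    """Return the result offset at which a zero-indexed page begins."""
    return page * page_size

for page in range(3):
    print "page %d starts at result %d" % (page, offset_for_page(page))
# page 0 starts at result 0
# page 1 starts at result 10
# page 2 starts at result 20
```

Keep that offset arithmetic in mind - it's exactly what we're about to find in Google's query string.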
278 |
279 | Once you're satisfied, let's now try to look for our "from" parameter.
280 |
281 | The closest thing that I was able to find was "start=10"
282 |
283 | That should do it; let's try to add that bit to the end of our shorter url+query:
284 |
285 |     https://www.google.com/search?q=new+york+times&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&start=10
286 |
287 | Alright! Now change the value to 20; your entire url+query string will look like:
288 |
289 |     https://www.google.com/search?q=new+york+times&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&start=20
290 |
291 | If you checked what page you were on after the last query, you would be on the 3rd page :)
292 |
293 | That should do it. What we've just done is dissect the necessary
294 | parameters for custom-date-range and paged queries!
295 |
296 | Part II
297 | =======
298 |
299 | Motivation
300 | ----------
301 |
302 | So what was the point of the last section? Well, there is no "point." That section is simply an explanation. The explanation should be clear - it's about simplifying a query string. In the process of simplifying said query string, I hope I was able to show the typical process that one goes through when **beginning** to create a crawler that's designed to crawl search engines.
303 | You can apply the same query-simplifying principle to Amazon, Yahoo, Bing, etc. But I should emphasize this: the key is not to simplify for simplicity's sake; the key is for you to understand the fields and values - the parameter names, the "language" that separates the searcher from the searching.
304 |
305 | Now, here's where things will start to get interesting. Instead of going straight to Scrapy, let's just bring out a very simple Python tool: lxml
306 |
307 |
308 | The Easy Way is Usually Best
309 | ----------------------------
310 |
311 | I know I said we would likely use Scrapy, but now that I think about it, why should we?
312 |
313 | Let me specify what we need from Google:
314 |
315 | 1. We need to be able to search for a specific set of documents within a range of time. Check - we figured out the field, value pair for date ranges
316 |
317 | 2. We need to be able to generate a lot of results. Check - we figured out how to request the next page, and the next, and the next...
318 |
319 | 3. We need to **download** Google's search results as HTML, **select** the results and **store the links.**
320 |
321 | Seeing that the first two requirements are more abstract/generalized explanations
322 | of what is required *overall*, the third requirement is pretty straightforward.
323 |
324 |
325 | Ok. Let's look at what I laid out in step 3:
326 |
327 | A. download
328 |
329 | B. select
330 |
331 | C. store
332 |
333 |
334 | There are three steps there, and we're going to translate those three steps directly
335 | into Python code.
336 |
337 | Download search results
338 | ------------------------
339 |
340 | ... into memory.
341 |
342 | There's a bit of preliminary work required before we're able to *select* our *results* - and what exactly do I mean by that? [I'll answer that later.](#select-search-results)
343 |
344 | This step leads directly to the Python library lxml. I'm going to be very brief. lxml.html is a module that can download, parse
345 | and select nodes in an HTML tree. That's all the power we need.
346 |
347 | To select, we're going to use your browser's developer tools - but more on that in a minute.
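Before diving in, here's the shape of what we're about to build - the three steps as function stubs. The names are mine for illustration; the actual scripts below don't define them this way:

```python
# Skeleton of the three-step plan; each stub gets fleshed out below.

def download(url):
    """A. Fetch Google's results page into memory."""

def select(tree):
    """B. Pick the result hyperlinks out of the parsed HTML tree."""

def store(links):
    """C. Keep the links - in memory, on stdout, in a file..."""
```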
348 |
349 | Note: the following Python steps can be downloaded in their entirety [here](simpledownload.py)
350 | Note: the '\' is only for visual purposes; the file itself will have the url declared on a single line.
351 |
352 | ```python
353 | import lxml
354 |
355 | # url and query string from PART I
356 | # this is a custom range from Jan 1, 2000 to Jan 1, 2001
357 | url = 'https://www.google.com/search?q=new+york+times&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&start=10'
358 | ```
359 |
360 | Now what we just did was simple enough: we imported lxml and declared the query string that we derived in Part I.
361 |
362 | But let's begin to "modularize" this a little bit:
363 |
364 | ```python
365 | from lxml import html
366 |
367 | # To address paging in Google
368 | PAGE = 0
369 |
370 | # url and query string from PART I
371 | # this is a custom range from Jan 1, 2000 to Jan 1, 2001
372 | URL = 'https://www.google.com/search?q=new+york+times&\
373 | tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001\
374 | &start=' + str(PAGE*10)
375 | ```
376 |
377 | Ok, I also reformatted the URL string so that you can see the different parts of the
378 | query string more clearly.
379 |
380 | Also notice the "PAGE*10"
381 |
382 | Now let's see lxml in action. We'll use the [lxml.html.parse](http://lxml.de/lxmlhtml.html#parsing-html) function to download and parse the page we would
383 | have gotten if we had copied and pasted the query in question:
384 |
385 | Note: We are also going to use lxml.html.tostring in order to quickly see what we get, but this should be
386 | a little obvious to you by now
387 |
388 |
389 | ```python
390 | from lxml import html
391 |
392 | # To address paging in Google
393 | PAGE = 0
394 |
395 | # url and query string from PART I
396 | # this is a custom range from Jan 1, 2000 to Jan 1, 2001
397 | URL = 'https://www.google.com/search?q=new+york+times&\
398 | tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001\
399 | &start=' + str(PAGE*10)
400 |
401 | google_parsed = html.parse(URL)
402 |
403 | print html.tostring(google_parsed)
404 | ```
405 |
406 | Now if you try running that, you'll likely get an error.
407 |
408 | Refer to this [S.O. post](http://stackoverflow.com/questions/11450649/python-urllib2-cant-get-google-url) to find out why.
409 |
410 | Here's the updated [script](simpledownload.py):
411 |
412 | ```python
413 | import urllib2
414 |
415 | from lxml import html
416 |
417 | # To address paging in Google
418 | PAGE = 0
419 |
420 | # url and query string from PART I
421 | # this is a custom range from Jan 1, 2000 to Jan 1, 2001
422 | URL = 'https://www.google.com/search?q=new+york+times&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&start=' + str(PAGE*10)
423 |
424 | # here we set up the necessary opener to download a google html page
425 | opener = urllib2.build_opener()
426 | # this line will work, but Google fails to respond to the custom date range
427 | # opener.addheaders = [('User-agent', 'Mozilla/5.0')]
428 | opener.addheaders = [('User-agent',
429 |                       'Mozilla/5.0 (Windows NT 6.3; WOW64) \
430 |                       AppleWebKit/537.36 (KHTML, like Gecko) \
431 |                       Chrome/39.0.2171.95 Safari/537.36 \
432 |                       OPR/26.0.1656.60')]
433 |
434 |
435 | # let's download
436 | google_html = opener.open(URL)
437 |
438 | # parse the html
439 | google_parsed = html.parse(google_html)
440 | print html.tostring(google_parsed)
441 | ```
442 |
443 | You'll notice that we've added urllib2, no biggie.
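An aside, in case hand-writing those %-escapes feels error-prone: the standard library can build the same query string for us. This is just a sketch, not part of simpledownload.py (Python 2's urllib.urlencode):

```python
import urllib

# urlencode percent-escapes each value, so we can write the tbs
# parameter in its readable, decoded form
params = urllib.urlencode([
    ('q', 'new york times'),  # becomes q=new+york+times
    ('tbs', 'cdr:1,cd_min:1/1/2000,cd_max:1/1/2001'),
    ('start', 0),
])

URL = 'https://www.google.com/search?' + params
print URL
# https://www.google.com/search?q=new+york+times&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&start=0
```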
444 |
445 | The actual script can be found [here](simpledownload.py)
446 |
447 | Now, there's a bit of unintentional misinformation/presumption in the above user-agent "fix."
448 |
449 | First, I assumed it would work. I was wrong, and this set me back a couple of hours.
450 |
451 | Second, you'll have read in earlier editions of this README that I went further into the process without
452 | realizing that the HTML tree I was working with was wrong.
453 |
454 |
455 | What I should have done, instead of copying & pasting the user-agent from the S.O. post, was
456 | bring up my [browser's developer tools](https://www.google.com/search?client=opera&q=how+to+open+developer+tools&sourceid=opera&ie=UTF-8&oe=UTF-8).
457 |
458 | Select search results
459 | -------------------------
460 |
461 | ... using xpath.
462 |
463 | Here are a few screenshots of what I did:
464 |
465 | Search using the custom date range filter
466 | ![Search using the custom date range](pictures/custom-date-range-Google-Search.png?raw=true "")
467 |
468 |
469 | Open my browser's developer tools
470 | ![Open dev. tools](pictures/right-click-open-devtools.png?raw=true "Open Developer Tools")
471 |
472 | Click the Network tab
473 | ![Click the Network tab](pictures/click-network-tab.png?raw=true "Click the Network tab")
474 |
475 | Click the Documents subtab
476 | ![Click the Documents subtab](pictures/click-Documents-subtab.png?raw=true "Click the Documents subtab")
477 |
478 | Highlight the search document
479 | ![Highlight the search document](pictures/highlight-search-document.png?raw=true "Highlight the search document")
480 |
481 | Highlight the request headers
482 | ![Highlight the request headers](pictures/highlight-request-headers.png?raw=true "Highlight the request headers")
483 |
484 | Copy the user agent value
485 | ![Copy the user agent value](pictures/copy-user-agent.png?raw=true "Copy the user agent value")
486 |
487 |
488 | Wooo! That might have seemed like an eyeful, but trust me, it was better than using any of the results in this [Google search](https://www.google.com/search?client=opera&q=google+custom+search+with+python&sourceid=opera&ie=UTF-8&oe=UTF-8#q=how+to+request+google+search+with+python)
489 |
490 | Here's what our updated simpleselect.py script looks like:
491 |
492 | ```python
493 | import urllib2
494 |
495 | from lxml import html
496 |
497 | # To address paging in Google
498 | PAGE = 0
499 |
500 | # url and query string from PART I
501 | # this is a custom range from Jan 1, 2000 to Jan 1, 2001
502 | URL = 'https://www.google.com/search?q=new+york+times&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&start=' + str(PAGE*10)
503 |
504 | # here we set up the necessary agent to download a google html page
505 | opener = urllib2.build_opener()
506 | opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60')]
507 |
508 | # let's download
509 | google_html = opener.open(URL)
510 |
511 | # parse the html
512 | google_parsed = html.parse(google_html)
513 |
514 | # Here's a smarter way to see what exactly it is you've downloaded/parsed with lxml:
515 | html.open_in_browser(google_parsed)
516 | #file://c:/users/rodrigo/appdata/local/temp/tmp1xllau.html
517 | ```
518 |
519 | Note: in case the last line of the above code doesn't actually open a browser for you,
520 | copy and paste the path that's printed into your browser.
521 |
522 | Here's what I see:
523 |
524 | Correctly downloaded Google HTML (look at the dates)!
525 | ![Correctly downloaded Google HTML](pictures/correct-lxml-download.png?raw=true "Correctly downloaded Google HTML")
526 |
527 | Now, let's use the devtools once more and see if we can find the xpath that selects the parent HTML node/element
528 | that contains the hyperlinks to those outside New York Times-related websites:
529 |
530 | Use the magnifying glass on the top-left of the devtools window
531 | ![Select the parent node which highlights the DOM box containing results](pictures/use-magnifying-glass.png?raw=true "Select search results parent node")
532 |
533 | Now right-click and copy the xpath
534 |
535 | ![Right-click and copy xpath](pictures/right-click-copy-xpath.png?raw=true "Right-click and copy xpath")
536 |
537 |
538 | Now, we can update our simpleselect.py script by adding this piece of code:
539 |
540 | ```python
541 | # Here comes the 'selecting'!
542 | google_results = google_parsed.xpath('//*[@id="rso"]/div[2]')
543 |
544 | # the xpath in this line basically selects all children, which in our
545 | # case are the 10 'li' elements
546 | print len(google_results[0].xpath('./*'))
547 | #10
548 | ```
549 |
550 | If you'd like to see the links we need to have stored, add these lines to your console or script:
551 |
552 | ```python
553 | # print out hyperlinks
554 | # Note: after using devtool's magnifying glass and 'copy xpath', I got:
555 | # //*[@id="rso"]/div[2]/li[1]/div/h3/a
556 | google_list_items = google_results[0].xpath('.//h3/a/@href')
557 | for elem in google_list_items:
558 |     print elem
559 | ```
560 |
561 | Download [simpleselect.py](simpleselect.py)
562 |
563 |
564 | Finally, we've come to part C.
565 |
566 | Store website links
567 | -------------------
568 |
569 | ... again, into memory or...
570 |
571 | Really, this part is up to the imagination of whoever's implementing it.
572 |
573 | We can create some crazy multi-modular Python package, or we can leave this as a simple script
574 | that takes in 2 or 3 command line arguments.
575 |
576 | Let's do the latter first.
577 |
578 | Let's specify the following arguments:
579 |
580 | * custom year range
581 |     * e.g. 2000,2001
582 | * number of Google Search pages to gather up links from
583 |     * e.g. 3
584 | * and search query
585 |     * e.g. New York Times
586 |
587 | We'll just have the script print to console, and we'll redirect ('>', '>>', '|') that into a text file.
588 |
589 | Oh, there's one more thing! We have to make sure we don't get our IP blocked by Google.
590 | So in between one page of results and the next, we'll delay the download by 1 second.
591 |
592 |
593 | Google By Year
594 | --------------
595 |
596 | Finally, here's the [script](googlebyear.py) I'll be using to build up a long list of potential
597 | sites which we can download, store, hand-extract, etc. A sketch of its core loop follows.
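To make the delay-and-print plan concrete, here's roughly what that core loop looks like. This is a hedged sketch with illustrative names only; the real implementation (argument parsing, the date-range tbs parameter, error handling) lives in googlebyear.py:

```python
import time
import urllib2

from lxml import html

def collect_links(query, total_pages):
    # illustrative sketch of the core loop, not the actual googlebyear.py code
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64)')]

    links = []
    for page in range(total_pages):
        # same 'start' offset arithmetic we dissected in Part I
        url = ('https://www.google.com/search?q=' + query.replace(' ', '+') +
               '&start=' + str(page * 10))
        tree = html.parse(opener.open(url))
        # same h3/a selector we found with the devtools
        links.extend(tree.xpath('//*[@id="rso"]//h3/a/@href'))
        time.sleep(1)  # 1-second delay between result pages, to avoid an IP block
    return links
```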
598 | 
599 | Here's a little demonstration of how to use this little script:
600 | 
601 | ```python
602 | from googlebyear import search
603 | 
604 | links = search("new york times", total_pages=2, year_range=(2000,2001), debug=True)
605 | # total_pages:  2
606 | # year_range:  (2000, 2001)
607 | # url:  https://www.google.com/search?q=new+york+times&start=0&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001
608 | # url:  https://www.google.com/search?q=new+york+times&start=10&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001
609 | ```
610 | 
611 | and stored within "links" is:
612 | 
613 | ```python
614 | ['http://www.nytimes.com/content/help/search/search/search.html',
615 |  'http://www.nytimes.com/books/00/12/24/specials/fitzgerald-gatsby.html',
616 |  'http://www.nytimes.com/books/00/01/09/specials/joyce-ulysses.html',
617 |  'http://www.nytimes.com/travel/guides/europe/portugal/sintra/overview.html',
618 |  'http://www.nytimes.com/2000/01/01/us/c-a-correction-welcome-to-51254-000005.html',
619 |  'http://www.nytimes.com/2000/04/09/us/they-threaten-seethe-and-unhinge-then-kill-in-quantity.html',
620 |  ...]
621 | ```
622 | 
623 | But wait, there's more!
624 | 
625 | It's also a command line script!
626 | 
627 | ```bash
628 | googlebyear.py "new york times" -p 2 -y 2000 2001
629 | ```
630 | 
631 | Note: don't forget to symlink!
632 | 
633 | Bittersweet conclusion
634 | ----------
635 | 
636 | While this may seem like a neat script - and I'm struggling not to sound passive-aggressive - the links themselves
637 | serve no purpose for this project. The goal was to build a data set of pages as they existed in those periods
638 | (2000, 2005, etc.), but if you visit any of those links, you'll see that most of them send
639 | you to the pages as they exist today, served by updated web frameworks - not the year-2000 content we're after.
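You can verify this quickly: follow one of the returned links and look at where you actually land (a throwaway check, not part of the scripts; the example URL is one of the links returned above):

```python
import urllib2

# one of the links returned by search() above
resp = urllib2.urlopen('http://www.nytimes.com/books/00/12/24/specials/fitzgerald-gatsby.html')
print resp.geturl()   # often a redirect onto the present-day site,
print resp.getcode()  # not a year-2000 snapshot of the page
```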
640 | 
641 | 
642 | Next, and last resort: the Way Back Machine
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/google_2000/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/google_2000/__init__.py
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/google_2000/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class Google2000Item(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/google_2000/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | 
8 | 
9 | class Google2000Pipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/google_2000/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Scrapy settings for google_2000 project
4 | #
5 | # For simplicity, this file contains only the most important settings by
6 | # default. All the other settings are documented here:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | #
10 | 
11 | BOT_NAME = 'google_2000'
12 | 
13 | SPIDER_MODULES = ['google_2000.spiders']
14 | NEWSPIDER_MODULE = 'google_2000.spiders'
15 | 
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'google_2000 (+http://www.yourdomain.com)'
18 | 
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/google_2000/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/google_2000/spiders/google_2000_spider.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/google_2000/spiders/google_2000_spider.py
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/googlebyear.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | """googlebyear.py
4 | 
5 | It's Google Search! But stripped of
6 | all its user-friendliness!
7 | 
8 | This was written for the crawl-to-the-future project.
9 | """
10 | 
11 | import time
12 | import argparse
13 | import urllib
14 | import urllib2
15 | 
16 | from lxml import html
17 | 
18 | def search(terms, total_pages=1, year_range=None, debug=False):
19 |     """It's the minimalist programmer's favorite
20 |     way to search on Google.
21 | 
22 |     total_pages - ie. 2
23 |     year_range - ie. [2000,2001]
24 |     """
25 |     if debug:
26 |         print "total_pages: ", str(total_pages)
27 |         print "year_range: ", str(year_range)
28 | 
29 |     query = urllib.urlencode({'q': terms})
30 |     #terms = terms.replace(' ','+')
31 | 
32 |     page = 0
33 |     links = []
34 |     while page < total_pages:
35 | 
36 |         custom_range = ''
37 | 
38 |         if year_range is not None:
39 |             custom_range = '&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F' +\
40 |                            str(year_range[0]) + '%2Ccd_max%3A1%2F1%2F' +\
41 |                            str(year_range[1])
42 | 
43 |         url = 'https://www.google.com/search'+\
44 |               '?'+ query + '&start=' + str(page*10) + custom_range
45 | 
46 |         # here we set up the necessary agent to download a google html page
47 |         opener = urllib2.build_opener()
48 |         opener.addheaders = [('User-agent',
49 |                               'Mozilla/5.0 (Windows NT 6.3; WOW64) \
50 |                               AppleWebKit/537.36 (KHTML, like Gecko) \
51 |                               Chrome/39.0.2171.95 Safari/537.36 \
52 |                               OPR/26.0.1656.60')]
53 | 
54 |         # let's download
55 |         google_html = opener.open(url)
56 | 
57 |         # parse the html
58 |         google_parsed = html.parse(google_html)
59 | 
60 |         # Here comes the 'selecting'!
61 |         google_results = google_parsed.xpath('//*[@id="rso"]/div[2]')
62 | 
63 |         # print out hyperlinks
64 |         # Note: after using devtool's magnifying glass and 'copy xpath', I got:
65 |         # //*[@id="rso"]/div[2]/li[1]/div/h3/a
66 |         if not google_results[0].xpath('.//h3/a/@href'):
67 |             links.extend(google_results[0].xpath('.//a/@href'))
68 |         else:
69 |             links.extend(google_results[0].xpath('.//h3/a/@href'))
70 | 
71 |         if debug:
72 |             print "url: ", url
73 | 
74 |         page += 1
75 |         time.sleep(1)
76 | 
77 |     return links
78 | 
79 | 
80 | if __name__ == '__main__':
81 |     PARSER = argparse.ArgumentParser(description='A simple tool used to \
82 |                                      extract the resulting webpage links \
83 |                                      from Google Search.')
84 | 
85 |     PARSER.add_argument('terms', metavar='t', type=str,
86 |                         help='the terms to search on Google')
87 | 
88 |     PARSER.add_argument('-p', '--page-count', metavar='pg', type=int,
89 |                         help='the number of resulting pages to extract \
90 |                               links from')
91 | 
92 |     PARSER.add_argument('-y', '--year-range', metavar='yr', nargs=2,
93 |                         help='the range of years to search through, ie. \
94 |                               2000 2001')
95 | 
96 |     PARSER.add_argument('-d', '--debug', action='store_true',
97 |                         help='print debug messages, such as the url of \
98 |                               each requested results page')
99 | 
100 |     ARGS = PARSER.parse_args()
101 | 
102 |     for link in search(ARGS.terms, ARGS.page_count or 1, ARGS.year_range, debug=ARGS.debug):
103 |         print link
104 | 
105 | 
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/output.txt:
--------------------------------------------------------------------------------
1 | https://www.gnu.org/fun/jokes/helloworld.ko.html
2 | http://www.keil.com/download/docs/73.asp
3 | http://www.cafeaulait.org/books/jdr/chapters/HelloWorld.html
4 | https://docs.oracle.com/cd/E13211_01/wle/wle50/rmi/samphelo.htm
5 | http://cs.lmu.edu/~ray/notes/introml/
6 | http://cm.bell-labs.com/who/dmr/bintro.html
7 | http://msdn.microsoft.com/en-us/library/ms524741%28v=vs.90%29.aspx
8 | https://courses.cs.washington.edu/courses/cse378/00au/ctomips1.pdf
9 | http://www.tcm.phy.cam.ac.uk/~mjr/C/hello_world.html
10 | http://help.voxeo.com/go/help/xml.callxml.learning.serverside.coldfus
11 | http://www.asx.com.au/asx/research/companyInfo.do?by=asxCode&asxCode=HLO
12 | http://pages.cs.wisc.edu/~ghost/doc/intro.htm
13 | http://mahi.ucsd.edu/shearer/COMPCLASS/c.txt
14 | http://tdc-www.harvard.edu/IOTA/TECHNICAL/VxWTutorial.html
15 | http://www.arl.wustl.edu/projects/fpx/workshop_0101/fpx_hello.pdf
16 | http://www.codeproject.com/Articles/863/Your-first-C-Web-Service
17 | http://www.rogerharford.com/hello-world-mvc
18 | http://www.ccp14.ac.uk/tutorial/compiler/cygnus-gnuwin32/
19 | http://www.coderanch.com/t/386681/java/java/HelloWorld-java-txt
20 | http://mail-archives.apache.org/mod_mbox/cocoon-users/200003.mbox/%3C1161B9F1E3E5D111A25700A02461F2AB02B37EBA@usrymx04.merck.com%3E
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/pictures/click-Documents-subtab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/click-Documents-subtab.png
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/pictures/click-network-tab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/click-network-tab.png
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/pictures/copy-user-agent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/copy-user-agent.png
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/pictures/correct-lxml-download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/correct-lxml-download.png
--------------------------------------------------------------------------------
/crawlers/Crawling-Google/pictures/correct-user-agent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/correct-user-agent.png -------------------------------------------------------------------------------- /crawlers/Crawling-Google/pictures/custom-date-range-Google-Search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/custom-date-range-Google-Search.png -------------------------------------------------------------------------------- /crawlers/Crawling-Google/pictures/finding-Google-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/finding-Google-results.png -------------------------------------------------------------------------------- /crawlers/Crawling-Google/pictures/highlight-request-headers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/highlight-request-headers.png -------------------------------------------------------------------------------- /crawlers/Crawling-Google/pictures/highlight-search-document.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/highlight-search-document.png -------------------------------------------------------------------------------- /crawlers/Crawling-Google/pictures/right-click-copy-xpath.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/right-click-copy-xpath.png -------------------------------------------------------------------------------- /crawlers/Crawling-Google/pictures/right-click-open-devtools.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/right-click-open-devtools.png -------------------------------------------------------------------------------- /crawlers/Crawling-Google/pictures/use-magnifying-glass.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Crawling-Google/pictures/use-magnifying-glass.png -------------------------------------------------------------------------------- /crawlers/Crawling-Google/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = google_2000.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = google_2000 12 | -------------------------------------------------------------------------------- 
/crawlers/Crawling-Google/simpledownload.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | 3 | from lxml import html 4 | 5 | # To address paging in Google 6 | PAGE = 0 7 | 8 | # url and query string from PART I 9 | # this is a custom range from Jan 1, 2000 to Jan 1, 2001 10 | URL = 'https://www.google.com/search?q=new+york+times&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&start=' + str(PAGE*10) 11 | 12 | # here we setup the necessary agent to download a google html page 13 | opener = urllib2.build_opener() 14 | opener.addheaders = [('User-agent', 15 | 'Mozilla/5.0 (Windows NT 6.3; WOW64) \ 16 | AppleWebKit/537.36 (KHTML, like Gecko) \ 17 | Chrome/39.0.2171.95 Safari/537.36 \ 18 | OPR/26.0.1656.60')] 19 | 20 | 21 | # let's download 22 | google_html = opener.open(URL) 23 | 24 | # parse the html 25 | google_parsed = html.parse(google_html) 26 | 27 | #print html.tostring(google_parsed) 28 | -------------------------------------------------------------------------------- /crawlers/Crawling-Google/simpleselect.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | 3 | from lxml import html 4 | 5 | # To address paging in Google 6 | PAGE = 0 7 | 8 | # url and query string from PART I 9 | # this is a custom range from Jan 1, 2000 to Jan 1, 2001 10 | URL = 'https://www.google.com/search?q=new+york+times&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2000%2Ccd_max%3A1%2F1%2F2001&start=' + str(PAGE*10) 11 | 12 | # here we setup the necessary agent to download a google html page 13 | opener = urllib2.build_opener() 14 | opener.addheaders = [('User-agent', 15 | 'Mozilla/5.0 (Windows NT 6.3; WOW64) \ 16 | AppleWebKit/537.36 (KHTML, like Gecko) \ 17 | Chrome/39.0.2171.95 Safari/537.36 \ 18 | OPR/26.0.1656.60')] 19 | 20 | # let's download 21 | google_html = opener.open(URL) 22 | 23 | # parse the html 24 | google_parsed = html.parse(google_html) 25 | 26 | # Here's a smarter way to see what exactly it is you've downloaded/parsed with lxml: 27 | html.open_in_browser(google_parsed) 28 | #file://c:/users/rodrigo/appdata/local/temp/tmp1xllau.html 29 | 30 | # Here comes the 'selecting'! 31 | google_results = google_parsed.xpath('//*[@id="rso"]/div[2]') 32 | 33 | print len(google_results) 34 | #1 35 | 36 | # the xpath in this line basically selects all children, which in our 37 | # case are the 10 'li' elements 38 | print len(google_results[0].xpath('./*')) 39 | #10 40 | 41 | # print out hyperlinks 42 | # Note: after using devtool's magnifying glass and 'copy xpath', I got: 43 | # //*[@id="rso"]/div[2]/li[1]/div/h3/a 44 | google_list_items = google_results[0].xpath('.//h3/a/@href') 45 | for elem in google_list_items: 46 | print elem 47 | 48 | -------------------------------------------------------------------------------- /crawlers/Way-Back/README.md: -------------------------------------------------------------------------------- 1 | Way Back 2 | ======== 3 | 4 | Similar in purpose to [*Crawling-Google*](../Crawling-Google), this directory will 5 | hold the files, notes and scripts related to *trying to build an archival/historical* 6 | set of websites. 7 | 8 | Fortunately, there's more discussion about trying to download archived websites from 9 | the past, as you'll read [here](http://superuser.com/questions/828907/how-to-download-a-website-from-the-archive-org-wayback-machine). 
10 | 
11 | Unlike my attempt with [*Crawling Google*](../Crawling-Google#bittersweet-conclusion),
12 | it does not seem like I will have to "hack" away at query strings, experiment with
13 | user-agent values, etc. I hope that this will translate to finishing this part of the
14 | project sooner.
15 | 
16 | ---
17 | 
18 | T.o.C.
19 | -----
20 | 
21 | 1. [The archival year](#the-archival-year)
22 | 
23 | 2. [Intro to WayBackTrack](#intro-to-waybacktrack)
24 | 
25 | 3. [WayBackTrack](#waybacktrack)
26 | 
27 | ---
28 | 
29 | Let's begin with this [post](http://superuser.com/questions/828907/how-to-download-a-website-from-the-archive-org-wayback-machine), which I copy and paste a few lines down.
30 | 
31 | While there are a few other resources you'll find if you Google "how to download WayBack archives", that
32 | *superuser* post may be the most succinct description of how we should go about this problem.
33 | 
34 | First, let's start with requesting, parsing, and opening an arbitrary archived page from the WayBack Machine,
35 | just to make sure we don't start off on the wrong foot.
36 | 
37 | ```python
38 | from lxml import html
39 | 
40 | ia_year_url = "http://web.archive.org/web/20001216014200/http://nytimes.com"
41 | 
42 | ia_parsed = html.parse(ia_year_url)
43 | 
44 | html.open_in_browser(ia_parsed)
45 | # file://c:/users/rodrigo/appdata/local/temp/tmpr46riv.html
46 | ```
47 | 
48 | Note: if your browser doesn't pop up with the page you've just parsed, just copy and paste
49 | the filepath into your browser.
50 | 
51 | ---
52 | 
53 | From my end, so far so good :)
54 | 
55 | Now take a minute to read the *superuser* post:
56 | 
57 | > The idea is to use some of the nice URL features of the wayback machine:
58 | >
59 | > * http://web.archive.org/web/*/http://domain/* will list all saved pages from http://domain/ recursively. It can be used to construct an index of pages to download and avoid heuristics to detect links in webpages. For each link, there is also the date of the first version and the last version.
60 | > * http://web.archive.org/web/YYYYMMDDhhmmss*/http://domain/page will list all versions of http://domain/page for year YYYY. Within that page, specific links to versions can be found (with exact timestamp)
61 | > * http://web.archive.org/web/YYYYMMDDhhmmssid_/http://domain/page will return the unmodified page http://domain/page at the given timestamp. Notice the id_ token.
62 | 
63 | 
64 | Alright, you heard the man, let's experiment with the following url:
65 | 
66 |     http://web.archive.org/web/2000*/http://www.nytimes.com/
67 | 
68 | All we've specified in the above url is that we want **all** snapshots belonging to the New York Times domain from the year 2000.
69 | 
70 | Now take a look at this screenshot:
71 | 
72 | ![Way Back makes it easy](pictures/way-back-makes-it-easy.png?raw=true "Way Back makes it easy")
73 | 
74 | Do you notice the "id" attribute in each of those month divs?!
75 | 
76 | This makes my life so much easier in terms of being able to select the urls leading to
77 | archived pages. Let's add the following selection:
78 | 
79 | ```python
80 | ...
81 | ia_parsed = html.parse(ia_year_url)
82 | 
83 | nov_elem = ia_parsed.xpath('//*[@id="2000-11"]//a/@href')
84 | # this is good!
85 | # ['/web/20001201203900/http://www10.nytimes.com/',
86 | #  '/web/20001201203900/http://www10.nytimes.com/',
87 | #  '/web/20001202140900/http://www.nytimes.com/',
88 | #  '/web/20001202140900/http://www.nytimes.com/',
89 | #  '/web/20001204142500/http://nytimes.com/',
90 | #  ...]
91 | ```
92 | 
93 | Well...
what about getting all of NYTimes's archived sites from the year 2000? While we could just
94 | loop 12 times, specifying a different month in the 'id' string on each pass, what about using wild
95 | cards instead (ie. [@id='2000-*'])?
96 | 
97 | It turns out xpath doesn't support that wild card notation. Instead, we can use [xpath's "starts-with" function](http://stackoverflow.com/questions/2556897/yql-how-to-use-wildcard-in-xpath).
98 | 
99 | ```python
100 | nyt_2000_archived = list(set(ia_parsed.xpath('//*[starts-with(@id,"2000-")]//a/@href')))
101 | ```
102 | 
103 | The line above provides a list of url-strings referencing various www.nytimes.com
104 | snapshots (each string is a different snapshot url).
105 | 
106 | What would be cool is if we could somehow crawl not only the index.html, but also the
107 | forward links **in** that index.html.
108 | 
109 | There's an issue that comes up, though. This issue is best shown, not told.
110 | 
111 | Look at this NYTimes.com archive from Nov. 19, 2000:
112 | 
113 | ![NYTimes Nov. 19, 2000](pictures/nytimes-2000-11-19.png?raw=true "NYTimes Nov. 19, 2000")
114 | 
115 | Now let's click on the picture of Bill Clinton and Macaulay Culkin:
116 | 
117 | ![Clinton and Culkin, 2002](pictures/clinton-culkin-2002-02-09.png?raw=true "Clinton and Culkin, 2002")
118 | 
119 | 
120 | Take a good look at the first four digits after "http://web.archive.org/web/" in your address bar.
121 | 
122 | 
123 | You should see 2002. This is not good. When we should be building a dataset of archived websites
124 | from within the same year (or the fifth or tenth year after it), we are getting an archive
125 | from the year 2002.
126 | 
127 | The Archival Year
128 | -----------------
129 | 
130 | So, here we can lay out a specification, in the context of using the Way Back Machine as our
131 | original source of archives. The archived page has to be from the year 2000, 2005, or 2010;
132 | this is confirmed by the beginning of the url:
133 | 
134 |     http://web.archive.org/web/[2000,2005,2010]
135 | 
136 | Expressed in words, the four digits following "http://web.archive.org/web/" have to be
137 | either "2000", "2005", or "2010" - we do not need archived 2015 websites.
138 | 
139 | 
140 | ---
141 | 
142 | The above specification can be expressed in code in the following way:
143 | ```python
144 | ARCHIVE_DOMAIN = "http://web.archive.org"
145 | 
146 | def satisfies_archival_year(page, year):
147 |     parsed_page = html.parse(ARCHIVE_DOMAIN + page)
148 | 
149 |     archival_year_spec = ARCHIVE_DOMAIN + '/web/' + str(year)
150 | 
151 |     return parsed_page.docinfo.URL.startswith(archival_year_spec)
152 | 
153 | # ie.
154 | satisfies_archival_year('/web/20001119144900/http://www1.nytimes.com/subscribe/help/searchtips.html', 2000)
155 | # True, as long as the snapshot actually resolves within the year 2000
156 | ```
157 | 
158 | 
159 | For the current implementation of the script, what I do is simply two things:
160 | 
161 | 1. given a domain name (ie. www.nytimes.com) and an archive year, retrieve a list of "snapshots"
162 |    within that year.
163 | 
164 | 2. given an existing domain "snapshot" url, retrieve a list of forward-linking "snapshots" from
165 |    within the same year as the input domain.
166 | 
167 | We're trying to retrieve as many websites as possible from only two levels:
168 | the root domain page (ie. nytimes.com/index.html) and the forward-linked sites
169 | (ie. nytimes.com/pages-technology/index.html).
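Those two steps, distilled from [waybacktrack.py](waybacktrack.py) into a condensed sketch (no HTML cleaning, throttling, or error handling here - see the full script for those):

```python
from lxml import html

ARCHIVE_DOMAIN = "http://web.archive.org"

def domain_snapshots(domain, year):
    # step 1: every snapshot of `domain` listed under the given year
    calendar = html.parse(ARCHIVE_DOMAIN + "/web/" + str(year) +
                          "*/http://" + domain + "/")
    return list(set(calendar.xpath(
        '//*[starts-with(@id,"' + str(year) + '-")]//a/@href')))

def forwardlink_snapshots(snapshot):
    # step 2: links inside one snapshot that share its '/web/YYYY' prefix
    parsed = html.parse(ARCHIVE_DOMAIN + snapshot)
    return parsed.xpath('//a[starts-with(@href,"' + snapshot[:9] + '")]/@href')
```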
170 | 
171 | 
172 | Intro to WayBackTrack
173 | ---------------------
174 | 
175 | [waybacktrack.py](waybacktrack.py) is a script designed to interface with the
176 | WayBack Machine; the module starts with a domain (ie. www.nytimes.com or www.reuters.com)
177 | and then extracts forward links within that domain.
178 | 
179 | Let's visualize www.reuters.com circa May 11, 2000:
180 | 
181 | ![www.reuters.com circa 2000](pictures/reuters-2000.png?raw=true "Where's the news, reuters?")
182 | 
183 | Now, here's the xpath we use to extract hyperlinks/forward links (the empty
184 | prefix is filled in at runtime with the snapshot's own '/web/YYYY...' prefix):
185 | 
186 |     '//a[starts-with(@href,"")]/@href'
187 | 
188 | That xpath yields us a lot of forward links, which I then process and split
189 | into two lists: "flinks" and "duds"
190 | 
191 | ```
192 | print flinks
193 | ['/web/20000511182917/http://www.reuters.com/',
194 |  '/web/20000511182917/http://www.reuters.com/nav/mar_ads/mar_ad_campaign.htm',
195 |  '/web/20000511182917/http://www.reuters.com/nav/redir/promo001.html',
196 |  '/web/20000511182917/http://www.reuters.com/legal/disclaimer.htm',
197 |  '/web/20000511182917/http://www.reuters.com/legal/copyright.htm',
198 |  '/web/20000511182917/http://www.reuters.com/legal/privacy.htm']
199 | 
200 | print duds
201 | ['/web/20000511182917/http://www.reuters.com/products/',
202 |  '/web/20000511182917/http://www.reuters.com/investors/',
203 |  '/web/20000511182917/http://www.reuters.com/careers/',
204 |  '/web/20000511182917/http://www.reuters.com/aboutreuters/',
205 |  ...]
206 | # a lot more duds than flinks
207 | ```
208 | 
209 | Note: There are a few tests we have to do in order to separate the links into the
210 | above two lists; please refer to the source code for more info.
211 | 
212 | It's pretty easy to show a "dud" webpage:
213 | 
214 | ![Let's buy some reuters shirts!](pictures/reuters-products-2000.png?raw=true "Where's the products, reuters?")
215 | 
216 | And here's a reuters page that is "valid":
217 | 
218 | ![Market professionals, Reuters is hiring!](pictures/reuters-is-hiring.png?raw=true "Where's the jobs, reuters?")
219 | 
220 | WayBackTrack
221 | ------------
222 | 
223 | Usage, at the moment, is pretty simple. Just provide a domain name,
224 | the year, a directory where the .html files will be stored, and
225 | optionally you can provide a "percent"* and/or turn on debug
226 | messages.
227 | 
228 | *'percent' is the percentage of domain "snapshots" to process. The default value is
229 | '0', which means that only the first domain snapshot extracted (ie. www.reuters.com circa
230 | May 11, 2000) is crawled for forward links.
230 | 
231 | ###Usage
232 | 
233 | ```python
234 | import waybacktrack
235 | flinks, duds = waybacktrack.archive_domain(domain='www.reuters.com',
236 |                                            year=2000,
237 |                                            dir_path='path/to/directory/',
238 |                                            percent=10,
239 |                                            debug=True)
240 | 
241 | print flinks
242 | ```
243 | 
244 | Output:
245 | ```
246 | Extracting links from:  www.reuters.com
247 | Storing files in:  ../../dataset/www.reuters.com
248 | Number of domain snapshots:  42
249 | Number of domain snapshots to process:  1
250 | file name:  20000511182917_www.reuters.com_
251 | file name:  20000512004654_www.reuters.com_nav_mar_ads_mar_ad_campaign.htm
252 | file name:  20000619154054_www.reuters.com_nav_redir_promo001.html
253 | file name:  20000511102346_www.reuters.com_legal_disclaimer.htm
254 | file name:  20000511091512_www.reuters.com_legal_copyright.htm
255 | file name:  20000306080923_www.reuters.com_legal_privacy.htm
256 | Number of archived forward links:  6
257 | Number of duds:  14
258 | ```
259 | 
260 | Note: the directory will be automatically created, but this may and likely will
261 | bug out if this module is used outside of the overall project directory.
262 | 
263 | Final Remarks
264 | -------------
265 | 
266 | Alright! So it seems like we literally had luck on our side, at least as
267 | of this writing; Google custom date range search was a no-go (I should've
268 | thought that one through tbh - of course servers aren't going to be hosting
269 | original content from 15 years ago lol).
270 | 
271 | To say the least, the WayBack Machine delivered. Thanks [Brewster Kahle](http://brewster.kahle.org/)
272 | and everyone else at [archive.org](https://archive.org/) for keeping these websites
273 | alive and accessible! I'll try not to spam you with too many requests.
274 | 
--------------------------------------------------------------------------------
/crawlers/Way-Back/pictures/clinton-culkin-2002-02-09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Way-Back/pictures/clinton-culkin-2002-02-09.png
--------------------------------------------------------------------------------
/crawlers/Way-Back/pictures/nytimes-2000-11-19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Way-Back/pictures/nytimes-2000-11-19.png
--------------------------------------------------------------------------------
/crawlers/Way-Back/pictures/reuters-2000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Way-Back/pictures/reuters-2000.png
--------------------------------------------------------------------------------
/crawlers/Way-Back/pictures/reuters-is-hiring.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Way-Back/pictures/reuters-is-hiring.png
--------------------------------------------------------------------------------
/crawlers/Way-Back/pictures/reuters-products-2000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Way-Back/pictures/reuters-products-2000.png -------------------------------------------------------------------------------- /crawlers/Way-Back/pictures/way-back-makes-it-easy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/crawlers/Way-Back/pictures/way-back-makes-it-easy.png -------------------------------------------------------------------------------- /crawlers/Way-Back/waybacktrack.py: -------------------------------------------------------------------------------- 1 | """waybacktrack.py 2 | Use this to extract Way Back Machine's 3 | url-archives of any given domain! 4 | TODO: reiterate entire design! 5 | """ 6 | import time 7 | import os 8 | import urllib2 9 | import random 10 | from math import ceil 11 | 12 | try: 13 | from cStringIO import StringIO as BytesIO 14 | except ImportError: 15 | from io import BytesIO 16 | 17 | from lxml import html 18 | from lxml.html import clean 19 | 20 | ARCHIVE_DOMAIN = "http://web.archive.org" 21 | 22 | CURR_DIR = os.path.dirname(__file__) 23 | 24 | DATASET_DIR = os.path.join(CURR_DIR, '../../dataset/') 25 | 26 | 27 | def archive_domain(domain, year, dir_path=DATASET_DIR, 28 | percent=0, debug=False, throttle=1): 29 | """ 30 | domain 31 | 32 | @type domain: string 33 | @param domain: the domain of the website ie. www.nytimes.com 34 | 35 | @type year: int 36 | @param year: the year to extract archives from 37 | 38 | @type dir_path: string 39 | @param dir_path: the directory path to store archive, if 40 | empty, directory will automatically be created 41 | TODO: Think of better solution to storing 42 | downloaded archives 43 | 44 | @type percent: int 45 | @param percent: the percentage of Way Back archives to crawl 46 | 47 | @rtype: 48 | @return: Returns a list of archived sites 49 | """ 50 | # TODO: Improve this for module portability 51 | # WARNING: Module will likely break if used outside of 52 | # crawl-to-the-future project 53 | # automatically find or eventually create directory 54 | # based off domain name 55 | 56 | # Found way to check if file is being ran in crawl-to-the-future 57 | # super "hacky" though 58 | # TODO: Find better way to check if module is getting ran in 59 | # in crawl-to-the-future project 60 | if os.path.split( 61 | os.path.abspath(os.path.join(__file__, os.pardir)))[1] != "Way-Back": 62 | raise Exception("Please manually specify 'dir_name' value") 63 | 64 | 65 | if dir_path is DATASET_DIR: 66 | dir_path = os.path.join(dir_path, domain + '/') 67 | 68 | 69 | if not os.path.exists(dir_path): 70 | #raise IOError("[Errno 2] No such file or directory: '" + dir_path + "'") 71 | # this part is shady 72 | os.makedirs(dir_path) 73 | 74 | if not isinstance(dir_path, basestring): 75 | raise Exception("Directory - third arg. - path must be a string.") 76 | 77 | ia_year_url = ARCHIVE_DOMAIN + "/web/" + str(year) + \ 78 | "*/http://" + domain + "/" 79 | 80 | ia_parsed = html.parse(ia_year_url) 81 | 82 | domain_snapshots = list(set(ia_parsed.xpath('//*[starts-with(@id,"' + 83 | str(year) + '-")]//a/@href'))) 84 | 85 | #snapshot_age_span is a percentage of total snapshots to process from 86 | #the given year 87 | #ie. 
if percent is 100, and there are a total of 50 snapshots for
88 |     #www.cnn.com, we will crawl (to a depth of 1 atm) all 50 snapshots
89 |     snapshot_age_span = 1 if percent <= 0 \
90 |                         else len(domain_snapshots) - 1 \
91 |                         if percent >= 100 \
92 |                         else int(percent*len(domain_snapshots)/100)
93 | 
94 |     if debug:
95 |         print "Extracting links from: ", domain
96 | 
97 |         # http://margerytech.blogspot.com/2011/06/python-get-last-directory-name-in-path.html
98 |         print "Current directory: ", os.path.split(
99 |             os.path.abspath(os.path.join(__file__, os.pardir)))[1]
100 | 
101 |         print "Storing files in: ", os.path.abspath(dir_path)
102 | 
103 |         print "Number of domain snapshots: ", len(domain_snapshots)
104 | 
105 |         print "Number of domain snapshots to process: ", snapshot_age_span
106 | 
107 |     random.shuffle(domain_snapshots)
108 | 
109 |     forward_links = []
110 | 
111 |     # process only the first `snapshot_age_span` snapshots
112 |     for snapshot in domain_snapshots[:snapshot_age_span]:
113 | 
114 |         curr_snapshot_flinks = get_forwardlink_snapshots(snapshot)
115 | 
116 |         forward_links.extend(curr_snapshot_flinks)
117 | 
118 |         if debug:
119 |             print "snapshot url: ", snapshot
120 | 
121 |             print "forward link count: ", len(curr_snapshot_flinks)
122 | 
123 | 
124 |     random.shuffle(forward_links)
125 | 
126 |     if debug:
127 |         print "total number of forward links to download: ", len(forward_links)
128 | 
129 | 
130 | 
131 |     # archive forward links
132 |     archived_links = []
133 |     duds = []
134 |     for forwardlink in forward_links:
135 |         if archive(forwardlink, year, dir_path, debug, throttle):
136 |             archived_links.append(forwardlink)
137 |         else:
138 |             duds.append(forwardlink)
139 | 
140 |     if debug:
141 |         print "Number of archived forward links: ", len(archived_links)
142 |         print "Number of duds: ", len(duds)
143 |     return archived_links, duds
144 | 
145 | 
146 | # I know I'm breaking so many rules by not separating concerns
147 | def archive(page, year, dir_path, debug=False, throttle=1):
148 |     """
149 |     Check to see if a downloaded forward link
150 |     satisfies the archival year specification
151 |     ie. 
(2000, 2005, 2010) 152 | """ 153 | #files = [f for f in os.listdir(dir_path) if os.path.isfile(f)] 154 | if debug: 155 | print "requesting ", page 156 | 157 | page_file = page.rsplit('/web/')[1].replace('http://', '').replace('-','_') 158 | page_file = page_file.replace('/', '_').replace(':', '_').replace('&','_') 159 | page_file = page_file.replace('?', '_').replace('*','_').replace('=','_') 160 | 161 | file_path = dir_path + page_file 162 | if os.path.isfile(file_path): 163 | if debug: 164 | print "Already saved: ", page_file 165 | print 166 | return False 167 | 168 | try: 169 | html_file = urllib2.urlopen(ARCHIVE_DOMAIN + page) 170 | except IOError: 171 | if debug: 172 | print "Failed to open request for ", ARCHIVE_DOMAIN + page 173 | print 174 | return False 175 | 176 | if html_file.getcode() == 302: 177 | if debug: 178 | print "Got HTTP 302 response for ", ARCHIVE_DOMAIN + page 179 | print 180 | return False 181 | 182 | html_string = str(html_file.read()) 183 | 184 | if html_string.find("HTTP 302 response") != -1: 185 | if debug: 186 | print "Got HTTP 302 response for ", ARCHIVE_DOMAIN + page 187 | print 188 | return False 189 | 190 | archival_year_spec = ARCHIVE_DOMAIN + '/web/' + str(year) 191 | 192 | page_url = html_file.geturl() 193 | 194 | if page_url.startswith(archival_year_spec): 195 | 196 | if debug: 197 | print "saving ", page_url 198 | print 199 | 200 | try: 201 | with open(file_path, 'wb') as f: 202 | f.write(BytesIO(html_string).read()) 203 | 204 | time.sleep(throttle) 205 | 206 | except IOError as e: 207 | if debug: 208 | print "Got error: ", e 209 | return False 210 | 211 | return True 212 | else: 213 | return False 214 | 215 | 216 | def get_forwardlink_snapshots(parent_site): 217 | """ 218 | @type index: string 219 | @param index: the index.html page from which to extract forward links 220 | 221 | @type year: int 222 | @param year: the year to extract archives from 223 | """ 224 | try: 225 | parsed_parent_site = html.parse(ARCHIVE_DOMAIN+parent_site) 226 | except IOError: 227 | print "Did not get extract links in ", ARCHIVE_DOMAIN+parent_site 228 | return [] 229 | 230 | #cleaner = html.clean.Cleaner(scripts=True, javascript=True,style=True, kill_tags = ["img"]) 231 | cleaner = clean.Cleaner(scripts=True, javascript=True, comments=True, 232 | style=True, meta=True, processing_instructions=True, embedded=True, 233 | frames=True, forms=True, kill_tags=["noscript", "iframe", "img"]) 234 | 235 | parsed_parent_site = cleaner.clean_html(parsed_parent_site) 236 | 237 | # spec archival year 238 | # check to see if the archival year of a forwark link 239 | # is that of the parent (ie. 2000|2005|2010) 240 | all_forwardlinks = parsed_parent_site.xpath('//a[starts-with(@href,"' + 241 | parent_site[:9] +'")]/@href') 242 | 243 | return all_forwardlinks 244 | 245 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | Datasets - websites from 2000, 2005, 2010, and 2015 2 | ----------------------------------------------------- 3 | 4 | This directory is where you'll find an on-going accumulation of 5 | websites from the following domains: 6 | 7 | * news.bbc.co.uk 8 | * cnn.com 9 | * news.yahoo.com 10 | * thenation.com 11 | * latimes.com 12 | * entertainment.msn.com 13 | * foxnews.com 14 | * forbes.com 15 | * nymag.com 16 | * esquire.com 17 | 18 | ###Update 1/23/2015 19 | 20 | This update is long overdue. 
Without saying too much about how I've lost all faith in so-called "triple-padded ergonomic office chairs", I'm glad to say
21 | that there are 1000+ sites and 1000+ text files (containing the extracted articles) from 10 domains, originally published in the years 2000, 2005, 2010, and 2015.
22 | 
23 | I unfortunately didn't have time to document the process, but if I were to summarize what I did, it would be this:
24 | 
25 | 1. Run all sites through a content-extractor (*cough* [eatiht](http://github.com/rodricios/eatiht) *cough*)
26 | 2. Hand-verify the resulting text files to make sure they contain: article, title, author, and date
27 | 
28 | A disclaimer: this is a **dataset**, not a testset, yet.
29 | 
30 | I've done only one comprehensive scan of each htmlfile-textfile pair. If you do plan on using this dataset, please be aware that you may
31 | and likely will find a duplicate.
32 | 
33 | That said, I'll have a *testset* uploaded once I've been able to do another comprehensive scan (in progress) in order to weed out any discrepancies, duplicates,
34 | etc.
35 | 
36 | Cheers
37 | 
38 | 
39 | Dataset Composition
40 | -------------------
41 | 
42 | Each one of the above sources will exist in its own directory.
43 | 
44 | Currently, the only requirement is that each source's downloaded
45 | html's live in its own directory. This is automatically satisfied
46 | in [waybacktrack.py](../crawlers/Way-Back#waybacktrack)
47 | 
48 | Most likely in future updates, I will include an automated process
49 | for creating subdirectories depending on the year of the archive.
50 | 
51 | Examples of raw Dataset .html's
52 | -------------------------------
53 | 
54 | ![cnn ex1](pictures/cnn-ex1.png?raw=true "ex1")
55 | 
56 | ---
57 | 
58 | ![cnn ex2](pictures/cnn-ex2.png?raw=true "ex2")
59 | 
60 | ---
61 | 
62 | ![cnn ex3](pictures/cnn-ex3.png?raw=true "ex3")
--------------------------------------------------------------------------------
/dataset/crawl_extract.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.chdir('c:/users/rodrigo/desktop/crawlToTheFuture/crawl-to-the-future/crawlers/Way-Back/')
3 | 
4 | import waybacktrack
5 | 
6 | sites = ['thenation.com']
7 | years = [2000,2005]
8 | 
9 | for site in sites:
10 |     for year in years:
11 |         waybacktrack.archive_domain(domain=site,
12 |                                     year=year,
13 |                                     debug=True)
14 | 
15 | 
16 | try:
17 |     from cStringIO import StringIO as BytesIO
18 | except ImportError:
19 |     from io import BytesIO
20 | 
21 | from eatiht import etv2
22 | 
23 | os.chdir('../../www.nytimes.com/content/')
24 | 
25 | files = [f for f in os.listdir('.') if os.path.isfile(f)]
26 | 
27 | for f in files:
28 |     try:
29 |         content = etv2.extract(f).get_text()
30 | 
31 |         with open(f + '.txt', 'wb') as fi:
32 |             fi.write(BytesIO(content).read())
33 |     except Exception:
34 |         pass
--------------------------------------------------------------------------------
/dataset/entertainment.msn.com.7z:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/dataset/entertainment.msn.com.7z
--------------------------------------------------------------------------------
/dataset/news.bbc.co.uk.7z:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/dataset/news.bbc.co.uk.7z
-------------------------------------------------------------------------------- /dataset/news.yahoo.com.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/dataset/news.yahoo.com.7z -------------------------------------------------------------------------------- /dataset/thenation.com.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/dataset/thenation.com.7z -------------------------------------------------------------------------------- /dataset/www.cnn.com.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/dataset/www.cnn.com.7z -------------------------------------------------------------------------------- /dataset/www.esquire.com.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/dataset/www.esquire.com.7z -------------------------------------------------------------------------------- /dataset/www.forbes.com.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/dataset/www.forbes.com.7z -------------------------------------------------------------------------------- /dataset/www.foxnews.com.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/dataset/www.foxnews.com.7z -------------------------------------------------------------------------------- /dataset/www.latimes.com.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/dataset/www.latimes.com.7z -------------------------------------------------------------------------------- /dataset/www.nymag.com.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/dataset/www.nymag.com.7z -------------------------------------------------------------------------------- /testing/README.md: -------------------------------------------------------------------------------- 1 | Backtesting 2 | ----------- 3 | 4 | 5 | With the help of Matt Peters (who's done some amazingly useful work over w/ [dragnet](https://github.com/seomoz/dragnet/)), here's a synopsis of what's required in order to effectively back test extractors: 6 | 7 | 8 | 1. Read in HTML and the extracted content files (.html, .txt file pair) 9 | 2. Run the extractors 10 | 3. Tokenize expected and predicted (extracted) content 11 | 4. 
Compute [precision and recall](http://en.wikipedia.org/wiki/Precision_and_recall), and [F1](http://en.wikipedia.org/wiki/F1_score) 12 | 13 | Here's Tim Weninger's amazingly simple explanation of Precision and Recall: 14 | 15 | Where C is what's extracted from some extractor to be tested, G is Gold Standard or what we'd hope the best extractor would be able to extract, 16 | and W is what I like to refer to the Universe - or in other words, the collection of available to text that can be extracted (the "good" and the "bad" content). 17 | 18 | TP = C \intersect G 19 | FP = C \minus G 20 | FN = (W \minus C) \intersect G 21 | TN = (W \minus C) \intersect (W \minus G) 22 | 23 | The precision is then TP/(TP+FP) 24 | The recall is TP/(TP+FN) 25 | Accuracy is (TP+TN)/(TP+FP+FN+TN) 26 | F1 = 2*[(prec*recall)/(prec+recall)] 27 | 28 | -------------------------------------------------------------------------------- /testing/avgs.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'www.latimes.com' 3 | p1 4 | (dp2 5 | S'2015' 6 | p3 7 | (dp4 8 | S'avg_precision' 9 | p5 10 | F0.8035509770759279 11 | sS'avg_accuracy' 12 | p6 13 | F0.9467728255373218 14 | sS'avg_recall' 15 | p7 16 | F0.6527383490053367 17 | sS'avg_f1' 18 | p8 19 | F0.7157798752065295 20 | ssS'2000' 21 | p9 22 | (dp10 23 | g5 24 | F0.8544639335585639 25 | sg6 26 | F0.8632617476408376 27 | sg7 28 | F0.9509115748205449 29 | sg8 30 | F0.8821950217638651 31 | ssS'2010' 32 | p11 33 | (dp12 34 | g5 35 | F0.828471014972406 36 | sg6 37 | F0.8991441536199235 38 | sg7 39 | F0.735286387262563 40 | sg8 41 | F0.7681908772753533 42 | ssS'2005' 43 | p13 44 | (dp14 45 | g5 46 | F0.9617522951230408 47 | sg6 48 | F0.9460540482534168 49 | sg7 50 | F0.8988154606909472 51 | sg8 52 | F0.9267798217502 53 | sssS'www.cnn.com' 54 | p15 55 | (dp16 56 | S'2015' 57 | p17 58 | (dp18 59 | g5 60 | F0.9319362497095548 61 | sg6 62 | F0.9050865273196554 63 | sg7 64 | F0.7176341070085498 65 | sg8 66 | F0.7993324657479303 67 | ssS'2000' 68 | p19 69 | (dp20 70 | g5 71 | F0.9161629883160631 72 | sg6 73 | F0.8238004517481676 74 | sg7 75 | F0.6706051953276166 76 | sg8 77 | F0.7289587628352208 78 | ssS'2010' 79 | p21 80 | (dp22 81 | g5 82 | F0.9190540608611633 83 | sg6 84 | F0.904397972100545 85 | sg7 86 | F0.720367938515751 87 | sg8 88 | F0.803034551827662 89 | ssS'2005' 90 | p23 91 | (dp24 92 | g5 93 | F0.9559779693068151 94 | sg6 95 | F0.919976127871615 96 | sg7 97 | F0.7598382772458616 98 | sg8 99 | F0.8387996836377389 100 | sssS'www.forbes.com' 101 | p25 102 | (dp26 103 | S'2015' 104 | p27 105 | (dp28 106 | g5 107 | F0.8972134060775241 108 | sg6 109 | F0.8986697321180566 110 | sg7 111 | F0.7335961110116219 112 | sg8 113 | F0.7870155318467539 114 | ssS'2000' 115 | p29 116 | (dp30 117 | g5 118 | F0.9602798588690202 119 | sg6 120 | F0.8091785796413377 121 | sg7 122 | F0.6424386899802277 123 | sg8 124 | F0.7388445804783836 125 | ssS'2010' 126 | p31 127 | (dp32 128 | g5 129 | F0.9696185233206508 130 | sg6 131 | F0.8641714809006092 132 | sg7 133 | F0.6014258806193916 134 | sg8 135 | F0.7327530754288866 136 | ssS'2005' 137 | p33 138 | (dp34 139 | g5 140 | F0.9569452859418489 141 | sg6 142 | F0.9424401453110219 143 | sg7 144 | F0.8162942552926418 145 | sg8 146 | F0.8614851126118714 147 | sssS'news.bbc.co.uk' 148 | p35 149 | (dp36 150 | S'2015' 151 | p37 152 | (dp38 153 | g5 154 | F0.9132419809489746 155 | sg6 156 | F0.9639845085604982 157 | sg7 158 | F0.6638402458216882 159 | sg8 160 | F0.7659767308535694 161 | ssS'2000' 162 | p39 163 | 
(dp40 164 | g5 165 | F0.8151192646595848 166 | sg6 167 | F0.7801424557526109 168 | sg7 169 | F0.6079253620456607 170 | sg8 171 | F0.6407220493857138 172 | ssS'2010' 173 | p41 174 | (dp42 175 | g5 176 | F0.9488260911878033 177 | sg6 178 | F0.9219708358842496 179 | sg7 180 | F0.6667294365781038 181 | sg8 182 | F0.7768263196095718 183 | ssS'2005' 184 | p43 185 | (dp44 186 | g5 187 | F0.7759439794297487 188 | sg6 189 | F0.8263657177684894 190 | sg7 191 | F0.8426336001200153 192 | sg8 193 | F0.7743474575947391 194 | sssS'thenation.com' 195 | p45 196 | (dp46 197 | S'2015' 198 | p47 199 | (dp48 200 | g5 201 | F0.982897564524634 202 | sg6 203 | F0.9003880106234209 204 | sg7 205 | F0.7439733981303495 206 | sg8 207 | F0.8400444073353461 208 | ssS'2000' 209 | p49 210 | (dp50 211 | g5 212 | F0.9837921683808656 213 | sg6 214 | F0.750037360435283 215 | sg7 216 | F0.6311344962995613 217 | sg8 218 | F0.7416221475217819 219 | ssS'2010' 220 | p51 221 | (dp52 222 | g5 223 | F0.909742955966361 224 | sg6 225 | F0.8876376322308915 226 | sg7 227 | F0.7268691166293663 228 | sg8 229 | F0.8053051003976753 230 | ssS'2005' 231 | p53 232 | (dp54 233 | g5 234 | F0.5869102540701906 235 | sg6 236 | F0.6444220918257025 237 | sg7 238 | F0.8653876246345382 239 | sg8 240 | F0.668542218409078 241 | sssS'www.nymag.com' 242 | p55 243 | (dp56 244 | S'2015' 245 | p57 246 | (dp58 247 | g5 248 | F0.49693262386927906 249 | sg6 250 | F0.9850166553337936 251 | sg7 252 | F0.30595144296903 253 | sg8 254 | F0.33944286974231674 255 | ssS'2000' 256 | p59 257 | (dp60 258 | g5 259 | F0.5856442739956657 260 | sg6 261 | F0.720808559257688 262 | sg7 263 | F0.9532318970908235 264 | sg8 265 | F0.7067372120242632 266 | ssS'2010' 267 | p61 268 | (dp62 269 | g5 270 | F0.9361875653104671 271 | sg6 272 | F0.9610072779871861 273 | sg7 274 | F0.6693575639566113 275 | sg8 276 | F0.7783766710706719 277 | ssS'2005' 278 | p63 279 | (dp64 280 | g5 281 | F0.9142767871177219 282 | sg6 283 | F0.8490913693788785 284 | sg7 285 | F0.7162724829909457 286 | sg8 287 | F0.7728741359778185 288 | sssS'entertainment.msn.com' 289 | p65 290 | (dp66 291 | S'2015' 292 | p67 293 | (dp68 294 | g5 295 | F0.8774759262982088 296 | sg6 297 | F0.9800434319954887 298 | sg7 299 | F0.6718317347646345 300 | sg8 301 | F0.7522973816033732 302 | ssS'2000' 303 | p69 304 | (dp70 305 | g5 306 | F0.7809895902954578 307 | sg6 308 | F0.878226458639663 309 | sg7 310 | F0.8248412997464151 311 | sg8 312 | F0.7776401450560837 313 | ssS'2010' 314 | p71 315 | (dp72 316 | g5 317 | F0.9591116105134913 318 | sg6 319 | F0.9105981447068229 320 | sg7 321 | F0.66318597426345 322 | sg8 323 | F0.7547600391625994 324 | ssS'2005' 325 | p73 326 | (dp74 327 | g5 328 | F0.9306181526661566 329 | sg6 330 | F0.9143227880115047 331 | sg7 332 | F0.7506788040971138 333 | sg8 334 | F0.8100250325165012 335 | sssS'www.esquire.com' 336 | p75 337 | (dp76 338 | S'2015' 339 | p77 340 | (dp78 341 | g5 342 | F0.9777928581767094 343 | sg6 344 | F0.972749959685334 345 | sg7 346 | F0.6058910891128433 347 | sg8 348 | F0.7442968420260976 349 | ssS'2000' 350 | p79 351 | (dp80 352 | g5 353 | F0.6922746324255855 354 | sg6 355 | F0.8025089990983143 356 | sg7 357 | F0.8502588017931124 358 | sg8 359 | F0.7144518452878422 360 | ssS'2010' 361 | p81 362 | (dp82 363 | g5 364 | F0.9657664616790654 365 | sg6 366 | F0.917713724757117 367 | sg7 368 | F0.720823276473981 369 | sg8 370 | F0.8140813900391529 371 | ssS'2005' 372 | p83 373 | (dp84 374 | g5 375 | F0.6180066873519628 376 | sg6 377 | F0.7701210111257204 378 | sg7 379 | F0.9413535035210979 
380 | sg8 381 | F0.7267751357389283 382 | sssS'news.yahoo.com' 383 | p85 384 | (dp86 385 | S'2015' 386 | p87 387 | (dp88 388 | g5 389 | F0.9870026399258761 390 | sg6 391 | F0.9893460394645375 392 | sg7 393 | F0.44732805664318476 394 | sg8 395 | F0.6058897000309008 396 | ssS'2000' 397 | p89 398 | (dp90 399 | g5 400 | F0.7612476929495509 401 | sg6 402 | F0.7780228797983223 403 | sg7 404 | F0.7918118641170517 405 | sg8 406 | F0.7463531250175874 407 | ssS'2010' 408 | p91 409 | (dp92 410 | g5 411 | F0.9776218108265357 412 | sg6 413 | F0.9255179013943237 414 | sg7 415 | F0.7333055135375502 416 | sg8 417 | F0.8371233322749576 418 | ssS'2005' 419 | p93 420 | (dp94 421 | g5 422 | F0.5043899578387142 423 | sg6 424 | F0.7034687007943923 425 | sg7 426 | F0.9319664634558119 427 | sg8 428 | F0.6341621734787838 429 | sssS'www.foxnews.com' 430 | p95 431 | (dp96 432 | S'2015' 433 | p97 434 | (dp98 435 | g5 436 | F0.9800680098898277 437 | sg6 438 | F0.8973170757133413 439 | sg7 440 | F0.7498983851213855 441 | sg8 442 | F0.8445052629365077 443 | ssS'2000' 444 | p99 445 | (dp100 446 | g5 447 | F0.8388377342165464 448 | sg6 449 | F0.8108548107762594 450 | sg7 451 | F0.706418493627554 452 | sg8 453 | F0.7249702140649049 454 | ssS'2010' 455 | p101 456 | (dp102 457 | g5 458 | F0.9604578972556221 459 | sg6 460 | F0.9204965971604794 461 | sg7 462 | F0.6803906411927594 463 | sg8 464 | F0.7859592132141975 465 | ssS'2005' 466 | p103 467 | (dp104 468 | g5 469 | F0.9995934636566491 470 | sg6 471 | F0.8597143883177145 472 | sg7 473 | F0.48121791887371607 474 | sg8 475 | F0.634271215632443 476 | sssS'mean_avgs' 477 | p105 478 | (dp106 479 | S'a' 480 | p107 481 | (dp108 482 | S'2015' 483 | p109 484 | F0.9439374766351447 485 | sS'2000' 486 | p110 487 | F0.8016842302788483 488 | sS'2010' 489 | p111 490 | F0.9112655720742149 491 | sS'2005' 492 | p112 493 | F0.8375976388658456 494 | ssS'p' 495 | p113 496 | (dp114 497 | g109 498 | F0.8848112236496515 499 | sg110 500 | F0.8188812137666904 501 | sg111 502 | F0.9374857991893565 503 | sg112 504 | F0.8204414832502849 505 | ssS'r' 506 | p115 507 | (dp116 508 | g109 509 | F0.6292682919588625 510 | sg110 511 | F0.7629577674848568 512 | sg111 513 | F0.6917741729029527 514 | sg112 515 | F0.800445839092269 516 | ssS'f1' 517 | p117 518 | (dp118 519 | g109 520 | F0.7194581067329326 521 | sg110 522 | F0.7402495103435647 523 | sg111 524 | F0.7856410570300729 525 | sg112 526 | F0.7648061987348103 527 | sss. -------------------------------------------------------------------------------- /testing/eatiht_praf_output.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/testing/eatiht_praf_output.pkl -------------------------------------------------------------------------------- /testing/praf.py: -------------------------------------------------------------------------------- 1 | import re #for tokenizing 2 | import collections #for multiset/histogram 3 | import os 4 | try: 5 | from StringIO import StringIO 6 | except ImportError: 7 | from io import StringIO 8 | 9 | import lxml.html 10 | 11 | try: 12 | range = xrange 13 | except: 14 | pass 15 | 16 | def histsum(hist): return sum(hist.values()) 17 | 18 | 19 | #get every non-html word on page (ie. 
"//text()") as f-distribution 20 | def tokens_to_hist_from_universe(data_filepath): 21 | 22 | with open(data_filepath,'r') as data_file: 23 | 24 | try: 25 | parsed_goldhtml = lxml.html.parse(data_file, 26 | lxml.html.HTMLParser(encoding="utf-8")) 27 | except: 28 | print(str(data_file.read())) 29 | tokenized_content = collections.Counter() 30 | 31 | content = "".join(parsed_goldhtml.xpath('//text()')) 32 | 33 | tokenized_content.update(re.split(r'\W+', content)) 34 | 35 | return tokenized_content 36 | 37 | 38 | #get every word from gold std. as f-distribution 39 | def tokens_to_hist_goldstd(test_filepath): 40 | if not os.path.exists(test_filepath): 41 | test_filepath += '.txt' 42 | with open(os.path.abspath(test_filepath), 'r') as f: 43 | 44 | #print("test_filepath",os.path.abspath(test_filepath)) 45 | content = str(f.read()) 46 | 47 | tokenized_content = collections.Counter() 48 | 49 | tokenized_content.update(re.split(r'\W+', content)) 50 | 51 | return tokenized_content 52 | 53 | 54 | #get every extracted (predicted) word as f-distribution 55 | def tokens_to_hist_extractor(extract, data_filepath): 56 | 57 | try: 58 | content = extract(data_filepath) 59 | except Exception: 60 | print(data_filepath) 61 | raise 62 | #raise Exception("\"exception\" method needs to be implemented") 63 | 64 | tokenized_content = collections.Counter() 65 | try: 66 | tokenized_content.update(re.split(r'\W+', content)) 67 | except: 68 | print(content) 69 | return tokenized_content 70 | 71 | 72 | def calc_praf(goldstd,predicted,universe): 73 | 74 | TP = predicted & goldstd 75 | 76 | FP = predicted - goldstd 77 | 78 | FN = (universe - predicted) & goldstd 79 | 80 | TN = (universe - predicted) & (universe - goldstd) 81 | 82 | precision = (histsum(TP)*1.0)/histsum(TP+FP) 83 | 84 | recall = (histsum(TP)*1.0)/histsum(TP+FN) 85 | 86 | accuracy = (histsum(TP+TN)*1.0)/histsum(TP+FP+FN+TN) 87 | try: 88 | f1 = 2 * (((precision*recall)*1.0)/(precision+recall)) 89 | except ZeroDivisionError: 90 | f1 = 0 91 | #print(precision,recall, accuracy) 92 | #raise 93 | return {'p':precision,'r':recall,'a':accuracy,'f1':f1} 94 | 95 | 96 | def prep_data_for_measurements(directory, extractor, 97 | goldfile_ext='.txt', 98 | testfile_ext='.html', 99 | pickle_output_name = None): 100 | '''Given a directory, process the files (gold standard) in the following 101 | steps: 102 | 1. Build dictionary of file names to file paths to gold std files and test file 103 | 104 | 2. For each file name, get associated "universe" values (all possible outcomes W) 105 | 106 | 3. For each file name, get goldstd histogram (G) 107 | 108 | 4. 
def prep_data_for_measurements(directory, extractor,
                               goldfile_ext='.txt',
                               testfile_ext='.html',
                               pickle_output_name=None):  # currently unused
    '''Given a directory of gold-standard files, process them as follows:

    1. Build a dictionary mapping each file name to the paths of its gold
       standard file and its paired test (raw HTML) file.

    2. For each file name, build the "universe" histogram W (every token
       on the page), the gold standard histogram G, and the extractor's
       output histogram C, then score C against G within W via calc_praf.
    '''
    # Step 1
    filenames = set(re.sub(re.escape(goldfile_ext) + "|" + re.escape(testfile_ext),
                           "", name)
                    for name in os.listdir(directory))

    golden_dict = {name:
                   {
                       'domain': os.path.split(os.path.split(directory)[0])[1],
                       'year': os.path.split(directory)[1],
                       'goldpath': os.path.join(directory, name + goldfile_ext),
                       'testpath': os.path.join(directory, name + testfile_ext)
                   } for name in filenames}

    # Step 2
    for key, val in golden_dict.items():
        content = tokens_to_hist_extractor(extractor, os.path.abspath(val['testpath']))

        universe = tokens_to_hist_from_universe(os.path.abspath(val['testpath']))

        goldstd = tokens_to_hist_goldstd(os.path.abspath(val['goldpath']))

        val['measurements'] = calc_praf(goldstd, content, universe)

    return golden_dict


def get_measurements(prep_data):
    # alternative scorer for prep data that stores the raw histograms
    # under 'G', 'C' and 'W' instead of scoring inside the loop above
    for key, val in prep_data.items():
        val['measurements'] = calc_praf(val['G'], val['C'], val['W'])


def listsubdir(directories):
    return [os.path.join(d, f) for d in directories for f in os.listdir(d)
            if os.path.isdir(os.path.join(d, f))]


def main():
    import eatiht

    dataset_path = ['C:/Users/rodrigo/Desktop/crawl-to-the-future-master/crawl-to-the-future-master/dataset']

    # dataset/<domain>/<year>/ -- two levels of subdirectories
    folders = listsubdir(listsubdir(dataset_path))

    exhaustive_test = []
    for folder in folders:
        # pass the extractor function, not the eatiht module itself --
        # a module is not callable
        prep_data = prep_data_for_measurements(os.path.abspath(folder), eatiht.extract)
        exhaustive_test.append(prep_data)

    return exhaustive_test


if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- /testing/wbce-tests.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/testing/wbce-tests.7z -------------------------------------------------------------------------------- /testing/wbce-tests/BodyTextExtractor2Filter/avgs.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'www.latimes.com' 3 | p1 4 | (dp2 5 | S'2015' 6 | p3 7 | (dp4 8 | S'avg_precision' 9 | p5 10 | F0.12218070191163449 11 | sS'avg_accuracy' 12 | p6 13 | F0.4928797528462945 14 | sS'avg_recall' 15 | p7 16 | F0.8331923688617182 17 |
sS'avg_f1' 18 | p8 19 | F0.21086178590786336 20 | ssS'2000' 21 | p9 22 | (dp10 23 | g5 24 | F0.8572444640811792 25 | sg6 26 | F0.8802343747738629 27 | sg7 28 | F0.9430623481664986 29 | sg8 30 | F0.8972320848398079 31 | ssS'2010' 32 | p11 33 | (dp12 34 | g5 35 | F0.4394908403162711 36 | sg6 37 | F0.681661606253343 38 | sg7 39 | F0.8076022084890088 40 | sg8 41 | F0.5428877399677827 42 | ssS'2005' 43 | p13 44 | (dp14 45 | g5 46 | F0.6286822306460558 47 | sg6 48 | F0.7569839835615282 49 | sg7 50 | F0.894640252155387 51 | sg8 52 | F0.7284191666091211 53 | sssS'www.cnn.com' 54 | p15 55 | (dp16 56 | S'2015' 57 | p17 58 | (dp18 59 | g5 60 | F0.3029125013356543 61 | sg6 62 | F0.40317409919155567 63 | sg7 64 | F0.865448187557186 65 | sg8 66 | F0.4324038600619751 67 | ssS'2000' 68 | p19 69 | (dp20 70 | g5 71 | F0.7057985767079694 72 | sg6 73 | F0.800733747469661 74 | sg7 75 | F0.864311328238917 76 | sg8 77 | F0.7688012107244725 78 | ssS'2010' 79 | p21 80 | (dp22 81 | g5 82 | F0.4320541005367761 83 | sg6 84 | F0.6084899901455145 85 | sg7 86 | F0.8017286221537236 87 | sg8 88 | F0.5400160577031479 89 | ssS'2005' 90 | p23 91 | (dp24 92 | g5 93 | F0.6211738688863229 94 | sg6 95 | F0.8096439361653318 96 | sg7 97 | F0.8286779162880316 98 | sg8 99 | F0.7071877225133284 100 | sssS'www.forbes.com' 101 | p25 102 | (dp26 103 | S'2015' 104 | p27 105 | (dp28 106 | g5 107 | F0.3484809556323725 108 | sg6 109 | F0.5365559659335577 110 | sg7 111 | F0.8283171986084517 112 | sg8 113 | F0.48072279957902986 114 | ssS'2000' 115 | p29 116 | (dp30 117 | g5 118 | F0.672786168820609 119 | sg6 120 | F0.7794584406932229 121 | sg7 122 | F0.9016544964695659 123 | sg8 124 | F0.7625513206364645 125 | ssS'2010' 126 | p31 127 | (dp32 128 | g5 129 | F0.660445014823463 130 | sg6 131 | F0.8116505246063985 132 | sg7 133 | F0.8718069249651257 134 | sg8 135 | F0.7375315583874413 136 | ssS'2005' 137 | p33 138 | (dp34 139 | g5 140 | F0.7088740876453605 141 | sg6 142 | F0.9168198868029921 143 | sg7 144 | F0.7883061719247125 145 | sg8 146 | F0.7443993511594685 147 | sssS'news.bbc.co.uk' 148 | p35 149 | (dp36 150 | S'2015' 151 | p37 152 | (dp38 153 | g5 154 | F0.13308241469189427 155 | sg6 156 | F0.5565193185460882 157 | sg7 158 | F0.7304439153776018 159 | sg8 160 | F0.22151052512251035 161 | ssS'2000' 162 | p39 163 | (dp40 164 | g5 165 | F0.8919253176470745 166 | sg6 167 | F0.9021006354735241 168 | sg7 169 | F0.8401801928987451 170 | sg8 171 | F0.8647821118015534 172 | ssS'2010' 173 | p41 174 | (dp42 175 | g5 176 | F0.41742126841582045 177 | sg6 178 | F0.73532761388079 179 | sg7 180 | F0.7303742669478774 181 | sg8 182 | F0.5205747113910343 183 | ssS'2005' 184 | p43 185 | (dp44 186 | g5 187 | F0.6766215452483197 188 | sg6 189 | F0.8099502557304525 190 | sg7 191 | F0.8715406400104121 192 | sg8 193 | F0.7565121769831742 194 | sssS'thenation.com' 195 | p45 196 | (dp46 197 | S'2015' 198 | p47 199 | (dp48 200 | g5 201 | F0.5021943230481295 202 | sg6 203 | F0.6543885449100943 204 | sg7 205 | F0.8572473843198571 206 | sg8 207 | F0.6190670055430223 208 | ssS'2000' 209 | p49 210 | (dp50 211 | g5 212 | F0.8797908423638537 213 | sg6 214 | F0.8954149201859337 215 | sg7 216 | F0.9652013434871509 217 | sg8 218 | F0.918658715490274 219 | ssS'2010' 220 | p51 221 | (dp52 222 | g5 223 | F0.4915998914598782 224 | sg6 225 | F0.6740374139494874 226 | sg7 227 | F0.867987776825755 228 | sg8 229 | F0.6136605471301677 230 | ssS'2005' 231 | p53 232 | (dp54 233 | g5 234 | F0.7906462969568482 235 | sg6 236 | F0.804806209394557 237 | sg7 238 | F0.9495652685159028 239 | sg8 
240 | F0.8561489426227454 241 | sssS'www.nymag.com' 242 | p55 243 | (dp56 244 | S'2015' 245 | p57 246 | (dp58 247 | g5 248 | F0.04347666120043913 249 | sg6 250 | F0.163241251572618 251 | sg7 252 | F0.5388911459138439 253 | sg8 254 | F0.06427847328424839 255 | ssS'2000' 256 | p59 257 | (dp60 258 | g5 259 | F0.855375126938793 260 | sg6 261 | F0.8992183949804482 262 | sg7 263 | F0.9395867057441535 264 | sg8 265 | F0.8825630519989532 266 | ssS'2010' 267 | p61 268 | (dp62 269 | g5 270 | F0.12117487487387163 271 | sg6 272 | F0.3941642691060131 273 | sg7 274 | F0.6389370425852912 275 | sg8 276 | F0.19293172049444848 277 | ssS'2005' 278 | p63 279 | (dp64 280 | g5 281 | F0.63792601378758 282 | sg6 283 | F0.7500215514070161 284 | sg7 285 | F0.8938914698626994 286 | sg8 287 | F0.7409493649922384 288 | sssS'entertainment.msn.com' 289 | p65 290 | (dp66 291 | S'2015' 292 | p67 293 | (dp68 294 | g5 295 | F0.07401168704789596 296 | sg6 297 | F0.5518234073818731 298 | sg7 299 | F0.8043228823612675 300 | sg8 301 | F0.1322736889940892 302 | ssS'2000' 303 | p69 304 | (dp70 305 | g5 306 | F0.8464097205988462 307 | sg6 308 | F0.9244090539071942 309 | sg7 310 | F0.8503924945577224 311 | sg8 312 | F0.846137936359689 313 | ssS'2010' 314 | p71 315 | (dp72 316 | g5 317 | F0.406078738764565 318 | sg6 319 | F0.5739534002357349 320 | sg7 321 | F0.8837208661044244 322 | sg8 323 | F0.5096672963521531 324 | ssS'2005' 325 | p73 326 | (dp74 327 | g5 328 | F0.4417019947232166 329 | sg6 330 | F0.6483814493886327 331 | sg7 332 | F0.8815812072871462 333 | sg8 334 | F0.5736756061820752 335 | sssS'www.esquire.com' 336 | p75 337 | (dp76 338 | S'2015' 339 | p77 340 | (dp78 341 | g5 342 | F0.06668416878257147 343 | sg6 344 | F0.33177659602885284 345 | sg7 346 | F0.7739857956252141 347 | sg8 348 | F0.11513009883762236 349 | ssS'2000' 350 | p79 351 | (dp80 352 | g5 353 | F0.8838432933187248 354 | sg6 355 | F0.9074614289477201 356 | sg7 357 | F0.8852199980453179 358 | sg8 359 | F0.8757034024592896 360 | ssS'2010' 361 | p81 362 | (dp82 363 | g5 364 | F0.350690725741593 365 | sg6 366 | F0.5462068777107696 367 | sg7 368 | F0.863806557191218 369 | sg8 370 | F0.4820745041640184 371 | ssS'2005' 372 | p83 373 | (dp84 374 | g5 375 | F0.926445902055539 376 | sg6 377 | F0.9454220019060875 378 | sg7 379 | F0.9361114008251437 380 | sg8 381 | F0.9306781924884957 382 | sssS'news.yahoo.com' 383 | p85 384 | (dp86 385 | S'2015' 386 | p87 387 | (dp88 388 | g5 389 | F0.018386000373501205 390 | sg6 391 | F0.3219588516463774 392 | sg7 393 | F0.686399968694185 394 | sg8 395 | F0.03564246563861722 396 | ssS'2000' 397 | p89 398 | (dp90 399 | g5 400 | F0.8683999024519955 401 | sg6 402 | F0.8906547313136641 403 | sg7 404 | F0.8638322817894927 405 | sg8 406 | F0.8654332302102575 407 | ssS'2010' 408 | p91 409 | (dp92 410 | g5 411 | F0.578675352037152 412 | sg6 413 | F0.7999153102686466 414 | sg7 415 | F0.7757993179137981 416 | sg8 417 | F0.6558914318984841 418 | ssS'2005' 419 | p93 420 | (dp94 421 | g5 422 | F0.6484059381381134 423 | sg6 424 | F0.7924984710068734 425 | sg7 426 | F0.8848410469684321 427 | sg8 428 | F0.7383258111263864 429 | sssS'www.foxnews.com' 430 | p95 431 | (dp96 432 | S'2015' 433 | p97 434 | (dp98 435 | g5 436 | F0.7543074627097117 437 | sg6 438 | F0.8435262483444723 439 | sg7 440 | F0.8328436054890792 441 | sg8 442 | F0.7897416404006864 443 | ssS'2000' 444 | p99 445 | (dp100 446 | g5 447 | F0.8753434627782866 448 | sg6 449 | F0.9075440551901525 450 | sg7 451 | F0.9260015291417278 452 | sg8 453 | F0.8874059086393135 454 | ssS'2010' 455 | p101 
456 | (dp102 457 | g5 458 | F0.4369663029782896 459 | sg6 460 | F0.744614672197503 461 | sg7 462 | F0.8049231112015547 463 | sg8 464 | F0.5517263659638048 465 | ssS'2005' 466 | p103 467 | (dp104 468 | g5 469 | F0.3387610582651177 470 | sg6 471 | F0.49737059246242404 472 | sg7 473 | F0.9414502698019317 474 | sg8 475 | F0.48511616080858905 476 | sssS'mean_avgs' 477 | p105 478 | (dp106 479 | S'a' 480 | p107 481 | (dp108 482 | S'2015' 483 | p109 484 | F0.4855844036401784 485 | sS'2000' 486 | p110 487 | F0.8787229782935382 488 | sS'2010' 489 | p111 490 | F0.6570021678354201 491 | sS'2005' 492 | p112 493 | F0.7731898337825897 494 | ssS'p' 495 | p113 496 | (dp114 497 | g109 498 | F0.23657168767338047 499 | sg110 500 | F0.8336916875707331 501 | sg111 502 | F0.433459710994768 503 | sg112 504 | F0.6419238936352474 505 | ssS'r' 506 | p115 507 | (dp116 508 | g109 509 | F0.7751092452808405 510 | sg110 511 | F0.8979442718539291 512 | sg111 513 | F0.8046686694377776 514 | sg112 515 | F0.8870605643639801 516 | ssS'f1' 517 | p117 518 | (dp118 519 | g109 520 | F0.3101632343369664 521 | sg110 522 | F0.8569268973160075 523 | sg111 524 | F0.5346961933452483 525 | sg112 526 | F0.7261412495485622 527 | sss. -------------------------------------------------------------------------------- /testing/wbce-tests/BodyTextExtractorFilter/avgs.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'www.latimes.com' 3 | p1 4 | (dp2 5 | S'2015' 6 | p3 7 | (dp4 8 | S'avg_precision' 9 | p5 10 | F0.12272875317104835 11 | sS'avg_accuracy' 12 | p6 13 | F0.49396154622073934 14 | sS'avg_recall' 15 | p7 16 | F0.8435650192979125 17 | sS'avg_f1' 18 | p8 19 | F0.21195019485093056 20 | ssS'2000' 21 | p9 22 | (dp10 23 | g5 24 | F0.9250015482359525 25 | sg6 26 | F0.9258974031198831 27 | sg7 28 | F0.9504997642390605 29 | sg8 30 | F0.9334350524137958 31 | ssS'2010' 32 | p11 33 | (dp12 34 | g5 35 | F0.4017566608588345 36 | sg6 37 | F0.6093635105963822 38 | sg7 39 | F0.8633933910418977 40 | sg8 41 | F0.5112870105243548 42 | ssS'2005' 43 | p13 44 | (dp14 45 | g5 46 | F0.6083888210659326 47 | sg6 48 | F0.7408316905365124 49 | sg7 50 | F0.917895268358133 51 | sg8 52 | F0.7203292113796789 53 | sssS'www.cnn.com' 54 | p15 55 | (dp16 56 | S'2015' 57 | p17 58 | (dp18 59 | g5 60 | F0.29940846505010726 61 | sg6 62 | F0.4025419214848678 63 | sg7 64 | F0.8473645773436883 65 | sg8 66 | F0.4268100068072376 67 | ssS'2000' 68 | p19 69 | (dp20 70 | g5 71 | F0.6767895607401481 72 | sg6 73 | F0.787291789807444 74 | sg7 75 | F0.8868254180410128 76 | sg8 77 | F0.7616275924904314 78 | ssS'2010' 79 | p21 80 | (dp22 81 | g5 82 | F0.33209885398619127 83 | sg6 84 | F0.4531771577601456 85 | sg7 86 | F0.9166603510481853 87 | sg8 88 | F0.46970478140479605 89 | ssS'2005' 90 | p23 91 | (dp24 92 | g5 93 | F0.6040764658910175 94 | sg6 95 | F0.8010867508484057 96 | sg7 97 | F0.8484664297329124 98 | sg8 99 | F0.701997960949887 100 | sssS'www.forbes.com' 101 | p25 102 | (dp26 103 | S'2015' 104 | p27 105 | (dp28 106 | g5 107 | F0.2791851517078164 108 | sg6 109 | F0.38141912176494697 110 | sg7 111 | F0.9333104136263721 112 | sg8 113 | F0.4211472600415881 114 | ssS'2000' 115 | p29 116 | (dp30 117 | g5 118 | F0.6534683899153207 119 | sg6 120 | F0.7632870010147942 121 | sg7 122 | F0.9404071650230295 123 | sg8 124 | F0.759877138712281 125 | ssS'2010' 126 | p31 127 | (dp32 128 | g5 129 | F0.5461973136131762 130 | sg6 131 | F0.728173262324377 132 | sg7 133 | F0.9097018544185852 134 | sg8 135 | F0.6712593396648558 136 | ssS'2005' 137 | p33 
138 | (dp34 139 | g5 140 | F0.3535572901091079 141 | sg6 142 | F0.6913354915701799 143 | sg7 144 | F0.8829171755051728 145 | sg8 146 | F0.4911560793931217 147 | sssS'news.bbc.co.uk' 148 | p35 149 | (dp36 150 | S'2015' 151 | p37 152 | (dp38 153 | g5 154 | F0.11640002710415726 155 | sg6 156 | F0.4842104610988116 157 | sg7 158 | F0.8541567113093707 159 | sg8 160 | F0.20172430016244522 161 | ssS'2000' 162 | p39 163 | (dp40 164 | g5 165 | F0.5729001117183127 166 | sg6 167 | F0.7307765914280252 168 | sg7 169 | F0.9278350777198899 170 | sg8 171 | F0.7059603168323999 172 | ssS'2010' 173 | p41 174 | (dp42 175 | g5 176 | F0.3041230830149431 177 | sg6 178 | F0.5900514362034599 179 | sg7 180 | F0.8247355274499921 181 | sg8 182 | F0.43301263078402946 183 | ssS'2005' 184 | p43 185 | (dp44 186 | g5 187 | F0.5177519581452089 188 | sg6 189 | F0.6948387499665385 190 | sg7 191 | F0.9498492073703207 192 | sg8 193 | F0.6632118901906628 194 | sssS'thenation.com' 195 | p45 196 | (dp46 197 | S'2015' 198 | p47 199 | (dp48 200 | g5 201 | F0.46860182851819593 202 | sg6 203 | F0.6009976732709027 204 | sg7 205 | F0.8981403529700164 206 | sg8 207 | F0.5998386292854977 208 | ssS'2000' 209 | p49 210 | (dp50 211 | g5 212 | F0.8673455478551348 213 | sg6 214 | F0.8898651263744926 215 | sg7 216 | F0.9715400727648996 217 | sg8 218 | F0.9149158626087028 219 | ssS'2010' 220 | p51 221 | (dp52 222 | g5 223 | F0.4313543594309007 224 | sg6 225 | F0.6117791374652216 226 | sg7 227 | F0.9196668171556037 228 | sg8 229 | F0.5761477860300167 230 | ssS'2005' 231 | p53 232 | (dp54 233 | g5 234 | F0.7305810594010783 235 | sg6 236 | F0.7640878248044985 237 | sg7 238 | F0.9692380044272665 239 | sg8 240 | F0.8295052831046879 241 | sssS'www.nymag.com' 242 | p55 243 | (dp56 244 | S'2015' 245 | p57 246 | (dp58 247 | g5 248 | F0.045097010038526314 249 | sg6 250 | F0.15503685269634493 251 | sg7 252 | F0.8812109910492884 253 | sg8 254 | F0.06712107954478866 255 | ssS'2000' 256 | p59 257 | (dp60 258 | g5 259 | F0.8357303273376807 260 | sg6 261 | F0.8916370532340404 262 | sg7 263 | F0.9428559162171202 264 | sg8 265 | F0.8738127912711762 266 | ssS'2010' 267 | p61 268 | (dp62 269 | g5 270 | F0.1207055620925438 271 | sg6 272 | F0.3344133901389897 273 | sg7 274 | F0.8562798343950719 275 | sg8 276 | F0.19938956686541218 277 | ssS'2005' 278 | p63 279 | (dp64 280 | g5 281 | F0.563772151455564 282 | sg6 283 | F0.6888173433311248 284 | sg7 285 | F0.9459993330665563 286 | sg8 287 | F0.7032073956380118 288 | sssS'entertainment.msn.com' 289 | p65 290 | (dp66 291 | S'2015' 292 | p67 293 | (dp68 294 | g5 295 | F0.07007420541341315 296 | sg6 297 | F0.5204019148311159 298 | sg7 299 | F0.854012982421515 300 | sg8 301 | F0.12648755955108068 302 | ssS'2000' 303 | p69 304 | (dp70 305 | g5 306 | F0.7786809001887803 307 | sg6 308 | F0.9044573125851524 309 | sg7 310 | F0.8806697299622153 311 | sg8 312 | F0.8208128244189797 313 | ssS'2010' 314 | p71 315 | (dp72 316 | g5 317 | F0.34039318503958943 318 | sg6 319 | F0.5042812623221914 320 | sg7 321 | F0.8961666976065963 322 | sg8 323 | F0.4586618563570501 324 | ssS'2005' 325 | p73 326 | (dp74 327 | g5 328 | F0.4186477840737833 329 | sg6 330 | F0.6157148271745461 331 | sg7 332 | F0.8972966448530796 333 | sg8 334 | F0.5538968759062753 335 | sssS'www.esquire.com' 336 | p75 337 | (dp76 338 | S'2015' 339 | p77 340 | (dp78 341 | g5 342 | F0.06624818281736693 343 | sg6 344 | F0.3202453377209491 345 | sg7 346 | F0.8168627345418239 347 | sg8 348 | F0.11469291810643537 349 | ssS'2000' 350 | p79 351 | (dp80 352 | g5 353 | 
F0.8413135303544912 354 | sg6 355 | F0.901048810633123 356 | sg7 357 | F0.925449847131967 358 | sg8 359 | F0.8718678597743154 360 | ssS'2010' 361 | p81 362 | (dp82 363 | g5 364 | F0.33316897330572265 365 | sg6 366 | F0.5166756715884727 367 | sg7 368 | F0.8975495661247929 369 | sg8 370 | F0.4690759983072666 371 | ssS'2005' 372 | p83 373 | (dp84 374 | g5 375 | F0.9210595620529616 376 | sg6 377 | F0.9429496458987109 378 | sg7 379 | F0.9374695827432423 380 | sg8 381 | F0.9284252997056781 382 | sssS'news.yahoo.com' 383 | p85 384 | (dp86 385 | S'2015' 386 | p87 387 | (dp88 388 | g5 389 | F0.018336170825126605 390 | sg6 391 | F0.31951559031089 392 | sg7 393 | F0.7097419686453995 394 | sg8 395 | F0.03558797004348298 396 | ssS'2000' 397 | p89 398 | (dp90 399 | g5 400 | F0.6850494294232871 401 | sg6 402 | F0.7893135784179692 403 | sg7 404 | F0.9028278043401964 405 | sg8 406 | F0.7743623865271191 407 | ssS'2010' 408 | p91 409 | (dp92 410 | g5 411 | F0.31975003566238014 412 | sg6 413 | F0.5136290114583377 414 | sg7 415 | F0.8895496788062842 416 | sg8 417 | F0.46061874624466626 418 | ssS'2005' 419 | p93 420 | (dp94 421 | g5 422 | F0.553762919588722 423 | sg6 424 | F0.6937235984167229 425 | sg7 426 | F0.9039296240363984 427 | sg8 428 | F0.6418515919437905 429 | sssS'www.foxnews.com' 430 | p95 431 | (dp96 432 | S'2015' 433 | p97 434 | (dp98 435 | g5 436 | F0.5811714938921815 437 | sg6 438 | F0.726996194575719 439 | sg7 440 | F0.9085344730398744 441 | sg8 442 | F0.6994681708826616 443 | ssS'2000' 444 | p99 445 | (dp100 446 | g5 447 | F0.7997548741274138 448 | sg6 449 | F0.8845589461629683 450 | sg7 451 | F0.9449516315689113 452 | sg8 453 | F0.8569337227388726 454 | ssS'2010' 455 | p101 456 | (dp102 457 | g5 458 | F0.30196347934410317 459 | sg6 460 | F0.530523850053412 461 | sg7 462 | F0.8929124304491302 463 | sg8 464 | F0.42913095408950247 465 | ssS'2005' 466 | p103 467 | (dp104 468 | g5 469 | F0.5437722450092386 470 | sg6 471 | F0.5887271562989693 472 | sg7 473 | F0.6840968203980121 474 | sg8 475 | F0.3736088044609785 476 | sssS'mean_avgs' 477 | p105 478 | (dp106 479 | S'a' 480 | p107 481 | (dp108 482 | S'2015' 483 | p109 484 | F0.4405326613975287 485 | sS'2000' 486 | p110 487 | F0.8468133612777893 488 | sS'2010' 489 | p111 490 | F0.5392067689910991 491 | sS'2005' 492 | p112 493 | F0.722211307884621 494 | ssS'p' 495 | p113 496 | (dp114 497 | g109 498 | F0.206725128853794 499 | sg110 500 | F0.7636034219896521 501 | sg111 502 | F0.34315115063483853 503 | sg112 504 | F0.5815370256792615 505 | ssS'r' 506 | p115 507 | (dp116 508 | g109 509 | F0.8546900224245262 510 | sg110 511 | F0.92738624270083 512 | sg111 513 | F0.8866616148496138 514 | sg112 515 | F0.8937158090491095 516 | ssS'f1' 517 | p117 518 | (dp118 519 | g109 520 | F0.29048280892761486 521 | sg110 522 | F0.8273605547788072 523 | sg111 524 | F0.467828867027195 525 | sg112 526 | F0.6607190392672773 527 | sss. 
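
Each avgs.pkl dumped above is a protocol-0 (ASCII) pickle of a nested dict: domain -> year -> {'avg_precision', 'avg_accuracy', 'avg_recall', 'avg_f1'}, plus a top-level 'mean_avgs' entry keyed by metric letter ('p', 'r', 'a', 'f1') and then by year. A minimal sketch for loading and poking at one of them (the path is an example -- point it at wherever wbce-tests.7z was unpacked):

import pickle

# any of the wbce-tests/<Filter>/avgs.pkl files has the same shape
with open('testing/wbce-tests/BodyTextExtractorFilter/avgs.pkl', 'rb') as f:
    avgs = pickle.load(f)   # ASCII protocol-0 pickle, loads under Python 2 or 3

print(avgs['www.latimes.com']['2015']['avg_f1'])
for year, f1 in sorted(avgs['mean_avgs']['f1'].items()):
    print(year, round(f1, 3))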
-------------------------------------------------------------------------------- /testing/wbce-tests/ContentCodeBlurringFilter/avgs.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'www.latimes.com' 3 | p1 4 | (dp2 5 | S'2015' 6 | p3 7 | (dp4 8 | S'avg_precision' 9 | p5 10 | F0.11616947582549013 11 | sS'avg_accuracy' 12 | p6 13 | F0.46920187976581246 14 | sS'avg_recall' 15 | p7 16 | F0.926135867337941 17 | sS'avg_f1' 18 | p8 19 | F0.20415005552512575 20 | ssS'2000' 21 | p9 22 | (dp10 23 | g5 24 | F0.8111021925809357 25 | sg6 26 | F0.8537055561468689 27 | sg7 28 | F0.9605102080217843 29 | sg8 30 | F0.877801211777537 31 | ssS'2010' 32 | p11 33 | (dp12 34 | g5 35 | F0.42023599354375335 36 | sg6 37 | F0.6603534273271459 38 | sg7 39 | F0.8530170148458562 40 | sg8 41 | F0.5331588765579497 42 | ssS'2005' 43 | p13 44 | (dp14 45 | g5 46 | F0.5986065482125335 47 | sg6 48 | F0.732706800977865 49 | sg7 50 | F0.926431498730239 51 | sg8 52 | F0.7148217930250259 53 | sssS'www.cnn.com' 54 | p15 55 | (dp16 56 | S'2015' 57 | p17 58 | (dp18 59 | g5 60 | F0.2894186892399318 61 | sg6 62 | F0.3718682438205035 63 | sg7 64 | F0.9324626726661884 65 | sg8 66 | F0.4242833956822243 67 | ssS'2000' 68 | p19 69 | (dp20 70 | g5 71 | F0.5996675436639709 72 | sg6 73 | F0.7323592962875213 74 | sg7 75 | F0.9319726161726481 76 | sg8 77 | F0.7189743800605608 78 | ssS'2010' 79 | p21 80 | (dp22 81 | g5 82 | F0.3597736757896344 83 | sg6 84 | F0.5197400703467359 85 | sg7 86 | F0.9121055127029071 87 | sg8 88 | F0.4978157825811003 89 | ssS'2005' 90 | p23 91 | (dp24 92 | g5 93 | F0.5452230974470502 94 | sg6 95 | F0.7664507868126309 96 | sg7 97 | F0.8900767403269831 98 | sg8 99 | F0.6713951669429999 100 | sssS'www.forbes.com' 101 | p25 102 | (dp26 103 | S'2015' 104 | p27 105 | (dp28 106 | g5 107 | F0.2957366925768594 108 | sg6 109 | F0.43853710798593426 110 | sg7 111 | F0.9183124693494872 112 | sg8 113 | F0.43798657689106757 114 | ssS'2000' 115 | p29 116 | (dp30 117 | g5 118 | F0.6396386293000841 119 | sg6 120 | F0.7482299781041003 121 | sg7 122 | F0.9330656665380409 123 | sg8 124 | F0.7457628111341499 125 | ssS'2010' 126 | p31 127 | (dp32 128 | g5 129 | F0.6366358379063284 130 | sg6 131 | F0.8015931978989604 132 | sg7 133 | F0.8973244165879032 134 | sg8 135 | F0.7317827934727054 136 | ssS'2005' 137 | p33 138 | (dp34 139 | g5 140 | F0.46550867158986875 141 | sg6 142 | F0.8134896128309964 143 | sg7 144 | F0.8111536548160493 145 | sg8 146 | F0.5830351844715977 147 | sssS'news.bbc.co.uk' 148 | p35 149 | (dp36 150 | S'2015' 151 | p37 152 | (dp38 153 | g5 154 | F0.11919826889537567 155 | sg6 156 | F0.5072052935997045 157 | sg7 158 | F0.8131840259715051 159 | sg8 160 | F0.20479987793041265 161 | ssS'2000' 162 | p39 163 | (dp40 164 | g5 165 | F0.6203488498249787 166 | sg6 167 | F0.7719210597081312 168 | sg7 169 | F0.9111056593826031 170 | sg8 171 | F0.73602729927279 172 | ssS'2010' 173 | p41 174 | (dp42 175 | g5 176 | F0.3116620725542801 177 | sg6 178 | F0.6107183145734446 179 | sg7 180 | F0.8465817716601981 181 | sg8 182 | F0.4428979480973926 183 | ssS'2005' 184 | p43 185 | (dp44 186 | g5 187 | F0.5984474642761501 188 | sg6 189 | F0.7647090426615936 190 | sg7 191 | F0.9175424430558543 192 | sg8 193 | F0.7187128295406526 194 | sssS'thenation.com' 195 | p45 196 | (dp46 197 | S'2015' 198 | p47 199 | (dp48 200 | g5 201 | F0.45609212979185026 202 | sg6 203 | F0.5884629716470389 204 | sg7 205 | F0.9020425513072606 206 | sg8 207 | F0.5886430733200099 208 | ssS'2000' 209 | p49 210 | (dp50 211 | g5 
212 | F0.8883697815371147 213 | sg6 214 | F0.8996873960112766 215 | sg7 216 | F0.956446821995258 217 | sg8 218 | F0.9200053550843839 219 | ssS'2010' 220 | p51 221 | (dp52 222 | g5 223 | F0.45163668180184763 224 | sg6 225 | F0.640214269422257 226 | sg7 227 | F0.8998899472625064 228 | sg8 229 | F0.5901444105419535 230 | ssS'2005' 231 | p53 232 | (dp54 233 | g5 234 | F0.7878965878803256 235 | sg6 236 | F0.8175606201703896 237 | sg7 238 | F0.9543100199795547 239 | sg8 240 | F0.8606465346855091 241 | sssS'www.nymag.com' 242 | p55 243 | (dp56 244 | S'2015' 245 | p57 246 | (dp58 247 | g5 248 | F0.044054500032989705 249 | sg6 250 | F0.17136458365310586 251 | sg7 252 | F0.7960899650737144 253 | sg8 254 | F0.06601879127669846 255 | ssS'2000' 256 | p59 257 | (dp60 258 | g5 259 | F0.8370761752694753 260 | sg6 261 | F0.889686567319923 262 | sg7 263 | F0.9353273159849615 264 | sg8 265 | F0.8707949425306105 266 | ssS'2010' 267 | p61 268 | (dp62 269 | g5 270 | F0.12616883801360262 271 | sg6 272 | F0.38642884874893196 273 | sg7 274 | F0.8069386641460552 275 | sg8 276 | F0.205709008596695 277 | ssS'2005' 278 | p63 279 | (dp64 280 | g5 281 | F0.5591489354125457 282 | sg6 283 | F0.6890375236691776 284 | sg7 285 | F0.9411507370178505 286 | sg8 287 | F0.6978964904115924 288 | sssS'entertainment.msn.com' 289 | p65 290 | (dp66 291 | S'2015' 292 | p67 293 | (dp68 294 | g5 295 | F0.07134534027116624 296 | sg6 297 | F0.5343765658847024 298 | sg7 299 | F0.8397069719251198 300 | sg8 301 | F0.1283561132707584 302 | ssS'2000' 303 | p69 304 | (dp70 305 | g5 306 | F0.6700112446095992 307 | sg6 308 | F0.8622989584868095 309 | sg7 310 | F0.8910749599860082 311 | sg8 312 | F0.7575812498608373 313 | ssS'2010' 314 | p71 315 | (dp72 316 | g5 317 | F0.31730289839466586 318 | sg6 319 | F0.49007838956361455 320 | sg7 321 | F0.8923212147968717 322 | sg8 323 | F0.44331640799871797 324 | ssS'2005' 325 | p73 326 | (dp74 327 | g5 328 | F0.41877132883633295 329 | sg6 330 | F0.6122458464258885 331 | sg7 332 | F0.8819647491325822 333 | sg8 334 | F0.5493943921725336 335 | sssS'www.esquire.com' 336 | p75 337 | (dp76 338 | S'2015' 339 | p77 340 | (dp78 341 | g5 342 | F0.06563692613337585 343 | sg6 344 | F0.32180996245028265 345 | sg7 346 | F0.7960846610536186 347 | sg8 348 | F0.11374893799509031 349 | ssS'2000' 350 | p79 351 | (dp80 352 | g5 353 | F0.8605989019941799 354 | sg6 355 | F0.9070391764783824 356 | sg7 357 | F0.9165174218787232 358 | sg8 359 | F0.8805870048908404 360 | ssS'2010' 361 | p81 362 | (dp82 363 | g5 364 | F0.35502210934852385 365 | sg6 366 | F0.5618179288837913 367 | sg7 368 | F0.8587718547200568 369 | sg8 370 | F0.4864279573305218 371 | ssS'2005' 372 | p83 373 | (dp84 374 | g5 375 | F0.8962966109245787 376 | sg6 377 | F0.936878478238269 378 | sg7 379 | F0.9509011644355638 380 | sg8 381 | F0.9216371477663517 382 | sssS'news.yahoo.com' 383 | p85 384 | (dp86 385 | S'2015' 386 | p87 387 | (dp88 388 | g5 389 | F0.018210151381855617 390 | sg6 391 | F0.3173418569689053 392 | sg7 393 | F0.7081125284163001 394 | sg8 395 | F0.035341779014767455 396 | ssS'2000' 397 | p89 398 | (dp90 399 | g5 400 | F0.7085868798180318 401 | sg6 402 | F0.8130165212383264 403 | sg7 404 | F0.9276842948098016 405 | sg8 406 | F0.7967306659258948 407 | ssS'2010' 408 | p91 409 | (dp92 410 | g5 411 | F0.3842242600405546 412 | sg6 413 | F0.6307431607134734 414 | sg7 415 | F0.831653904040496 416 | sg8 417 | F0.5157002834267875 418 | ssS'2005' 419 | p93 420 | (dp94 421 | g5 422 | F0.5720537517102889 423 | sg6 424 | F0.7387829146112146 425 | sg7 426 | 
F0.9225919473742025 427 | sg8 428 | F0.6931510896145612 429 | sssS'www.foxnews.com' 430 | p95 431 | (dp96 432 | S'2015' 433 | p97 434 | (dp98 435 | g5 436 | F0.5507998052113364 437 | sg6 438 | F0.7050861736184609 439 | sg7 440 | F0.9270176088617623 441 | sg8 442 | F0.6806404084243549 443 | ssS'2000' 444 | p99 445 | (dp100 446 | g5 447 | F0.8038458494347899 448 | sg6 449 | F0.8904233231452978 450 | sg7 451 | F0.9276480687811841 452 | sg8 453 | F0.8570256992468454 454 | ssS'2010' 455 | p101 456 | (dp102 457 | g5 458 | F0.33226391278577666 459 | sg6 460 | F0.626116225677421 461 | sg7 462 | F0.8757692680353659 463 | sg8 464 | F0.46587171576864456 465 | ssS'2005' 466 | p103 467 | (dp104 468 | g5 469 | F0.3258139976914447 470 | sg6 471 | F0.47924742745497784 472 | sg7 473 | F0.9394270160407397 474 | sg8 475 | F0.4699003469424067 476 | sssS'mean_avgs' 477 | p105 478 | (dp106 479 | S'a' 480 | p107 481 | (dp108 482 | S'2015' 483 | p109 484 | F0.4425254639394451 485 | sS'2000' 486 | p110 487 | F0.8368367832926638 488 | sS'2010' 489 | p111 490 | F0.5927803833155777 491 | sS'2005' 492 | p112 493 | F0.7351109053853003 494 | ssS'p' 495 | p113 496 | (dp114 497 | g109 498 | F0.20266619793602308 499 | sg110 500 | F0.743924604803316 501 | sg111 502 | F0.3694926280178968 503 | sg112 504 | F0.5767766993981119 505 | ssS'r' 506 | p115 507 | (dp116 508 | g109 509 | F0.8559149321962897 510 | sg110 511 | F0.9291353033551015 512 | sg111 513 | F0.8674373568798217 514 | sg112 515 | F0.913554997090962 516 | ssS'f1' 517 | p117 518 | (dp118 519 | g109 520 | F0.28839690093305104 521 | sg110 522 | F0.8161290619784449 523 | sg111 524 | F0.4912825184372468 525 | sg112 526 | F0.6880590975573231 527 | sss. -------------------------------------------------------------------------------- /testing/wbce-tests/DocumentSlopeCurveFilter/avgs.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'www.latimes.com' 3 | p1 4 | (dp2 5 | S'2015' 6 | p3 7 | (dp4 8 | S'avg_precision' 9 | p5 10 | F0.13362793010722546 11 | sS'avg_accuracy' 12 | p6 13 | F0.5640730627271222 14 | sS'avg_recall' 15 | p7 16 | F0.6105099272553126 17 | sS'avg_f1' 18 | p8 19 | F0.21636299188862934 20 | ssS'2000' 21 | p9 22 | (dp10 23 | g5 24 | F0.8608073651801277 25 | sg6 26 | F0.8710840027081056 27 | sg7 28 | F0.9168894383171451 29 | sg8 30 | F0.8870460182155111 31 | ssS'2010' 32 | p11 33 | (dp12 34 | g5 35 | F0.4775014374694188 36 | sg6 37 | F0.7300415856892406 38 | sg7 39 | F0.7501699767053438 40 | sg8 41 | F0.5645140360715292 42 | ssS'2005' 43 | p13 44 | (dp14 45 | g5 46 | F0.62804991992724 47 | sg6 48 | F0.7457469193144748 49 | sg7 50 | F0.8624153529517319 51 | sg8 52 | F0.7147534515518724 53 | sssS'www.cnn.com' 54 | p15 55 | (dp16 56 | S'2015' 57 | p17 58 | (dp18 59 | g5 60 | F0.31429130410851036 61 | sg6 62 | F0.4344537969911349 63 | sg7 64 | F0.7989862630730329 65 | sg8 66 | F0.4353724215114172 67 | ssS'2000' 68 | p19 69 | (dp20 70 | g5 71 | F0.814601457327025 72 | sg6 73 | F0.8357306606628021 74 | sg7 75 | F0.7818753009578961 76 | sg8 77 | F0.7896223422120898 78 | ssS'2010' 79 | p21 80 | (dp22 81 | g5 82 | F0.4253522345143547 83 | sg6 84 | F0.6314854218762812 85 | sg7 86 | F0.7867452410355671 87 | sg8 88 | F0.5390458385822727 89 | ssS'2005' 90 | p23 91 | (dp24 92 | g5 93 | F0.6138419519432511 94 | sg6 95 | F0.80313490015239 96 | sg7 97 | F0.8170393374481099 98 | sg8 99 | F0.6969981160993483 100 | sssS'www.forbes.com' 101 | p25 102 | (dp26 103 | S'2015' 104 | p27 105 | (dp28 106 | g5 107 | F0.34104189997176576 108 
| sg6 109 | F0.5295997580705213 110 | sg7 111 | F0.8133666106234162 112 | sg8 113 | F0.47083848493097724 114 | ssS'2000' 115 | p29 116 | (dp30 117 | g5 118 | F0.6696646303443038 119 | sg6 120 | F0.7531743318132633 121 | sg7 122 | F0.8580672164891681 123 | sg8 124 | F0.7397309776730646 125 | ssS'2010' 126 | p31 127 | (dp32 128 | g5 129 | F0.6700458372321113 130 | sg6 131 | F0.801389357969293 132 | sg7 133 | F0.791816010216907 134 | sg8 135 | F0.7111900747540568 136 | ssS'2005' 137 | p33 138 | (dp34 139 | g5 140 | F0.6725911369907592 141 | sg6 142 | F0.9015390713872321 143 | sg7 144 | F0.7316324009776635 145 | sg8 146 | F0.6945187976598736 147 | sssS'news.bbc.co.uk' 148 | p35 149 | (dp36 150 | S'2015' 151 | p37 152 | (dp38 153 | g5 154 | F0.13348385856834152 155 | sg6 156 | F0.5590850414538149 157 | sg7 158 | F0.7044027599974059 159 | sg8 160 | F0.22014817634676753 161 | ssS'2000' 162 | p39 163 | (dp40 164 | g5 165 | F0.9269321571721406 166 | sg6 167 | F0.8884687956422209 168 | sg7 169 | F0.763953673648995 170 | sg8 171 | F0.836663386622469 172 | ssS'2010' 173 | p41 174 | (dp42 175 | g5 176 | F0.41579101137917696 177 | sg6 178 | F0.7308280367222199 179 | sg7 180 | F0.7194407444111306 181 | sg8 182 | F0.5136769342918257 183 | ssS'2005' 184 | p43 185 | (dp44 186 | g5 187 | F0.8696766652605129 188 | sg6 189 | F0.8688825703286615 190 | sg7 191 | F0.7628470203880643 192 | sg8 193 | F0.8070844323139519 194 | sssS'thenation.com' 195 | p45 196 | (dp46 197 | S'2015' 198 | p47 199 | (dp48 200 | g5 201 | F0.5045095303547008 202 | sg6 203 | F0.6554436803185737 204 | sg7 205 | F0.844982270798196 206 | sg8 207 | F0.6176490638557665 208 | ssS'2000' 209 | p49 210 | (dp50 211 | g5 212 | F0.9264909687190436 213 | sg6 214 | F0.8213611321548475 215 | sg7 216 | F0.8004925341476387 217 | sg8 218 | F0.8520422864544532 219 | ssS'2010' 220 | p51 221 | (dp52 222 | g5 223 | F0.5713283942566657 224 | sg6 225 | F0.7374682800819208 226 | sg7 227 | F0.7368048150492504 228 | sg8 229 | F0.633614584648842 230 | ssS'2005' 231 | p53 232 | (dp54 233 | g5 234 | F0.9004401876670914 235 | sg6 236 | F0.8376794236576882 237 | sg7 238 | F0.8312027885459679 239 | sg8 240 | F0.8600106073208851 241 | sssS'www.nymag.com' 242 | p55 243 | (dp56 244 | S'2015' 245 | p57 246 | (dp58 247 | g5 248 | F0.042433386182753455 249 | sg6 250 | F0.20001345478004184 251 | sg7 252 | F0.5733902879619535 253 | sg8 254 | F0.061638344270854226 255 | ssS'2000' 256 | p59 257 | (dp60 258 | g5 259 | F0.8780918332144314 260 | sg6 261 | F0.8949653955995845 262 | sg7 263 | F0.8847969887068878 264 | sg8 265 | F0.8721098896116841 266 | ssS'2010' 267 | p61 268 | (dp62 269 | g5 270 | F0.19291701995638091 271 | sg6 272 | F0.6243345852588521 273 | sg7 274 | F0.6913373673163812 275 | sg8 276 | F0.28335290641161315 277 | ssS'2005' 278 | p63 279 | (dp64 280 | g5 281 | F0.647029606678659 282 | sg6 283 | F0.7477679494762822 284 | sg7 285 | F0.8426172623555896 286 | sg8 287 | F0.7278174420301338 288 | sssS'entertainment.msn.com' 289 | p65 290 | (dp66 291 | S'2015' 292 | p67 293 | (dp68 294 | g5 295 | F0.07619402011443083 296 | sg6 297 | F0.5683632505026852 298 | sg7 299 | F0.7424385848576286 300 | sg8 301 | F0.13432762095063344 302 | ssS'2000' 303 | p69 304 | (dp70 305 | g5 306 | F0.8307937669693192 307 | sg6 308 | F0.908672620029267 309 | sg7 310 | F0.8207658345948943 311 | sg8 312 | F0.8194960219785803 313 | ssS'2010' 314 | p71 315 | (dp72 316 | g5 317 | F0.4615173559036399 318 | sg6 319 | F0.6611911433400879 320 | sg7 321 | F0.8158742777301178 322 | sg8 323 | 
F0.554329139143541 324 | ssS'2005' 325 | p73 326 | (dp74 327 | g5 328 | F0.4385329315775029 329 | sg6 330 | F0.6371303369249157 331 | sg7 332 | F0.8385107333987197 333 | sg8 334 | F0.5600144932769493 335 | sssS'www.esquire.com' 336 | p75 337 | (dp76 338 | S'2015' 339 | p77 340 | (dp78 341 | g5 342 | F0.06688037449997622 343 | sg6 344 | F0.33850779320545665 345 | sg7 346 | F0.7483055257527502 347 | sg8 348 | F0.11391390720917939 349 | ssS'2000' 350 | p79 351 | (dp80 352 | g5 353 | F0.8958452423944537 354 | sg6 355 | F0.9112124301109888 356 | sg7 357 | F0.8482955195734273 358 | sg8 359 | F0.8664160242741558 360 | ssS'2010' 361 | p81 362 | (dp82 363 | g5 364 | F0.4108209241176574 365 | sg6 366 | F0.6348920726696297 367 | sg7 368 | F0.7732810849157287 369 | sg8 370 | F0.5184988347803462 371 | ssS'2005' 372 | p83 373 | (dp84 374 | g5 375 | F0.92854392161831 376 | sg6 377 | F0.9400540057619294 378 | sg7 379 | F0.9198915366390971 380 | sg8 381 | F0.9221331129385862 382 | sssS'news.yahoo.com' 383 | p85 384 | (dp86 385 | S'2015' 386 | p87 387 | (dp88 388 | g5 389 | F0.018287140557954475 390 | sg6 391 | F0.32301290969502494 392 | sg7 393 | F0.6713419906698788 394 | sg8 395 | F0.03543243516903241 396 | ssS'2000' 397 | p89 398 | (dp90 399 | g5 400 | F0.845171687583807 401 | sg6 402 | F0.8759148642457276 403 | sg7 404 | F0.8538566221913284 405 | sg8 406 | F0.8465503754031216 407 | ssS'2010' 408 | p91 409 | (dp92 410 | g5 411 | F0.5688487055711114 412 | sg6 413 | F0.7923965934651857 414 | sg7 415 | F0.7470244888565927 416 | sg8 417 | F0.6386090681025633 418 | ssS'2005' 419 | p93 420 | (dp94 421 | g5 422 | F0.6604807297526994 423 | sg6 424 | F0.7923291478637629 425 | sg7 426 | F0.822788744711171 427 | sg8 428 | F0.7230241200714896 429 | sssS'www.foxnews.com' 430 | p95 431 | (dp96 432 | S'2015' 433 | p97 434 | (dp98 435 | g5 436 | F0.7578331189482941 437 | sg6 438 | F0.8419532822418703 439 | sg7 440 | F0.7897098888097596 441 | sg8 442 | F0.7702705115416534 443 | ssS'2000' 444 | p99 445 | (dp100 446 | g5 447 | F0.8939153820029523 448 | sg6 449 | F0.8595958572289699 450 | sg7 451 | F0.7956644395158875 452 | sg8 453 | F0.8199633053604782 454 | ssS'2010' 455 | p101 456 | (dp102 457 | g5 458 | F0.4429766402972281 459 | sg6 460 | F0.7648768556425849 461 | sg7 462 | F0.7616822029672544 463 | sg8 464 | F0.5492804006344774 465 | ssS'2005' 466 | p103 467 | (dp104 468 | g5 469 | F0.3291739634988008 470 | sg6 471 | F0.48055033553055443 472 | sg7 473 | F0.8687604694520127 474 | sg8 475 | F0.461037993189315 476 | sssS'mean_avgs' 477 | p105 478 | (dp106 479 | S'a' 480 | p107 481 | (dp108 482 | S'2015' 483 | p109 484 | F0.5014506029986247 485 | sS'2000' 486 | p110 487 | F0.8620180090195776 488 | sS'2010' 489 | p111 490 | F0.7108903932715295 491 | sS'2005' 492 | p112 493 | F0.7754814660397891 494 | ssS'p' 495 | p113 496 | (dp114 497 | g109 498 | F0.23885825634139532 499 | sg110 500 | F0.8542314490907603 501 | sg111 502 | F0.4637099560697746 503 | sg112 504 | F0.6688361014914828 505 | ssS'r' 506 | p115 507 | (dp116 508 | g109 509 | F0.7297434109799334 510 | sg110 511 | F0.8324657568143268 512 | sg111 513 | F0.7574176209204273 514 | sg112 515 | F0.8297705646868128 516 | ssS'f1' 517 | p117 518 | (dp118 519 | g109 520 | F0.3075953957674911 521 | sg110 522 | F0.8329640627805608 523 | sg111 524 | F0.5506111817421068 525 | sg112 526 | F0.7167392566452404 527 | sss. 
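
Since every per-extractor dump shares that shape, the per-domain, per-year averages flatten naturally into rows. A sketch (Python 3; input and output file names here are hypothetical) that turns one dump into a CSV:

import csv
import pickle

with open('testing/wbce-tests/DocumentSlopeCurveFilter/avgs.pkl', 'rb') as f:
    avgs = pickle.load(f)

with open('dsc_avgs.csv', 'w', newline='') as out:
    w = csv.writer(out)
    w.writerow(['domain', 'year', 'precision', 'recall', 'accuracy', 'f1'])
    for domain, years in sorted(avgs.items()):
        if domain == 'mean_avgs':          # the summary block has a different shape
            continue
        for year, m in sorted(years.items()):
            w.writerow([domain, year, m['avg_precision'], m['avg_recall'],
                        m['avg_accuracy'], m['avg_f1']])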
-------------------------------------------------------------------------------- /testing/wbce-tests/FeatureExtractorDomFilter/avgs.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'www.latimes.com' 3 | p1 4 | (dp2 5 | S'2015' 6 | p3 7 | (dp4 8 | S'avg_precision' 9 | p5 10 | F0.03862377662399559 11 | sS'avg_accuracy' 12 | p6 13 | F0.5475209655413935 14 | sS'avg_recall' 15 | p7 16 | F0.19660948355510707 17 | sS'avg_f1' 18 | p8 19 | F0.06358675385508622 20 | ssS'2000' 21 | p9 22 | (dp10 23 | g5 24 | F0.3108694782857746 25 | sg6 26 | F0.4950727101198374 27 | sg7 28 | F0.26258302039666276 29 | sg8 30 | F0.2572398528052871 31 | ssS'2010' 32 | p11 33 | (dp12 34 | g5 35 | F0.12564869282111016 36 | sg6 37 | F0.5845739906129119 38 | sg7 39 | F0.13280825382933611 40 | sg8 41 | F0.1086130914184433 42 | ssS'2005' 43 | p13 44 | (dp14 45 | g5 46 | F0.5836816702543528 47 | sg6 48 | F0.7362668094160602 49 | sg7 50 | F0.8417770250448909 51 | sg8 52 | F0.6780446646395671 53 | sssS'www.cnn.com' 54 | p15 55 | (dp16 56 | S'2015' 57 | p17 58 | (dp18 59 | g5 60 | F0.06418395143820504 61 | sg6 62 | F0.3011226834185275 63 | sg7 64 | F0.13817902619814443 65 | sg8 66 | F0.07999234183847852 67 | ssS'2000' 68 | p19 69 | (dp20 70 | g5 71 | F0.36240339186286513 72 | sg6 73 | F0.5692865765216962 74 | sg7 75 | F0.060388862603593246 76 | sg8 77 | F0.09763321585344725 78 | ssS'2010' 79 | p21 80 | (dp22 81 | g5 82 | F0.14134634671225613 83 | sg6 84 | F0.5131224986476233 85 | sg7 86 | F0.14818226943301183 87 | sg8 88 | F0.13358999942120353 89 | ssS'2005' 90 | p23 91 | (dp24 92 | g5 93 | F0.13467802471805546 94 | sg6 95 | F0.6076901575672266 96 | sg7 97 | F0.08002637305794259 98 | sg8 99 | F0.09616808818295225 100 | sssS'www.forbes.com' 101 | p25 102 | (dp26 103 | S'2015' 104 | p27 105 | (dp28 106 | g5 107 | F0.09245746028554677 108 | sg6 109 | F0.42323511783625356 110 | sg7 111 | F0.14637177598698922 112 | sg8 113 | F0.10616140086429192 114 | ssS'2000' 115 | p29 116 | (dp30 117 | g5 118 | F0.19561991680653532 119 | sg6 120 | F0.39922170353996267 121 | sg7 122 | F0.08571777493843516 123 | sg8 124 | F0.10854287129287789 125 | ssS'2010' 126 | p31 127 | (dp32 128 | g5 129 | F0.26944679283076217 130 | sg6 131 | F0.5954451775364666 132 | sg7 133 | F0.12339202789905615 134 | sg8 135 | F0.16095276230949238 136 | ssS'2005' 137 | p33 138 | (dp34 139 | g5 140 | F0.3108646536464451 141 | sg6 142 | F0.8189343375437297 143 | sg7 144 | F0.3648716498560567 145 | sg8 146 | F0.3213048585638684 147 | sssS'news.bbc.co.uk' 148 | p35 149 | (dp36 150 | S'2015' 151 | p37 152 | (dp38 153 | g5 154 | F0.03774622737251699 155 | sg6 156 | F0.5406490377063605 157 | sg7 158 | F0.20599176592001542 159 | sg8 160 | F0.0619876761909949 161 | ssS'2000' 162 | p39 163 | (dp40 164 | g5 165 | F0.48511903626399655 166 | sg6 167 | F0.6525910554046321 168 | sg7 169 | F0.054431874903996175 170 | sg8 171 | F0.09683742803582207 172 | ssS'2010' 173 | p41 174 | (dp42 175 | g5 176 | F0.060235937357474324 177 | sg6 178 | F0.6460925063571413 179 | sg7 180 | F0.08121106276235718 181 | sg8 182 | F0.06066186807175994 183 | ssS'2005' 184 | p43 185 | (dp44 186 | g5 187 | F0.42373560283340894 188 | sg6 189 | F0.6414624132040798 190 | sg7 191 | F0.06442630752520787 192 | sg8 193 | F0.1061185047176225 194 | sssS'thenation.com' 195 | p45 196 | (dp46 197 | S'2015' 198 | p47 199 | (dp48 200 | g5 201 | F0.05919721060422903 202 | sg6 203 | F0.36620592656232953 204 | sg7 205 | F0.08258572837938064 206 | sg8 207 | F0.055833167469511974 208 
| ssS'2000' 209 | p49 210 | (dp50 211 | g5 212 | F0.5940573521660849 213 | sg6 214 | F0.37205129275932597 215 | sg7 216 | F0.12293561227475486 217 | sg8 218 | F0.19276604983481604 219 | ssS'2010' 220 | p51 221 | (dp52 222 | g5 223 | F0.20247773008213288 224 | sg6 225 | F0.6615429667487299 226 | sg7 227 | F0.059078121251345435 228 | sg8 229 | F0.0900074909766672 230 | ssS'2005' 231 | p53 232 | (dp54 233 | g5 234 | F0.36475352093750507 235 | sg6 236 | F0.3982831142892041 237 | sg7 238 | F0.06460122266976477 239 | sg8 240 | F0.10681061090808516 241 | sssS'www.nymag.com' 242 | p55 243 | (dp56 244 | S'2015' 245 | p57 246 | (dp58 247 | g5 248 | F0.007736638561868903 249 | sg6 250 | F0.17761674254715634 251 | sg7 252 | F0.20180235024542753 253 | sg8 254 | F0.010360786397247686 255 | ssS'2000' 256 | p59 257 | (dp60 258 | g5 259 | F0.6510800190870829 260 | sg6 261 | F0.5849852708130819 262 | sg7 263 | F0.1289885431845319 264 | sg8 265 | F0.20703329310535568 266 | ssS'2010' 267 | p61 268 | (dp62 269 | g5 270 | F0.10996307421215494 271 | sg6 272 | F0.5881590366858122 273 | sg7 274 | F0.4592625068901346 275 | sg8 276 | F0.1598092857436456 277 | ssS'2005' 278 | p63 279 | (dp64 280 | g5 281 | F0.29333271544410583 282 | sg6 283 | F0.5003691436647126 284 | sg7 285 | F0.18823697490955785 286 | sg8 287 | F0.21894693150333636 288 | sssS'entertainment.msn.com' 289 | p65 290 | (dp66 291 | S'2015' 292 | p67 293 | (dp68 294 | g5 295 | F0.01665606784896567 296 | sg6 297 | F0.5916375651628714 298 | sg7 299 | F0.15520839620770052 300 | sg8 301 | F0.029043956673389815 302 | ssS'2000' 303 | p69 304 | (dp70 305 | g5 306 | F0.4101324917837275 307 | sg6 308 | F0.727552972585202 309 | sg7 310 | F0.16259403751412904 311 | sg8 312 | F0.21222021473721056 313 | ssS'2010' 314 | p71 315 | (dp72 316 | g5 317 | F0.1298783825779457 318 | sg6 319 | F0.5836480678408781 320 | sg7 321 | F0.1278048434725496 322 | sg8 323 | F0.11219706096219867 324 | ssS'2005' 325 | p73 326 | (dp74 327 | g5 328 | F0.13401296466303134 329 | sg6 330 | F0.43490657773545416 331 | sg7 332 | F0.2510376892473985 333 | sg8 334 | F0.1509224712249121 335 | sssS'www.esquire.com' 336 | p75 337 | (dp76 338 | S'2015' 339 | p77 340 | (dp78 341 | g5 342 | F0.03007502159405593 343 | sg6 344 | F0.3251803285202408 345 | sg7 346 | F0.3342219116135242 347 | sg8 348 | F0.04957560749620002 349 | ssS'2000' 350 | p79 351 | (dp80 352 | g5 353 | F0.36353215206468886 354 | sg6 355 | F0.6285782663481376 356 | sg7 357 | F0.06656425254140755 358 | sg8 359 | F0.09421369283864033 360 | ssS'2010' 361 | p81 362 | (dp82 363 | g5 364 | F0.07315564039653401 365 | sg6 366 | F0.4853529626854926 367 | sg7 368 | F0.10412010412284012 369 | sg8 370 | F0.07771862647936642 371 | ssS'2005' 372 | p83 373 | (dp84 374 | g5 375 | F0.5122452215303767 376 | sg6 377 | F0.6055008627163937 378 | sg7 379 | F0.06701494451981742 380 | sg8 381 | F0.11640452148728203 382 | sssS'news.yahoo.com' 383 | p85 384 | (dp86 385 | S'2015' 386 | p87 387 | (dp88 388 | g5 389 | F0.013623785917849206 390 | sg6 391 | F0.3282927623605152 392 | sg7 393 | F0.42764807979045144 394 | sg8 395 | F0.02623372965102739 396 | ssS'2000' 397 | p89 398 | (dp90 399 | g5 400 | F0.42620741281304775 401 | sg6 402 | F0.5794007535673233 403 | sg7 404 | F0.06297910083230109 405 | sg8 406 | F0.10655436205225266 407 | ssS'2010' 408 | p91 409 | (dp92 410 | g5 411 | F0.17931260635945423 412 | sg6 413 | F0.65352495994737 414 | sg7 415 | F0.12229373715131016 416 | sg8 417 | F0.13595107557165734 418 | ssS'2005' 419 | p93 420 | (dp94 421 | g5 422 | 
F0.19753138142997795 423 | sg6 424 | F0.5446968776995572 425 | sg7 426 | F0.09911487870385352 427 | sg8 428 | F0.11784895010083458 429 | sssS'www.foxnews.com' 430 | p95 431 | (dp96 432 | S'2015' 433 | p97 434 | (dp98 435 | g5 436 | F0.375579362581543 437 | sg6 438 | F0.5800834160536803 439 | sg7 440 | F0.11685239448671766 441 | sg8 442 | F0.16803159074135582 443 | ssS'2000' 444 | p99 445 | (dp100 446 | g5 447 | F0.6306054822791173 448 | sg6 449 | F0.5952388658762817 450 | sg7 451 | F0.07668772578264597 452 | sg8 453 | F0.1161012178956622 454 | ssS'2010' 455 | p101 456 | (dp102 457 | g5 458 | F0.04998551810329787 459 | sg6 460 | F0.6375280859644399 461 | sg7 462 | F0.05425512716402376 463 | sg8 464 | F0.047394494117547474 465 | ssS'2005' 466 | p103 467 | (dp104 468 | g5 469 | F0.08660519833502513 470 | sg6 471 | F0.3044000109137328 472 | sg7 473 | F0.16218509687112367 474 | sg8 475 | F0.10530498888291785 476 | sssS'mean_avgs' 477 | p105 478 | (dp106 479 | S'a' 480 | p107 481 | (dp108 482 | S'2015' 483 | p109 484 | F0.4181544545709329 485 | sS'2000' 486 | p110 487 | F0.560397946753548 488 | sS'2010' 489 | p111 490 | F0.5948990253026867 491 | sS'2005' 492 | p112 493 | F0.5592510304750151 494 | ssS'p' 495 | p113 496 | (dp114 497 | g109 498 | F0.07358795028287761 499 | sg110 500 | F0.442962673341292 501 | sg111 502 | F0.13414507214531224 503 | sg112 504 | F0.3041440953792285 505 | ssS'r' 506 | p115 507 | (dp116 508 | g109 509 | F0.2005470912383458 510 | sg110 511 | F0.10838708049724577 512 | sg111 513 | F0.14124080539759648 514 | sg112 515 | F0.21832921624056137 516 | ssS'f1' 517 | p117 518 | (dp118 519 | g109 520 | F0.06508070111775842 521 | sg110 522 | F0.14891421984513717 523 | sg111 524 | F0.1086895755071982 525 | sg112 526 | F0.20178745902113784 527 | sss. 
-------------------------------------------------------------------------------- /testing/wbce-tests/FeatureExtractorSplitFilter/avgs.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'www.latimes.com' 3 | p1 4 | (dp2 5 | S'2015' 6 | p3 7 | (dp4 8 | S'avg_precision' 9 | p5 10 | F0.042838971662592956 11 | sS'avg_accuracy' 12 | p6 13 | F0.5396451026698058 14 | sS'avg_recall' 15 | p7 16 | F0.2143386372252256 17 | sS'avg_f1' 18 | p8 19 | F0.07024946573629429 20 | ssS'2000' 21 | p9 22 | (dp10 23 | g5 24 | F0.259452776657885 25 | sg6 26 | F0.36496019960007986 27 | sg7 28 | F0.009278967151042651 29 | sg8 30 | F0.017488035262633123 31 | ssS'2010' 32 | p11 33 | (dp12 34 | g5 35 | F0.09076425485857424 36 | sg6 37 | F0.5842350688784347 38 | sg7 39 | F0.10755380819352929 40 | sg8 41 | F0.08352638379493534 42 | ssS'2005' 43 | p13 44 | (dp14 45 | g5 46 | F0.05359185225411894 47 | sg6 48 | F0.41814073165472027 49 | sg7 50 | F0.028820574276968523 51 | sg8 52 | F0.03237489913322082 53 | sssS'www.cnn.com' 54 | p15 55 | (dp16 56 | S'2015' 57 | p17 58 | (dp18 59 | g5 60 | F0.06889188795616198 61 | sg6 62 | F0.29879860863581437 63 | sg7 64 | F0.1476782218410179 65 | sg8 66 | F0.0858228128619211 67 | ssS'2000' 68 | p19 69 | (dp20 70 | g5 71 | F0.257515252955112 72 | sg6 73 | F0.5596886295786784 74 | sg7 75 | F0.043054322432346866 76 | sg8 77 | F0.06894351494374788 78 | ssS'2010' 79 | p21 80 | (dp22 81 | g5 82 | F0.10473888905032128 83 | sg6 84 | F0.5040686253245074 85 | sg7 86 | F0.10738992305131045 87 | sg8 88 | F0.09747633816857082 89 | ssS'2005' 90 | p23 91 | (dp24 92 | g5 93 | F0.059986740214574584 94 | sg6 95 | F0.5952464472193412 96 | sg7 97 | F0.03199585169719729 98 | sg8 99 | F0.039755999691476245 100 | sssS'www.forbes.com' 101 | p25 102 | (dp26 103 | S'2015' 104 | p27 105 | (dp28 106 | g5 107 | F0.08303484471116346 108 | sg6 109 | F0.42621213769760447 110 | sg7 111 | F0.1301063446624315 112 | sg8 113 | F0.09476040556826998 114 | ssS'2000' 115 | p29 116 | (dp30 117 | g5 118 | F0.06402753069155678 119 | sg6 120 | F0.37130669858762383 121 | sg7 122 | F0.024578900699650907 123 | sg8 124 | F0.03017986268669128 125 | ssS'2010' 126 | p31 127 | (dp32 128 | g5 129 | F0.12450890050980233 130 | sg6 131 | F0.5706953221243913 132 | sg7 133 | F0.05086602159400552 134 | sg8 135 | F0.06494777174860772 136 | ssS'2005' 137 | p33 138 | (dp34 139 | g5 140 | F0.05855408531900379 141 | sg6 142 | F0.5856900002845602 143 | sg7 144 | F0.10188154741869238 145 | sg8 146 | F0.06806905850296827 147 | sssS'news.bbc.co.uk' 148 | p35 149 | (dp36 150 | S'2015' 151 | p37 152 | (dp38 153 | g5 154 | F0.0343743719528452 155 | sg6 156 | F0.539281506587864 157 | sg7 158 | F0.18783213261199372 159 | sg8 160 | F0.05640772769120938 161 | ssS'2000' 162 | p39 163 | (dp40 164 | g5 165 | F0.20720090593774804 166 | sg6 167 | F0.6397461198718591 168 | sg7 169 | F0.015577491971960358 170 | sg8 171 | F0.028792305664383188 172 | ssS'2010' 173 | p41 174 | (dp42 175 | g5 176 | F0.030801142370060597 177 | sg6 178 | F0.6403730061114464 179 | sg7 180 | F0.03977761567488316 181 | sg8 182 | F0.030539753789960985 183 | ssS'2005' 184 | p43 185 | (dp44 186 | g5 187 | F0.17945710895699843 188 | sg6 189 | F0.6283139190882554 190 | sg7 191 | F0.018095108334784896 192 | sg8 193 | F0.0310333056622939 194 | sssS'thenation.com' 195 | p45 196 | (dp46 197 | S'2015' 198 | p47 199 | (dp48 200 | g5 201 | F0.04517924477576478 202 | sg6 203 | F0.36039637208968034 204 | sg7 205 | F0.043749082272892155 206 | sg8 207 | 
F0.03703408605097151 208 | ssS'2000' 209 | p49 210 | (dp50 211 | g5 212 | F0.13596091613435268 213 | sg6 214 | F0.31253981907438677 215 | sg7 216 | F0.0082908494952558 217 | sg8 218 | F0.015216115825885682 219 | ssS'2010' 220 | p51 221 | (dp52 222 | g5 223 | F0.17286342176717917 224 | sg6 225 | F0.6247750599716096 226 | sg7 227 | F0.30959223821903464 228 | sg8 229 | F0.1923349474934642 230 | ssS'2005' 231 | p53 232 | (dp54 233 | g5 234 | F0.3682191437488494 235 | sg6 236 | F0.3964650854480107 237 | sg7 238 | F0.05403360107032527 239 | sg8 240 | F0.09208525907298962 241 | sssS'www.nymag.com' 242 | p55 243 | (dp56 244 | S'2015' 245 | p57 246 | (dp58 247 | g5 248 | F0.026359227184905656 249 | sg6 250 | F0.1915798753761126 251 | sg7 252 | F0.24765930535331457 253 | sg8 254 | F0.03306305786876614 255 | ssS'2000' 256 | p59 257 | (dp60 258 | g5 259 | F0.7097155883500723 260 | sg6 261 | F0.834101853884513 262 | sg7 263 | F0.8866872632144056 264 | sg8 265 | F0.7720814250017534 266 | ssS'2010' 267 | p61 268 | (dp62 269 | g5 270 | F0.044021800471251676 271 | sg6 272 | F0.5662574872803382 273 | sg7 274 | F0.15819273015075444 275 | sg8 276 | F0.059832097832006076 277 | ssS'2005' 278 | p63 279 | (dp64 280 | g5 281 | F0.06628139954248631 282 | sg6 283 | F0.44630766029462815 284 | sg7 285 | F0.0310560496089806 286 | sg8 287 | F0.040844543592929244 288 | sssS'entertainment.msn.com' 289 | p65 290 | (dp66 291 | S'2015' 292 | p67 293 | (dp68 294 | g5 295 | F0.019098335185455066 296 | sg6 297 | F0.5989437196681083 298 | sg7 299 | F0.17421440823574907 300 | sg8 301 | F0.033283615734092796 302 | ssS'2000' 303 | p69 304 | (dp70 305 | g5 306 | F0.1207254953634531 307 | sg6 308 | F0.6930479035029173 309 | sg7 310 | F0.026388552487926424 311 | sg8 312 | F0.039727516310454646 313 | ssS'2010' 314 | p71 315 | (dp72 316 | g5 317 | F0.048415999768495664 318 | sg6 319 | F0.3982429160282593 320 | sg7 321 | F0.10240618912367663 322 | sg8 323 | F0.048631912927502664 324 | ssS'2005' 325 | p73 326 | (dp74 327 | g5 328 | F0.022684054643112104 329 | sg6 330 | F0.40722411431097855 331 | sg7 332 | F0.02841699198602758 333 | sg8 334 | F0.02223251863755825 335 | sssS'www.esquire.com' 336 | p75 337 | (dp76 338 | S'2015' 339 | p77 340 | (dp78 341 | g5 342 | F0.029511317676966412 343 | sg6 344 | F0.3247508394943719 345 | sg7 346 | F0.31729511677603983 347 | sg8 348 | F0.04847777451655669 349 | ssS'2000' 350 | p79 351 | (dp80 352 | g5 353 | F0.08509745093616063 354 | sg6 355 | F0.6136468759685759 356 | sg7 357 | F0.017694846805562613 358 | sg8 359 | F0.013225429344492903 360 | ssS'2010' 361 | p81 362 | (dp82 363 | g5 364 | F0.04548066527479219 365 | sg6 366 | F0.47556130510908495 367 | sg7 368 | F0.06389027139069726 369 | sg8 370 | F0.04710949356863442 371 | ssS'2005' 372 | p83 373 | (dp84 374 | g5 375 | F0.15894624504628735 376 | sg6 377 | F0.5808538159743603 378 | sg7 379 | F0.00866946700137356 380 | sg8 381 | F0.01634334468783556 382 | sssS'news.yahoo.com' 383 | p85 384 | (dp86 385 | S'2015' 386 | p87 387 | (dp88 388 | g5 389 | F0.01360822373383575 390 | sg6 391 | F0.3280975508678668 392 | sg7 393 | F0.427371360117364 394 | sg8 395 | F0.0262041010947494 396 | ssS'2000' 397 | p89 398 | (dp90 399 | g5 400 | F0.20901849585870255 401 | sg6 402 | F0.5621910385637939 403 | sg7 404 | F0.020860836132324034 405 | sg8 406 | F0.03677093732496071 407 | ssS'2010' 408 | p91 409 | (dp92 410 | g5 411 | F0.13330163214667995 412 | sg6 413 | F0.6465744205644355 414 | sg7 415 | F0.08741214266153287 416 | sg8 417 | F0.09783085614946609 418 | ssS'2005' 
419 | p93 420 | (dp94 421 | g5 422 | F0.10767432608632829 423 | sg6 424 | F0.5302897684251133 425 | sg7 426 | F0.04711259753842053 427 | sg8 428 | F0.05948299736144186 429 | sssS'www.foxnews.com' 430 | p95 431 | (dp96 432 | S'2015' 433 | p97 434 | (dp98 435 | g5 436 | F0.4335891319208261 437 | sg6 438 | F0.5904350598598168 439 | sg7 440 | F0.13153289800630058 441 | sg8 442 | F0.19363369241865677 443 | ssS'2000' 444 | p99 445 | (dp100 446 | g5 447 | F0.19892154478650526 448 | sg6 449 | F0.608009455591965 450 | sg7 451 | F0.1341346899174315 452 | sg8 453 | F0.11717690601781064 454 | ssS'2010' 455 | p101 456 | (dp102 457 | g5 458 | F0.05581275411375033 459 | sg6 460 | F0.6344256490113849 461 | sg7 462 | F0.06201920153749155 463 | sg8 464 | F0.05344105431751456 465 | ssS'2005' 466 | p103 467 | (dp104 468 | g5 469 | F0.06539045075135168 470 | sg6 471 | F0.2914227331396125 472 | sg7 473 | F0.12235864682765385 474 | sg8 475 | F0.07881345001177152 476 | sssS'mean_avgs' 477 | p105 478 | (dp106 479 | S'a' 480 | p107 481 | (dp108 482 | S'2015' 483 | p109 484 | F0.41981407729470455 485 | sS'2000' 486 | p110 487 | F0.5559238594224393 488 | sS'2010' 489 | p111 490 | F0.5645208860403892 491 | sS'2005' 492 | p112 493 | F0.4879954275839581 494 | ssS'p' 495 | p113 496 | (dp114 497 | g109 498 | F0.07964855567605174 499 | sg110 500 | F0.22476359576715482 501 | sg111 502 | F0.08507094603309076 503 | sg112 504 | F0.11407854065631107 505 | ssS'r' 506 | p115 507 | (dp116 508 | g109 509 | F0.20217775071023286 510 | sg110 511 | F0.11865467203079069 512 | sg111 513 | F0.10891001415969159 514 | sg112 515 | F0.04724404357604244 516 | ssS'f1' 517 | p117 518 | (dp118 519 | g109 520 | F0.0678936739541488 521 | sg110 522 | F0.11396020483828136 523 | sg111 524 | F0.0775670609790663 525 | sg112 526 | F0.04810353763544852 527 | sss. 
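
The 'mean_avgs' block repeats in every filter's avgs.pkl, so the extractors can be lined up against each other by crawl year -- a rough sketch of that comparison (directory location assumed; presumably the kind of aggregation testing/wbce_process_results.py automates):

import os
import pickle

WBCE_DIR = 'testing/wbce-tests'   # assumed location of the unpacked results

for name in sorted(os.listdir(WBCE_DIR)):
    path = os.path.join(WBCE_DIR, name, 'avgs.pkl')
    if not os.path.isfile(path):
        continue                  # skip non-directory entries (.png files etc.)
    with open(path, 'rb') as f:
        f1_by_year = pickle.load(f)['mean_avgs']['f1']
    print('%-28s %s' % (name, ' '.join(
        '%s=%.3f' % (y, f1_by_year[y]) for y in sorted(f1_by_year))))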
-------------------------------------------------------------------------------- /testing/wbce-tests/GeneralCCB/avgs.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'www.latimes.com' 3 | p1 4 | (dp2 5 | S'2015' 6 | p3 7 | (dp4 8 | S'avg_precision' 9 | p5 10 | F0.12943625842051695 11 | sS'avg_accuracy' 12 | p6 13 | F0.5250061462335551 14 | sS'avg_recall' 15 | p7 16 | F0.7434403199000661 17 | sS'avg_f1' 18 | p8 19 | F0.21802652657797789 20 | ssS'2000' 21 | p9 22 | (dp10 23 | g5 24 | F0.8586431262759081 25 | sg6 26 | F0.8763593955864373 27 | sg7 28 | F0.9309931441328005 29 | sg8 30 | F0.8925579934720653 31 | ssS'2010' 32 | p11 33 | (dp12 34 | g5 35 | F0.46851571662688785 36 | sg6 37 | F0.7251025820794678 38 | sg7 39 | F0.782825706152114 40 | sg8 41 | F0.5670646006916412 42 | ssS'2005' 43 | p13 44 | (dp14 45 | g5 46 | F0.6305525928335406 47 | sg6 48 | F0.754325682418688 49 | sg7 50 | F0.8758294454964423 51 | sg8 52 | F0.7226073018270625 53 | sssS'www.cnn.com' 54 | p15 55 | (dp16 56 | S'2015' 57 | p17 58 | (dp18 59 | g5 60 | F0.2995482008650643 61 | sg6 62 | F0.41658660273986564 63 | sg7 64 | F0.7392606328795993 65 | sg8 66 | F0.41094501270942446 67 | ssS'2000' 68 | p19 69 | (dp20 70 | g5 71 | F0.8548475040578897 72 | sg6 73 | F0.8652617859855292 74 | sg7 75 | F0.7980020032138568 76 | sg8 77 | F0.8239078232781918 78 | ssS'2010' 79 | p21 80 | (dp22 81 | g5 82 | F0.41096334652475897 83 | sg6 84 | F0.5976865136647788 85 | sg7 86 | F0.8141348709311889 87 | sg8 88 | F0.5283105971456803 89 | ssS'2005' 90 | p23 91 | (dp24 92 | g5 93 | F0.6246431645192004 94 | sg6 95 | F0.8107314141414568 96 | sg7 97 | F0.8208739795207112 98 | sg8 99 | F0.7065673680006674 100 | sssS'www.forbes.com' 101 | p25 102 | (dp26 103 | S'2015' 104 | p27 105 | (dp28 106 | g5 107 | F0.3247401991305493 108 | sg6 109 | F0.5042208387585145 110 | sg7 111 | F0.820306864003994 112 | sg8 113 | F0.4555405234947937 114 | ssS'2000' 115 | p29 116 | (dp30 117 | g5 118 | F0.6714638467737778 119 | sg6 120 | F0.7726642440119037 121 | sg7 122 | F0.8835955863654548 123 | sg8 124 | F0.7541707874903932 125 | ssS'2010' 126 | p31 127 | (dp32 128 | g5 129 | F0.6832190050181055 130 | sg6 131 | F0.8260568755557612 132 | sg7 133 | F0.8287582660043098 134 | sg8 135 | F0.7406669303266823 136 | ssS'2005' 137 | p33 138 | (dp34 139 | g5 140 | F0.6347119061510296 141 | sg6 142 | F0.8923898495002328 143 | sg7 144 | F0.7322239123657356 145 | sg8 146 | F0.6760203440596354 147 | sssS'news.bbc.co.uk' 148 | p35 149 | (dp36 150 | S'2015' 151 | p37 152 | (dp38 153 | g5 154 | F0.13167702125754785 155 | sg6 156 | F0.5557833567425585 157 | sg7 158 | F0.704761777836753 159 | sg8 160 | F0.2177646082861531 161 | ssS'2000' 162 | p39 163 | (dp40 164 | g5 165 | F0.9404262665672861 166 | sg6 167 | F0.9031837369462709 168 | sg7 169 | F0.7934351140261374 170 | sg8 171 | F0.8604023318861276 172 | ssS'2010' 173 | p41 174 | (dp42 175 | g5 176 | F0.4218849025093671 177 | sg6 178 | F0.7356214404152991 179 | sg7 180 | F0.7315038802881014 181 | sg8 182 | F0.5224175328373507 183 | ssS'2005' 184 | p43 185 | (dp44 186 | g5 187 | F0.8903295054227871 188 | sg6 189 | F0.892340147185392 190 | sg7 191 | F0.8065457160833114 192 | sg8 193 | F0.8431309370189013 194 | sssS'thenation.com' 195 | p45 196 | (dp46 197 | S'2015' 198 | p47 199 | (dp48 200 | g5 201 | F0.49137907199335934 202 | sg6 203 | F0.6385027865047056 204 | sg7 205 | F0.8183080904152722 206 | sg8 207 | F0.599057636868512 208 | ssS'2000' 209 | p49 210 | (dp50 211 | g5 212 | 
F0.9018062702104928 213 | sg6 214 | F0.9041183484999058 215 | sg7 216 | F0.9431168589781965 217 | sg8 218 | F0.9210062931942521 219 | ssS'2010' 220 | p51 221 | (dp52 222 | g5 223 | F0.5434962294760681 224 | sg6 225 | F0.7236031960447936 226 | sg7 227 | F0.8307223799289499 228 | sg8 229 | F0.6449111544101084 230 | ssS'2005' 231 | p53 232 | (dp54 233 | g5 234 | F0.8503041167751857 235 | sg6 236 | F0.8514198954886943 237 | sg7 238 | F0.9158400762942217 239 | sg8 240 | F0.880404810182225 241 | sssS'www.nymag.com' 242 | p55 243 | (dp56 244 | S'2015' 245 | p57 246 | (dp58 247 | g5 248 | F0.04407018165599923 249 | sg6 250 | F0.19594505708206753 251 | sg7 252 | F0.6457960577024366 253 | sg8 254 | F0.06515978126985782 255 | ssS'2000' 256 | p59 257 | (dp60 258 | g5 259 | F0.87037309891489 260 | sg6 261 | F0.8989658132658929 262 | sg7 263 | F0.9125392490483983 264 | sg8 265 | F0.8779352802757441 266 | ssS'2010' 267 | p61 268 | (dp62 269 | g5 270 | F0.18612609604198724 271 | sg6 272 | F0.6158661421068967 273 | sg7 274 | F0.6813664511777056 275 | sg8 276 | F0.27378658571086034 277 | ssS'2005' 278 | p63 279 | (dp64 280 | g5 281 | F0.6187842974937978 282 | sg6 283 | F0.7369936912283517 284 | sg7 285 | F0.8899418847891526 286 | sg8 287 | F0.7276428526444784 288 | sssS'entertainment.msn.com' 289 | p65 290 | (dp66 291 | S'2015' 292 | p67 293 | (dp68 294 | g5 295 | F0.07223171179425797 296 | sg6 297 | F0.5561677382782463 298 | sg7 299 | F0.7429133405773873 300 | sg8 301 | F0.12834716752819722 302 | ssS'2000' 303 | p69 304 | (dp70 305 | g5 306 | F0.8394696101599314 307 | sg6 308 | F0.9132061617727136 309 | sg7 310 | F0.8102407264543855 311 | sg8 312 | F0.8216145335043756 313 | ssS'2010' 314 | p71 315 | (dp72 316 | g5 317 | F0.3885914367069556 318 | sg6 319 | F0.5515757099798353 320 | sg7 321 | F0.7856855775435772 322 | sg8 323 | F0.47304243382661726 324 | ssS'2005' 325 | p73 326 | (dp74 327 | g5 328 | F0.41909133109112984 329 | sg6 330 | F0.6180640361725218 331 | sg7 332 | F0.7854621047987349 333 | sg8 334 | F0.5283975229467838 335 | sssS'www.esquire.com' 336 | p75 337 | (dp76 338 | S'2015' 339 | p77 340 | (dp78 341 | g5 342 | F0.06655548974343942 343 | sg6 344 | F0.33710349763999986 345 | sg7 346 | F0.7433952164830431 347 | sg8 348 | F0.11439706807127707 349 | ssS'2000' 350 | p79 351 | (dp80 352 | g5 353 | F0.9094951163901436 354 | sg6 355 | F0.9250933190128898 356 | sg7 357 | F0.8822626050739679 358 | sg8 359 | F0.8920212123389878 360 | ssS'2010' 361 | p81 362 | (dp82 363 | g5 364 | F0.3866449923138486 365 | sg6 366 | F0.6145963072982272 367 | sg7 368 | F0.7987107075291516 369 | sg8 370 | F0.50746549586869 371 | ssS'2005' 372 | p83 373 | (dp84 374 | g5 375 | F0.9281173744128661 376 | sg6 377 | F0.9427156896491509 378 | sg7 379 | F0.9270540821599755 380 | sg8 381 | F0.9266251018372934 382 | sssS'news.yahoo.com' 383 | p85 384 | (dp86 385 | S'2015' 386 | p87 387 | (dp88 388 | g5 389 | F0.018207304858120263 390 | sg6 391 | F0.3233456930716502 392 | sg7 393 | F0.6682526808458175 394 | sg8 395 | F0.03527015079604294 396 | ssS'2000' 397 | p89 398 | (dp90 399 | g5 400 | F0.8087223234480646 401 | sg6 402 | F0.8581239695209166 403 | sg7 404 | F0.8529444309794099 405 | sg8 406 | F0.8281701076506122 407 | ssS'2010' 408 | p91 409 | (dp92 410 | g5 411 | F0.5255237109769181 412 | sg6 413 | F0.7650532501563796 414 | sg7 415 | F0.6522829330164196 416 | sg8 417 | F0.5761780236415831 418 | ssS'2005' 419 | p93 420 | (dp94 421 | g5 422 | F0.6564277365163304 423 | sg6 424 | F0.797182181393969 425 | sg7 426 | F0.845919527022068 
427 | sg8 428 | F0.7312008340147842 429 | sssS'www.foxnews.com' 430 | p95 431 | (dp96 432 | S'2015' 433 | p97 434 | (dp98 435 | g5 436 | F0.7143877357447221 437 | sg6 438 | F0.8196423442102124 439 | sg7 440 | F0.8162201865047065 441 | sg8 442 | F0.7590627532077726 443 | ssS'2000' 444 | p99 445 | (dp100 446 | g5 447 | F0.8730005208941793 448 | sg6 449 | F0.90897283141247 450 | sg7 451 | F0.8877214385967767 452 | sg8 453 | F0.8768382672661295 454 | ssS'2010' 455 | p101 456 | (dp102 457 | g5 458 | F0.43042619558685963 459 | sg6 460 | F0.739502127667384 461 | sg7 462 | F0.8004489782249443 463 | sg8 464 | F0.5452341862171841 465 | ssS'2005' 466 | p103 467 | (dp104 468 | g5 469 | F0.32624183171026927 470 | sg6 471 | F0.482067086180646 472 | sg7 473 | F0.8908788442528022 474 | sg8 475 | F0.4637437250173835 476 | sssS'mean_avgs' 477 | p105 478 | (dp106 479 | S'a' 480 | p107 481 | (dp108 482 | S'2015' 483 | p109 484 | F0.4872304061261376 485 | sS'2000' 486 | p110 487 | F0.882594960601493 488 | sS'2010' 489 | p111 490 | F0.6894664144968823 491 | sS'2005' 492 | p112 493 | F0.7778229673359103 494 | ssS'p' 495 | p113 496 | (dp114 497 | g109 498 | F0.22922331754635766 499 | sg110 500 | F0.8528247683692562 501 | sg111 502 | F0.44453916317817554 503 | sg112 504 | F0.6579203856926136 505 | ssS'r' 506 | p115 507 | (dp116 508 | g109 509 | F0.7442655167149075 510 | sg110 511 | F0.8694851156869386 512 | sg111 513 | F0.7706439750796462 514 | sg112 515 | F0.8490569572783155 516 | ssS'f1' 517 | p117 518 | (dp118 519 | g109 520 | F0.30035712288100086 521 | sg110 522 | F0.8548624630356878 523 | sg111 524 | F0.5379077540676398 525 | sg112 526 | F0.7206340797549216 527 | sss. -------------------------------------------------------------------------------- /testing/wbce-tests/KFeatureExtractorDomFilter/avgs.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'www.latimes.com' 3 | p1 4 | (dp2 5 | S'2015' 6 | p3 7 | (dp4 8 | S'avg_precision' 9 | p5 10 | F0.09226749054973236 11 | sS'avg_accuracy' 12 | p6 13 | F0.5400863683008629 14 | sS'avg_recall' 15 | p7 16 | F0.49271557997733345 17 | sS'avg_f1' 18 | p8 19 | F0.15366537139446726 20 | ssS'2000' 21 | p9 22 | (dp10 23 | g5 24 | F0.8533664269391111 25 | sg6 26 | F0.8773113672620625 27 | sg7 28 | F0.9416679931472296 29 | sg8 30 | F0.8946270365115992 31 | ssS'2010' 32 | p11 33 | (dp12 34 | g5 35 | F0.3291085018457033 36 | sg6 37 | F0.6501737141435677 38 | sg7 39 | F0.44325852903926327 40 | sg8 41 | F0.36078986633516474 42 | ssS'2005' 43 | p13 44 | (dp14 45 | g5 46 | F0.5780249782557191 47 | sg6 48 | F0.7236720984158829 49 | sg7 50 | F0.8849589977403614 51 | sg8 52 | F0.6875677388124654 53 | sssS'www.cnn.com' 54 | p15 55 | (dp16 56 | S'2015' 57 | p17 58 | (dp18 59 | g5 60 | F0.2791625929367795 61 | sg6 62 | F0.4446095296581864 63 | sg7 64 | F0.680628546988132 65 | sg8 66 | F0.38165311936483914 67 | ssS'2000' 68 | p19 69 | (dp20 70 | g5 71 | F0.7639870392099527 72 | sg6 73 | F0.829538640915483 74 | sg7 75 | F0.7652162904918535 76 | sg8 77 | F0.754270696813957 78 | ssS'2010' 79 | p21 80 | (dp22 81 | g5 82 | F0.40366920898565617 83 | sg6 84 | F0.6135031661248305 85 | sg7 86 | F0.792543349498563 87 | sg8 88 | F0.5195045155808845 89 | ssS'2005' 90 | p23 91 | (dp24 92 | g5 93 | F0.5211798048500964 94 | sg6 95 | F0.7732414080822231 96 | sg7 97 | F0.7065168763354187 98 | sg8 99 | F0.5955615845803097 100 | sssS'www.forbes.com' 101 | p25 102 | (dp26 103 | S'2015' 104 | p27 105 | (dp28 106 | g5 107 | F0.27262554887522134 108 | sg6 109 | 
F0.4775907856227633 110 | sg7 111 | F0.7169938088127147 112 | sg8 113 | F0.3840785834274435 114 | ssS'2000' 115 | p29 116 | (dp30 117 | g5 118 | F0.5711755450519884 119 | sg6 120 | F0.7030518292531293 121 | sg7 122 | F0.7813597040644285 123 | sg8 124 | F0.6458373572637298 125 | ssS'2010' 126 | p31 127 | (dp32 128 | g5 129 | F0.46850561527616924 130 | sg6 131 | F0.723034886206674 132 | sg7 133 | F0.5537004813302328 134 | sg8 135 | F0.4939337521815701 136 | ssS'2005' 137 | p33 138 | (dp34 139 | g5 140 | F0.3897890357149691 141 | sg6 142 | F0.7926106722888099 143 | sg7 144 | F0.7208642188091428 145 | sg8 146 | F0.49429358757756797 147 | sssS'news.bbc.co.uk' 148 | p35 149 | (dp36 150 | S'2015' 151 | p37 152 | (dp38 153 | g5 154 | F0.12364045968228651 155 | sg6 156 | F0.5480934489362985 157 | sg7 158 | F0.6631423828239639 159 | sg8 160 | F0.2045473546067208 161 | ssS'2000' 162 | p39 163 | (dp40 164 | g5 165 | F0.8106990701712056 166 | sg6 167 | F0.843900758702464 168 | sg7 169 | F0.6619995034569421 170 | sg8 171 | F0.7209709603723564 172 | ssS'2010' 173 | p41 174 | (dp42 175 | g5 176 | F0.4059495413065444 177 | sg6 178 | F0.7426804272541961 179 | sg7 180 | F0.658806952211745 181 | sg8 182 | F0.48996124735054936 183 | ssS'2005' 184 | p43 185 | (dp44 186 | g5 187 | F0.4781585844754011 188 | sg6 189 | F0.7688062702494711 190 | sg7 191 | F0.4126536266974072 192 | sg8 193 | F0.43383368251909227 194 | sssS'thenation.com' 195 | p45 196 | (dp46 197 | S'2015' 198 | p47 199 | (dp48 200 | g5 201 | F0.4305324047938475 202 | sg6 203 | F0.6026080164174088 204 | sg7 205 | F0.745194223169191 206 | sg8 207 | F0.5310463626653996 208 | ssS'2000' 209 | p49 210 | (dp50 211 | g5 212 | F0.88178763991206 213 | sg6 214 | F0.8808508450007183 215 | sg7 216 | F0.898260974681018 217 | sg8 218 | F0.8871873524308667 219 | ssS'2010' 220 | p51 221 | (dp52 222 | g5 223 | F0.43766238053965867 224 | sg6 225 | F0.7088362210544541 226 | sg7 227 | F0.6777025021637173 228 | sg8 229 | F0.5171200025480234 230 | ssS'2005' 231 | p53 232 | (dp54 233 | g5 234 | F0.7429835013444328 235 | sg6 236 | F0.7611563160882148 237 | sg7 238 | F0.7737949974302322 239 | sg8 240 | F0.7493614381068742 241 | sssS'www.nymag.com' 242 | p55 243 | (dp56 244 | S'2015' 245 | p57 246 | (dp58 247 | g5 248 | F0.02016864363656134 249 | sg6 250 | F0.17460642041679098 251 | sg7 252 | F0.5077769862898498 253 | sg8 254 | F0.03186479214128831 255 | ssS'2000' 256 | p59 257 | (dp60 258 | g5 259 | F0.7789060527515866 260 | sg6 261 | F0.8561571852146516 262 | sg7 263 | F0.8379665284959501 264 | sg8 265 | F0.7938558580435991 266 | ssS'2010' 267 | p61 268 | (dp62 269 | g5 270 | F0.17948716199317488 271 | sg6 272 | F0.5533369462507318 273 | sg7 274 | F0.7262789001261555 275 | sg8 276 | F0.26577966128502 277 | ssS'2005' 278 | p63 279 | (dp64 280 | g5 281 | F0.5501415443668539 282 | sg6 283 | F0.6884519254444252 284 | sg7 285 | F0.8505411811467111 286 | sg8 287 | F0.6610912568488648 288 | sssS'entertainment.msn.com' 289 | p65 290 | (dp66 291 | S'2015' 292 | p67 293 | (dp68 294 | g5 295 | F0.06978700649645515 296 | sg6 297 | F0.5595765261398606 298 | sg7 299 | F0.6991956727116136 300 | sg8 301 | F0.12362785274735372 302 | ssS'2000' 303 | p69 304 | (dp70 305 | g5 306 | F0.5947838565979695 307 | sg6 308 | F0.8528189254900252 309 | sg7 310 | F0.6085723906383511 311 | sg8 312 | F0.5936517907574929 313 | ssS'2010' 314 | p71 315 | (dp72 316 | g5 317 | F0.26444680402070014 318 | sg6 319 | F0.5726585329067403 320 | sg7 321 | F0.5749121624296706 322 | sg8 323 | F0.32973118882786884 324 | 
ssS'2005' 325 | p73 326 | (dp74 327 | g5 328 | F0.3002877518789036 329 | sg6 330 | F0.5633722482966653 331 | sg7 332 | F0.6066177067598075 333 | sg8 334 | F0.3875082921427772 335 | sssS'www.esquire.com' 336 | p75 337 | (dp76 338 | S'2015' 339 | p77 340 | (dp78 341 | g5 342 | F0.06346258311336817 343 | sg6 344 | F0.33731542718759855 345 | sg7 346 | F0.6858328422214869 347 | sg8 348 | F0.10782238335996003 349 | ssS'2000' 350 | p79 351 | (dp80 352 | g5 353 | F0.7898786198862732 354 | sg6 355 | F0.8821803354208536 356 | sg7 357 | F0.7859599903287482 358 | sg8 359 | F0.7811950374543093 360 | ssS'2010' 361 | p81 362 | (dp82 363 | g5 364 | F0.30168348382925414 365 | sg6 366 | F0.5788691101272231 367 | sg7 368 | F0.6185600017506294 369 | sg8 370 | F0.3932342256427291 371 | ssS'2005' 372 | p83 373 | (dp84 374 | g5 375 | F0.6723481555542996 376 | sg6 377 | F0.8308752572574278 378 | sg7 379 | F0.6682037922563471 380 | sg8 381 | F0.6647064111943694 382 | sssS'news.yahoo.com' 383 | p85 384 | (dp86 385 | S'2015' 386 | p87 387 | (dp88 388 | g5 389 | F0.01726963014185236 390 | sg6 391 | F0.3231211225234069 392 | sg7 393 | F0.6299381535040304 394 | sg8 395 | F0.03343206856262065 396 | ssS'2000' 397 | p89 398 | (dp90 399 | g5 400 | F0.624509586529224 401 | sg6 402 | F0.7793123337946543 403 | sg7 404 | F0.6581648562797924 405 | sg8 406 | F0.6315682303034241 407 | ssS'2010' 408 | p91 409 | (dp92 410 | g5 411 | F0.44113596091377016 412 | sg6 413 | F0.7369200359753033 414 | sg7 415 | F0.6255704851433763 416 | sg8 417 | F0.5082318305549401 418 | ssS'2005' 419 | p93 420 | (dp94 421 | g5 422 | F0.47413698373950586 423 | sg6 424 | F0.7090323392915349 425 | sg7 426 | F0.5909758822039194 427 | sg8 428 | F0.5157737245371024 429 | sssS'www.foxnews.com' 430 | p95 431 | (dp96 432 | S'2015' 433 | p97 434 | (dp98 435 | g5 436 | F0.6090391904597992 437 | sg6 438 | F0.7643059856169206 439 | sg7 440 | F0.6705224688707049 441 | sg8 442 | F0.6154112022087228 443 | ssS'2000' 444 | p99 445 | (dp100 446 | g5 447 | F0.7523577774183531 448 | sg6 449 | F0.8290007715237845 450 | sg7 451 | F0.6486163133789056 452 | sg8 453 | F0.6828751725323865 454 | ssS'2010' 455 | p101 456 | (dp102 457 | g5 458 | F0.2962800934568883 459 | sg6 460 | F0.6602084268496973 461 | sg7 462 | F0.6733754234437869 463 | sg8 464 | F0.3965989388280699 465 | ssS'2005' 466 | p103 467 | (dp104 468 | g5 469 | F0.2933510531370516 470 | sg6 471 | F0.45929734050178533 472 | sg7 473 | F0.7803326212820084 474 | sg8 475 | F0.41194662842149066 476 | sssS'mean_avgs' 477 | p105 478 | (dp106 479 | S'a' 480 | p107 481 | (dp108 482 | S'2015' 483 | p109 484 | F0.4771913630820097 485 | sS'2000' 486 | p110 487 | F0.8334122992577825 488 | sS'2010' 489 | p111 490 | F0.6540221466893419 491 | sS'2005' 492 | p112 493 | F0.707051587591644 494 | ssS'p' 495 | p113 496 | (dp114 497 | g109 498 | F0.19779555506859037 499 | sg110 500 | F0.7421451614467723 501 | sg111 502 | F0.3527928752167519 503 | sg112 504 | F0.5000401393317233 505 | ssS'r' 506 | p115 507 | (dp116 508 | g109 509 | F0.649194066536902 510 | sg110 511 | F0.7587784544963219 512 | sg111 513 | F0.634470878713714 514 | sg112 515 | F0.6995459900661355 516 | ssS'f1' 517 | p117 518 | (dp118 519 | g109 520 | F0.25671490904788163 521 | sg110 522 | F0.7386039492483721 523 | sg111 524 | F0.427488522913482 525 | sg112 526 | F0.5601644344740914 527 | sss. 
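The 'mean_avgs' block at the tail of each of these pickles is simply the per-year mean of the ten per-domain averages; wbce_process_results.py (further below) computes it with a hard-coded divisor of 10, the number of domains in the dataset. A small sketch of the same computation, assuming the layout shown above:

import pickle

with open('testing/wbce-tests/KFeatureExtractorDomFilter/avgs.pkl', 'rb') as f:
    avgs = pickle.load(f)

domains = [d for d in avgs if d != 'mean_avgs']
for year in ['2000', '2005', '2010', '2015']:
    # Mean of the per-domain average precisions; this should match
    # avgs['mean_avgs']['p'][year] in the dump above.
    mean_p = sum(avgs[d][year]['avg_precision'] for d in domains) / len(domains)
    print(year, round(mean_p, 4))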
-------------------------------------------------------------------------------- /testing/wbce-tests/LinkQuotaFilter/avgs.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'www.latimes.com' 3 | p1 4 | (dp2 5 | S'2015' 6 | p3 7 | (dp4 8 | S'avg_precision' 9 | p5 10 | F0.12303070023234008 11 | sS'avg_accuracy' 12 | p6 13 | F0.49731733391390115 14 | sS'avg_recall' 15 | p7 16 | F0.8363018514260364 17 | sS'avg_f1' 18 | p8 19 | F0.21214648094613994 20 | ssS'2000' 21 | p9 22 | (dp10 23 | g5 24 | F0.7738884403357201 25 | sg6 26 | F0.8243078336225673 27 | sg7 28 | F0.9616770892516293 29 | sg8 30 | F0.8539343487329285 31 | ssS'2010' 32 | p11 33 | (dp12 34 | g5 35 | F0.4339480725003696 36 | sg6 37 | F0.6805292249663549 38 | sg7 39 | F0.8234889977467575 40 | sg8 41 | F0.5448143734914582 42 | ssS'2005' 43 | p13 44 | (dp14 45 | g5 46 | F0.595289193234168 47 | sg6 48 | F0.73211157469712 49 | sg7 50 | F0.926982837374602 51 | sg8 52 | F0.7141617039098064 53 | sssS'www.cnn.com' 54 | p15 55 | (dp16 56 | S'2015' 57 | p17 58 | (dp18 59 | g5 60 | F0.29387452934098823 61 | sg6 62 | F0.38137601869134313 63 | sg7 64 | F0.9090368896113173 65 | sg8 66 | F0.42766745941574724 67 | ssS'2000' 68 | p19 69 | (dp20 70 | g5 71 | F0.6515399864573219 72 | sg6 73 | F0.7665877807758202 74 | sg7 75 | F0.8610065806139251 76 | sg8 77 | F0.7369041471140174 78 | ssS'2010' 79 | p21 80 | (dp22 81 | g5 82 | F0.34478378415001126 83 | sg6 84 | F0.4821234561792043 85 | sg7 86 | F0.9124192798041184 87 | sg8 88 | F0.4817308275501408 89 | ssS'2005' 90 | p23 91 | (dp24 92 | g5 93 | F0.5193456459563168 94 | sg6 95 | F0.7400141801716724 96 | sg7 97 | F0.8787630467616998 98 | sg8 99 | F0.6472382253552182 100 | sssS'www.forbes.com' 101 | p25 102 | (dp26 103 | S'2015' 104 | p27 105 | (dp28 106 | g5 107 | F0.3002309261235297 108 | sg6 109 | F0.44533555639431355 110 | sg7 111 | F0.9259418835579247 112 | sg8 113 | F0.4442087433698053 114 | ssS'2000' 115 | p29 116 | (dp30 117 | g5 118 | F0.5907915319816335 119 | sg6 120 | F0.7004519883849166 121 | sg7 122 | F0.9569877558161108 123 | sg8 124 | F0.715007022035535 125 | ssS'2010' 126 | p31 127 | (dp32 128 | g5 129 | F0.6597499715250379 130 | sg6 131 | F0.8179022411090748 132 | sg7 133 | F0.9081805079127265 134 | sg8 135 | F0.7511869220110685 136 | ssS'2005' 137 | p33 138 | (dp34 139 | g5 140 | F0.440919472040343 141 | sg6 142 | F0.7891939036625564 143 | sg7 144 | F0.8677296847728847 145 | sg8 146 | F0.5711205787631477 147 | sssS'news.bbc.co.uk' 148 | p35 149 | (dp36 150 | S'2015' 151 | p37 152 | (dp38 153 | g5 154 | F0.12744554000908173 155 | sg6 156 | F0.5299132037919041 157 | sg7 158 | F0.7749891758625078 159 | sg8 160 | F0.21520295709525783 161 | ssS'2000' 162 | p39 163 | (dp40 164 | g5 165 | F0.6426368875256027 166 | sg6 167 | F0.7863704679790642 168 | sg7 169 | F0.9046663139938275 170 | sg8 171 | F0.7495691360016425 172 | ssS'2010' 173 | p41 174 | (dp42 175 | g5 176 | F0.35022956666455196 177 | sg6 178 | F0.6639060967300228 179 | sg7 180 | F0.7951861566721486 181 | sg8 182 | F0.4745243800900827 183 | ssS'2005' 184 | p43 185 | (dp44 186 | g5 187 | F0.6212194798442074 188 | sg6 189 | F0.7768581531864079 190 | sg7 191 | F0.8970103620206412 192 | sg8 193 | F0.7285640169643994 194 | sssS'thenation.com' 195 | p45 196 | (dp46 197 | S'2015' 198 | p47 199 | (dp48 200 | g5 201 | F0.49478343796012586 202 | sg6 203 | F0.6422546386665137 204 | sg7 205 | F0.8769511793171386 206 | sg8 207 | F0.6175495352612643 208 | ssS'2000' 209 | p49 210 | (dp50 211 | g5 212 | 
F0.8369926820075588 213 | sg6 214 | F0.8725200402982584 215 | sg7 216 | F0.975507518244809 217 | sg8 218 | F0.8960384921166267 219 | ssS'2010' 220 | p51 221 | (dp52 222 | g5 223 | F0.46639164402721606 224 | sg6 225 | F0.6508022357756286 226 | sg7 227 | F0.8763171934479236 228 | sg8 229 | F0.5971361623341747 230 | ssS'2005' 231 | p53 232 | (dp54 233 | g5 234 | F0.7599538973666159 235 | sg6 236 | F0.7903147665367217 237 | sg7 238 | F0.951097137299713 239 | sg8 240 | F0.8424088799110683 241 | sssS'www.nymag.com' 242 | p55 243 | (dp56 244 | S'2015' 245 | p57 246 | (dp58 247 | g5 248 | F0.043716349444529276 249 | sg6 250 | F0.15976086204751105 251 | sg7 252 | F0.843350719376327 253 | sg8 254 | F0.0657848659689782 255 | ssS'2000' 256 | p59 257 | (dp60 258 | g5 259 | F0.7817656558124935 260 | sg6 261 | F0.8627749298583269 262 | sg7 263 | F0.9522504645498109 264 | sg8 265 | F0.8464093248211401 266 | ssS'2010' 267 | p61 268 | (dp62 269 | g5 270 | F0.18114137356170518 271 | sg6 272 | F0.5819261966067185 273 | sg7 274 | F0.7618851218542366 275 | sg8 276 | F0.27459414205008575 277 | ssS'2005' 278 | p63 279 | (dp64 280 | g5 281 | F0.5302978779309909 282 | sg6 283 | F0.655032948427116 284 | sg7 285 | F0.9589397589187656 286 | sg8 287 | F0.6787363214808984 288 | sssS'entertainment.msn.com' 289 | p65 290 | (dp66 291 | S'2015' 292 | p67 293 | (dp68 294 | g5 295 | F0.06974145722281835 296 | sg6 297 | F0.5220714244049803 298 | sg7 299 | F0.8638358295555182 300 | sg8 301 | F0.126001528790156 302 | ssS'2000' 303 | p69 304 | (dp70 305 | g5 306 | F0.7026935512166758 307 | sg6 308 | F0.8796427642687705 309 | sg7 310 | F0.8979205404512224 311 | sg8 312 | F0.7824729937475147 313 | ssS'2010' 314 | p71 315 | (dp72 316 | g5 317 | F0.3236012443738165 318 | sg6 319 | F0.49366662128870575 320 | sg7 321 | F0.9110053487704931 322 | sg8 323 | F0.45074001527117197 324 | ssS'2005' 325 | p73 326 | (dp74 327 | g5 328 | F0.408552851894306 329 | sg6 330 | F0.5913513896912642 331 | sg7 332 | F0.930053316072358 333 | sg8 334 | F0.5481423181024188 335 | sssS'www.esquire.com' 336 | p75 337 | (dp76 338 | S'2015' 339 | p77 340 | (dp78 341 | g5 342 | F0.06628174494498319 343 | sg6 344 | F0.32438439332486807 345 | sg7 346 | F0.80883013763079 347 | sg8 348 | F0.1148912478341382 349 | ssS'2000' 350 | p79 351 | (dp80 352 | g5 353 | F0.7644592804937886 354 | sg6 355 | F0.8698902559214398 356 | sg7 357 | F0.9427083371386117 358 | sg8 359 | F0.8350633681668603 360 | ssS'2010' 361 | p81 362 | (dp82 363 | g5 364 | F0.37115173971617904 365 | sg6 366 | F0.5824151114439085 367 | sg7 368 | F0.8518870679096348 369 | sg8 370 | F0.502124747508677 371 | ssS'2005' 372 | p83 373 | (dp84 374 | g5 375 | F0.8564583004210408 376 | sg6 377 | F0.9232450816606103 378 | sg7 379 | F0.969495650826658 380 | sg8 381 | F0.9084750963867535 382 | sssS'news.yahoo.com' 383 | p85 384 | (dp86 385 | S'2015' 386 | p87 387 | (dp88 388 | g5 389 | F0.018395332008463647 390 | sg6 391 | F0.31968477145704893 392 | sg7 393 | F0.6933790172641622 394 | sg8 395 | F0.035668622667856247 396 | ssS'2000' 397 | p89 398 | (dp90 399 | g5 400 | F0.582312988541893 401 | sg6 402 | F0.7071219099382866 403 | sg7 404 | F0.9608475314387523 405 | sg8 406 | F0.7168375560870826 407 | ssS'2010' 408 | p91 409 | (dp92 410 | g5 411 | F0.42375700219940554 412 | sg6 413 | F0.6742704693513056 414 | sg7 415 | F0.837016040001205 416 | sg8 417 | F0.5528091185506862 418 | ssS'2005' 419 | p93 420 | (dp94 421 | g5 422 | F0.5907695245619571 423 | sg6 424 | F0.7382566122084887 425 | sg7 426 | F0.8945900464090906 427 
| sg8 428 | F0.6711867932682472 429 | sssS'www.foxnews.com' 430 | p95 431 | (dp96 432 | S'2015' 433 | p97 434 | (dp98 435 | g5 436 | F0.5532927097447694 437 | sg6 438 | F0.7020692702417015 439 | sg7 440 | F0.9143186917632302 441 | sg8 442 | F0.6762055195961915 443 | ssS'2000' 444 | p99 445 | (dp100 446 | g5 447 | F0.784355741068858 448 | sg6 449 | F0.8797888516574993 450 | sg7 451 | F0.9676234794985138 452 | sg8 453 | F0.8540105353052375 454 | ssS'2010' 455 | p101 456 | (dp102 457 | g5 458 | F0.3128787204252841 459 | sg6 460 | F0.5659934692578981 461 | sg7 462 | F0.8668401155675087 463 | sg8 464 | F0.44095175226152655 465 | ssS'2005' 466 | p103 467 | (dp104 468 | g5 469 | F0.3340192053349481 470 | sg6 471 | F0.48636273920144313 472 | sg7 473 | F0.9643433000452133 474 | sg8 475 | F0.48247801977803434 476 | sssS'mean_avgs' 477 | p105 478 | (dp106 479 | S'a' 480 | p107 481 | (dp108 482 | S'2015' 483 | p109 484 | F0.4524167472934086 485 | sS'2000' 486 | p110 487 | F0.8149456822704948 488 | sS'2010' 489 | p111 490 | F0.6193535122708822 491 | sS'2005' 492 | p112 493 | F0.7222741349443401 494 | ssS'p' 495 | p113 496 | (dp114 497 | g109 498 | F0.20907927270316295 499 | sg110 500 | F0.7111436745441546 501 | sg111 502 | F0.38676331191435775 503 | sg112 504 | F0.5656825448584895 505 | ssS'r' 506 | p115 507 | (dp116 508 | g109 509 | F0.8446935375364951 510 | sg110 511 | F0.9381195610997212 512 | sg111 513 | F0.8544225829686752 514 | sg112 515 | F0.9239005140501625 516 | ssS'f1' 517 | p117 518 | (dp118 519 | g109 520 | F0.29353269609455346 521 | sg110 522 | F0.7986246924128585 523 | sg111 524 | F0.5070612441119073 525 | sg112 526 | F0.6792511953919993 527 | sss. -------------------------------------------------------------------------------- /testing/wbce-tests/eatiht_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/testing/wbce-tests/eatiht_results.png -------------------------------------------------------------------------------- /testing/wbce-tests/wbce_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/testing/wbce-tests/wbce_results.png -------------------------------------------------------------------------------- /testing/wbce-tests/wbce_results2of3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/testing/wbce-tests/wbce_results2of3.png -------------------------------------------------------------------------------- /testing/wbce-tests/wbce_results3of3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodricios/crawl-to-the-future/aebed4d2ac836a116b40bde75a473726362d95db/testing/wbce-tests/wbce_results3of3.png -------------------------------------------------------------------------------- /testing/wbce_process_results.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import re 3 | import os 4 | import collections 5 | import lxml.html 6 | 7 | 8 | def histsum(hist): return sum(hist.values()) 9 | 10 | 11 | def calc_praf(goldstd,predicted,universe): 12 | 13 | TP = predicted & goldstd 14 | 15 | FP = predicted - goldstd 16 | 17 | FN = 
(universe - predicted) & goldstd 18 | 19 | TN = (universe - predicted) & (universe - goldstd) 20 | 21 | precision = (histsum(TP)*1.0)/histsum(TP+FP) if histsum(TP+FP) else 0.0 # guard against an empty prediction set 22 | 23 | recall = (histsum(TP)*1.0)/histsum(TP+FN) if histsum(TP+FN) else 0.0 # guard against an empty gold standard 24 | 25 | accuracy = (histsum(TP+TN)*1.0)/histsum(TP+FP+FN+TN) 26 | try: 27 | f1 = 2 * (((precision*recall)*1.0)/(precision+recall)) 28 | except ZeroDivisionError: 29 | f1 = 0 30 | #print(precision,recall, accuracy) 31 | #raise 32 | return {'p':precision,'r':recall,'a':accuracy,'f1':f1} 33 | 34 | 35 | def listsubdir(directory): # note: 'directory' is a list of directories 36 | return [os.path.join(d, f) for d in directory for f in os.listdir(d) 37 | if os.path.isdir(os.path.join(d, f))] 38 | 39 | #get every non-html word on page (i.e. "//text()") as f-distribution 40 | def tokens_to_hist_from_universe(data_filepath): 41 | 42 | with open(data_filepath,'r') as data_file: 43 | 44 | try: 45 | parsed_goldhtml = lxml.html.parse(data_file, 46 | lxml.html.HTMLParser(encoding="utf-8")) 47 | except Exception: 48 | print(str(data_file.read())) 49 | raise 50 | tokenized_content = collections.Counter() 51 | 52 | content = "".join(parsed_goldhtml.xpath('//text()')) 53 | 54 | tokenized_content.update(re.split(r'\W+', content)) 55 | 56 | return tokenized_content 57 | 58 | 59 | #get every word from gold std. as f-distribution 60 | def tokens_to_hist_goldstd(test_filepath): 61 | 62 | if not os.path.exists(test_filepath): 63 | test_filepath += '.txt' 64 | 65 | with open(os.path.abspath(test_filepath), 'r') as f: 66 | 67 | #print("test_filepath",os.path.abspath(test_filepath)) 68 | content = str(f.read()) 69 | 70 | tokenized_content = collections.Counter() 71 | 72 | tokenized_content.update(re.split(r'\W+', content)) 73 | 74 | return tokenized_content 75 | 76 | 77 | #get every extracted (predicted) word as f-distribution 78 | def tokens_to_hist_extractor(extract, data_filepath): 79 | print(data_filepath) 80 | 81 | if os.path.exists(data_filepath): 82 | 83 | try: 84 | content = extract(data_filepath).read() 85 | except IOError: 86 | 87 | print(data_filepath) 88 | raise 89 | #raise Exception("\"exception\" method needs to be implemented") 90 | else: 91 | data_filepath += '.txt' 92 | try: 93 | content = extract(data_filepath).read() 94 | except IOError: 95 | 96 | print(data_filepath) 97 | raise 98 | tokenized_content = collections.Counter() 99 | tokenized_content.update(re.split(r'\W+', content)) 100 | 101 | return tokenized_content 102 | 103 | 104 | def take_measurements(prep_data): 105 | 106 | for key, val in prep_data.items(): # items() works on both Python 2 and 3 107 | #print('take_measurements:',val['testpath']) 108 | content = tokens_to_hist_extractor(lambda x: open(x), os.path.abspath(val['testpath'])) 109 | 110 | universe = tokens_to_hist_from_universe(os.path.abspath(val['universepath'])) 111 | 112 | goldstd = tokens_to_hist_goldstd(os.path.abspath(val['goldpath'])) 113 | 114 | val['measurements'] = calc_praf(goldstd,content,universe) 115 | 116 | return prep_data 117 | 118 | def prepare_for_measurements(gold_directory, 119 | test_directory=None, 120 | goldfile_ext='.txt', 121 | testfile_ext='.html', 122 | universe_ext='.html', 123 | pickle_output_name = None): 124 | '''Given a directory, process the (gold standard) files in the following 125 | steps: 126 | 1. Build a dictionary mapping each file name to its gold-standard and test file paths 127 | 128 | 2. For each file name, get associated "universe" values (all possible outcomes W) 129 | 130 | 3. For each file name, get goldstd histogram (G) 131 | 132 | 4. 
For each file name, get extractor's output (C) 133 | ''' 134 | #Step 1 135 | filenames = set([re.sub(("\\"+ goldfile_ext+"|"+"\\"+testfile_ext+"|"+"\\"+universe_ext),"",name) 136 | for name in os.listdir(gold_directory)]) 137 | 138 | #print("prep_data_for_measurements",filenames[0]) 139 | 140 | if test_directory: 141 | to_test_pkg = { name: 142 | { 143 | 'domain': os.path.split(os.path.split(gold_directory)[0])[1], 144 | 'year': os.path.split(gold_directory)[1], 145 | 'goldpath':os.path.join(gold_directory, name+goldfile_ext), 146 | 'universepath':os.path.join(gold_directory, name+universe_ext), 147 | 'testpath':os.path.join(test_directory, name+testfile_ext) 148 | } for name in filenames } 149 | 150 | else: 151 | to_test_pkg = { name: 152 | { 153 | 'domain': os.path.split(os.path.split(gold_directory)[0])[1], 154 | 'year': os.path.split(gold_directory)[1], 155 | 'goldpath':os.path.join(gold_directory, name+goldfile_ext), 156 | 'universepath':os.path.join(gold_directory, name+universe_ext), 157 | 'testpath':os.path.join(gold_directory, name+testfile_ext) # no test_directory given; fall back to the gold directory 158 | } for name in filenames } 159 | #print(filenames) 160 | 161 | return to_test_pkg 162 | 163 | 164 | 165 | wcbe_path = 'c:/crawlToTheFuture/crawl-to-the-future/testing/wbce-tests' 166 | test_dir = 'c:/crawlToTheFuture/crawl-to-the-future/dataset/' 167 | 168 | goldset_folders = listsubdir(listsubdir([test_dir])) 169 | wcbe_subdirs = listsubdir([wcbe_path]) 170 | 171 | for wcbe_test in wcbe_subdirs: 172 | 173 | testset_folders = listsubdir(listsubdir([wcbe_test])) 174 | 175 | linkquotatest = [] 176 | 177 | for i in range(len(testset_folders)): 178 | 179 | prep_data = prepare_for_measurements(goldset_folders[i], testset_folders[i], testfile_ext='.txt') 180 | prep_data = take_measurements(prep_data) 181 | linkquotatest.append(prep_data) 182 | 183 | 184 | pickle.dump(linkquotatest,open(os.path.join(wcbe_test,'results.pkl'),'wb')) 185 | 186 | packages = [os.path.join(f,filename) 187 | for f in listsubdir(['c:/crawlToTheFuture/crawl-to-the-future/testing/wbce-tests']) 188 | for filename in os.listdir(f) 189 | if filename == "results.pkl"] 190 | 191 | def trim_results(domain_path): 192 | domain_results = pickle.load(open(domain_path,'rb')) 193 | part = {str(val['domain']+';'+val['year']):[] for domainyear in domain_results for key,val in domainyear.items()} 194 | 195 | for domainyear in domain_results: 196 | for key,val in domainyear.items(): 197 | part[str(val['domain']+";"+val['year'])].append(val['measurements']) 198 | 199 | domain_path = os.path.join(*(os.path.split(domain_path)[:-1])) 200 | 201 | trimmed_path = os.path.join(domain_path,'trimmed.pkl') 202 | #print(trimmed_path) 203 | pickle.dump(part,open(trimmed_path,'wb')) 204 | 205 | return part 206 | 207 | trimmed_results = [] 208 | for pack in packages: 209 | trimmed_results.append([os.path.join(*(os.path.split(pack)[:-1])),trim_results(pack)]) 210 | 211 | 212 | extractor_avgs = {} 213 | for basepath,trimmed in trimmed_results: 214 | print(basepath) 215 | extractor = os.path.split(basepath)[-1] 216 | 217 | avg_results = {} 218 | for key,val in trimmed.items(): 219 | domain, year = key.split(";") 220 | 221 | if domain not in avg_results: 222 | avg_results[domain] = {year:{'avg_precision':sum([d['p'] for d in val])/len(val), 223 | 'avg_recall':sum([d['r'] for d in val])/len(val), 224 | 'avg_f1':sum([d['f1'] for d in val])/len(val), 225 | 'avg_accuracy':sum([d['a'] for d in val])/len(val)}} 226 | else: 227 | avg_results[domain][year] = {'avg_precision':sum([d['p'] for d in 
val])/len(val), 228 | 'avg_recall':sum([d['r'] for d in val])/len(val), 229 | 'avg_f1':sum([d['f1'] for d in val])/len(val), 230 | 'avg_accuracy':sum([d['a'] for d in val])/len(val)} 231 | 232 | 233 | #print("precision avg", sum([d['p'] for d in val])/len(val)) 234 | 235 | 236 | mean_avgs = {'a':{},'p':{},'r':{},'f1':{}} 237 | 238 | for year in ['2000','2005','2010','2015']: # average each metric over the 10 dataset domains 239 | 240 | 241 | mean_avgs['a'].update({year:sum([val[year]['avg_accuracy'] for key,val in avg_results.items()])/10}) 242 | mean_avgs['p'].update({year:sum([val[year]['avg_precision'] for key,val in avg_results.items()])/10}) 243 | mean_avgs['r'].update({year:sum([val[year]['avg_recall'] for key,val in avg_results.items()])/10}) 244 | mean_avgs['f1'].update({year:sum([val[year]['avg_f1'] for key,val in avg_results.items()])/10}) 245 | 246 | 247 | 248 | extractor_avgs[extractor] = avg_results 249 | extractor_avgs[extractor]['mean_avgs'] = mean_avgs 250 | 251 | pickle.dump(avg_results,open(os.path.join(basepath,'avgs.pkl'),'wb')) 252 | 253 | pickle.dump(extractor_avgs,open('c:/crawlToTheFuture/crawl-to-the-future/testing/wbce-tests/extractor_avgs.pkl','wb')) -------------------------------------------------------------------------------- /timelines/1.1.2015-1.7.2015: -------------------------------------------------------------------------------- 1 | This week’s project timeline: 2 | 3 | 1/1/2015 - have crawled sites from years 2000-2005 in the Way Back and Google search engines 4 | 5 | 1/2/2015 - 1/3/2015 - have decided on a Way Back vs. Google approach, or a hybrid 6 | 7 | 1/3/2015 - 1/4/2015 - brief write-up explaining the methods used, thoughts on the process 8 | CRITICAL: if too difficult, clumsy, unrealistic, etc., pivot to an anecdotal piece 9 | 10 | 1/5/2015 - begin to specify the “gold-standard” - (“silver”?) - a la CleanEval 11 | 12 | 1/6/2015 - begin to specify the project’s approach to blind crawling for the years 2000, 2005, 2010, 2015 13 | specify a specific date for each crawl? e.g. Jan. 1st, 2000 14 | crawling engine (most likely going straight to scrapy, for time’s sake) 15 | 16 | 1/7/2015 - brief write-up laying out an explanation of the crawling process. 17 | --------------------------------------------------------------------------------
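A short sketch of how the combined extractor_avgs.pkl written by wbce_process_results.py (above) can be compared across extractors; the per-year mean F1 printed here is presumably the kind of comparison behind the wbce_results*.png charts, and the layout is assumed from the script:

import pickle

with open('testing/wbce-tests/extractor_avgs.pkl', 'rb') as f:
    extractor_avgs = pickle.load(f)

years = ['2000', '2005', '2010', '2015']
print('%-32s %s' % ('extractor', '   '.join(years)))
for name, results in sorted(extractor_avgs.items()):
    f1 = results['mean_avgs']['f1']   # per-year mean F1 across the ten domains
    print('%-32s %s' % (name, '  '.join('%.3f' % f1[y] for y in years)))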