├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── c_bg_sites └── tor-browser-crawler-webfp-paper ├── .travis.yml ├── LICENSE ├── README.md ├── __init__.py ├── common.py ├── common.py~ ├── datacollection ├── __init__.py ├── crawler.py ├── dumputils.py ├── torutils.py ├── torutils_pj.py └── visit.py ├── etc ├── barebones.html ├── localized-urls-100-top.csv ├── urls-100-10-mon.csv ├── urls-100-google.csv └── urls-100-top.csv ├── geckodriver.log ├── log.py ├── main.py ├── requirements.txt ├── setup.py ├── start.sh ├── test ├── common_test.py ├── crawler_test.py ├── dumputils_test.py ├── env_test.py ├── setup_test.py ├── torutils_test.py ├── utils_test.py └── visit_test.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | tbb/ 3 | results/ 4 | .project 5 | .pydevproject 6 | __pycache__ 7 | tor-browser-crawler-webfp-paper/test/files/ 8 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Pankaj Bhambhani 4 | 5 | #Ref1 - https://blog.scrapinghub.com/2016/09/08/how-to-deploy-custom-docker-images-for-your-web-crawlers/ 6 | #Ref2 - https://github.com/webfp/tor-browser-crawler 7 | 8 | #Install the necessary background tools 9 | RUN apt-get update -qq && \ 10 | apt-get install -qy htop iputils-ping lsof ltrace strace telnet vim xorg libxext-dev libxrender-dev libxtst-dev libgtk2.0-dev zenity 11 | 12 | #Install the tools needed to run the code 13 | RUN apt-get install -qy python python-pip python-dev tcpdump wireshark Xvfb phantomjs ethtool 14 | 15 | #Remove the apt lists 16 | #RUN rm -rf /var/lib/apt/lists/* 17 | 18 | #RUN ifconfig eth0 mtu 1500 - Needs privileges, so moved to runtime 19 | 20 | #RUN ethtool -K eth0 tx off rx off tso off gso off gro off lro off - Needs privileges, so moved to runtime 21 | 22 | COPY 
./tor-browser-crawler-webfp-paper /tor 23 | 24 | ENV PATH $PATH:/tor/tbb/tor-browser-linux64-6.5.2_en-US/Browser/ 25 | 26 | RUN pip install pip --upgrade 27 | 28 | #Install the required python packages 29 | RUN pip install --no-cache-dir -r /tor/requirements.txt 30 | 31 | ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/tor/tbb/tor-browser-linux64-6.5.2_en-US/Browser/TorBrowser/Tor/ 32 | 33 | WORKDIR /tor 34 | #Run the start.sh command 35 | CMD [ "bash", "./start.sh" ] 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. 
Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 
55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. 
You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. 
(This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. 
These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. 
If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 
305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 
340 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # webfp-crawler-phantomjs 2 | 3 | **Update - the repo has been modified since the first release to facilitate my work. If you want the phantomjs related code, check the release v1.0 [here](https://github.com/pankajb64/webfp-crawler-phantomjs/releases/tag/release-v1.0)** 4 | 5 | A python crawler for crawling Tor and collecting network traces using wireshark. Used to create datasets for testing Website Fingerprinting (WF) attacks on Tor. 6 | 7 | Ref - M. Juarez, S. Afroz, G. Acar, C. Diaz, R. Greenstadt, “A Critical Analysis of Website Fingerprinting Attacks”, in the proceedings of the ACM Conference on Computer and Communications Security (CCS), pp. 263-274, ACM, 2014. http://www1.icsi.berkeley.edu/~sadia/papers/ccs-webfp-final.pdf 8 | 9 | Source taken from the release version of https://github.com/webfp/tor-browser-crawler (see https://github.com/webfp/tor-browser-crawler/releases/tag/webfp-paper) 10 | 11 | **Adapted to use phantomjs instead of firefox, so it can run on a headless linux server.** 12 | 13 | It is best to run the code in a container like docker. A Dockerfile is provided, and a docker image is available at https://hub.docker.com/r/pankajb64/webfp-crawler-phantomjs/ **The docker image needs to be run in privileged mode as it modifies local network interface settings.** 14 | 15 | Source code is available under the same license (GPL2) as it was originally published. 
16 | 17 | **Tested on Ubuntu 14.04 and Python 2.7** 18 | 19 | The contents below are taken from the original README at https://github.com/webfp/tor-browser-crawler/blob/master/README.md 20 | 21 | Requirements 22 | --------------- 23 | * Linux packages: ```python tcpdump wireshark Xvfb``` 24 | * Python packages: ```selenium requests stem psutil(version < 3) tld xvfbwrapper scapy``` 25 | 26 | # Getting started 27 | 28 | ### 1. Configure the environment 29 | 30 | * We recommend running crawls in a VM or a container (e.g. LXC) to avoid perturbations introduced by the background network traffic and system level network settings. Please note that the crawler will not only store the Tor traffic but will capture all the network traffic generated during a visit to a website. That’s why it’s extremely important to disable all the automatic/background network traffic such as the auto-updates. See, for example, the [instructions for disabling automatic connections for Ubuntu](https://help.ubuntu.com/community/AutomaticConnections). 31 | 32 | * You’ll need to set capture capabilities to your user: `sudo setcap 'CAP_NET_RAW+eip CAP_NET_ADMIN+eip' /usr/bin/dumpcap` 33 | 34 | * [Download the TBB](https://www.torproject.org/download/download.html.en) and extract it to `./tbb/tor-browser-linux<arch>-<version>_<locale>/`. 35 | 36 | * You might want to change the MTU of your network interface and disable NIC offloads that might make the traffic collected by tcpdump look different from how it would have been seen on the wire. 37 | 38 | * Change MTU to standard ethernet MTU (1500 bytes): `sudo ifconfig <interface> mtu 1500` 39 | 40 | * Disable offloads: `sudo ethtool -K <interface> tx off rx off tso off gso off gro off lro off` 41 | 42 | * See the [Wireshark Offloading page](https://wiki.wireshark.org/CaptureSetup/Offloading) for more info. 43 | 44 | 45 | 46 | ### 2. 
Run a crawl with the defaults 47 | 48 | ``` 49 | python main.py -u ./etc/localized-urls-100-top.csv -e wang_and_goldberg 50 | ``` 51 | 52 | To get all the available command line parameters and the usage run: 53 | 54 | ``` 55 | python main.py --help 56 | ``` 57 | 58 | ### 3. Check out the results 59 | 60 | The collected data can be found in the `results` folder: 61 | 62 | * Pcaps: `./results/latest` 63 | * Logs: `./results/latest_crawl_log` 64 | 65 | 66 | Sample crawl data 67 | ------------- 68 | You can download a sample of data collected using this crawler with the configuration used by Wang and Goldberg in their WPES'13 paper (namely 10 batches, 100 pages and 4 instances per page) from here: 69 | 70 | * [Crawl `140203_042843`](https://mega.co.nz/#!ekIXBTbZ!1bn7zSPuV5r8fS0zpp2hrMvNc4Xrj6F2oUbjlyBb87o) 71 | (SHA256: 06a007a41ca83bd24ad3f7e9f5e8f881bd81111a547cbfcf20f057be1b89d0dd) 72 | 73 | The crawl names include a timestamp. The list of crawls used in our study can be found in the appendix of the paper [1]. 74 | 75 | 76 | Notes 77 | ------- 78 | * Tested on *Xubuntu 14.04* and *Debian 7.8*. 79 | 80 | 81 | References 82 | ------------- 83 | 84 | [1] M. Juarez, S. Afroz, G. Acar, C. Diaz, R. Greenstadt, “A Critical Analysis of Website Fingerprinting Attacks”, in the proceedings of the ACM Conference on Computer and Communications Security (CCS), pp. 263-274, ACM, 2014. 85 | 86 | [2] T. Wang and I. Goldberg. “Improved Website Fingerprinting on Tor”, in the proceedings of the ACM Workshop on Privacy in the Electronic Society (WPES), pp. 201–212. ACM, 2013. 
87 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: python 3 | python: 4 | - "2.7" 5 | before_install: 6 | - "export DISPLAY=:99.0" 7 | - "sh -e /etc/init.d/xvfb start" 8 | install: 9 | - sudo apt-get -qq install xvfb tcpdump wireshark 10 | - pip install -r requirements.txt 11 | - python setup.py 12 | before_script: 13 | - cd test 14 | script: py.test -k 'not dumputils_test.py' 15 | 16 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. 
Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 
55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. 
You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. 
(This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. 
These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. 
If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 
305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 
340 | 341 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/README.md: -------------------------------------------------------------------------------- 1 | tor-browser-crawler [![Build Status](https://travis-ci.org/webfp/tor-browser-crawler.svg)](https://travis-ci.org/webfp/tor-browser-crawler) 2 | =============== 3 | This repository contains the source code for the data collection part of our ACM CCS’14 paper [“A Critical Analysis of Website Fingerprinting Attacks”](http://homes.esat.kuleuven.be/~mjuarezm/index_files/pdf/ccs14.pdf) [1]. 4 | 5 | The crawler can be used in similar website fingerprinting studies. It uses [Selenium](https://selenium-python.readthedocs.org/) to drive the **Tor Browser** and [stem](https://stem.torproject.org/) to control the Tor process. Our implementation started as a fork of [tor-browser-selenium](https://github.com/isislovecruft/tor-browser-selenium) (by @isislovecruft). 6 | 7 | For the crawl parameters such as `batch` and `instance`, refer to the ACM WPES’13 paper by Wang and Goldberg [2]. 8 | 9 | Requirements 10 | --------------- 11 | * Linux packages: ```python tcpdump wireshark Xvfb``` 12 | * Python packages: ```selenium requests stem psutil tld xvfbwrapper``` 13 | 14 | # Getting started 15 | 16 | ### 1. Configure the environment 17 | 18 | * We recommend running crawls in a VM or a container (e.g. LXC) to avoid perturbations introduced by the background network traffic and system-level network settings. Please note that the crawler will not only store the Tor traffic but will capture all the network traffic generated during a visit to a website. That’s why it’s extremely important to disable all the automatic/background network traffic such as the auto-updates. See, for example, the [instructions for disabling automatic connections for Ubuntu](https://help.ubuntu.com/community/AutomaticConnections).
19 | 20 | * You’ll need to set capture capabilities for your user: `sudo setcap 'CAP_NET_RAW+eip CAP_NET_ADMIN+eip' /usr/bin/dumpcap` 21 | 22 | * [Download the TBB](https://www.torproject.org/download/download.html.en) and extract it to `./tbb/tor-browser-linux<arch>-<version>_<locale>/`. 23 | 24 | * You might want to change the MTU of your network interface and disable NIC offloads that might make the traffic collected by tcpdump look different from how it would have been seen on the wire. 25 | 26 | * Change MTU to standard ethernet MTU (1500 bytes): `sudo ifconfig <interface> mtu 1500` 27 | 28 | * Disable offloads: `sudo ethtool -K <interface> tx off rx off tso off gso off` 29 | 30 | * See the [Wireshark Offloading page](https://wiki.wireshark.org/CaptureSetup/Offloading) for more info. 31 | 32 | 33 | 34 | ### 2. Run a crawl with the defaults 35 | 36 | ``` 37 | python main.py -u ./etc/localized-urls-100-top.csv -e wang_and_goldberg 38 | ``` 39 | 40 | To list all the available command-line parameters and their usage, run: 41 | 42 | ``` 43 | python main.py --help 44 | ``` 45 | 46 | ### 3. Check out the results 47 | 48 | The collected data can be found in the `results` folder: 49 | 50 | * Pcaps: `./results/latest` 51 | * Logs: `./results/latest_crawl_log` 52 | 53 | 54 | Sample crawl data 55 | ------------- 56 | You can download a sample of data collected using this crawler with the configuration used by Wang and Goldberg in their WPES'13 paper (namely 10 batches, 100 pages and 4 instances per page) from here: 57 | 58 | * [Crawl `140203_042843`](https://mega.co.nz/#!ekIXBTbZ!1bn7zSPuV5r8fS0zpp2hrMvNc4Xrj6F2oUbjlyBb87o) 59 | (SHA256: 06a007a41ca83bd24ad3f7e9f5e8f881bd81111a547cbfcf20f057be1b89d0dd) 60 | 61 | The crawl names include a timestamp. The list of crawls used in our study can be found in the appendix of the paper [1]. 62 | 63 | 64 | Notes 65 | ------- 66 | * Tested on *Xubuntu 14.04* and *Debian 7.8*. 67 | 68 | 69 | References 70 | ------------- 71 | 72 | [1] M. Juarez, S. Afroz, G. Acar, C. Diaz, R. 
Greenstadt, “A Critical Analysis of Website Fingerprinting Attacks”, in the proceedings of the ACM Conference on Computer and Communications Security (CCS), pp. 263-274, ACM, 2014. 73 | 74 | [2] T. Wang and I. Goldberg. “Improved Website Fingerprinting on Tor”, in the proceedings of the ACM Workshop on Privacy in the Electronic Society (WPES), pp. 201–212. ACM, 2013. 75 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pankajb64/webfp-crawler-phantomjs/51e195ce73c5c8bf620941c85f4fed0b92e98f98/tor-browser-crawler-webfp-paper/__init__.py -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | 4 | 5 | class TBBTarballVerificationError(Exception): 6 | pass 7 | 8 | 9 | class TBBSigningKeyImportError(Exception): 10 | pass 11 | 12 | 13 | class TBBGetRecommendedVersionError(Exception): 14 | pass 15 | 16 | 17 | class DumpcapTimeoutError(Exception): 18 | pass 19 | 20 | 21 | env_vars = os.environ 22 | # whether we're running on Travis CI or not 23 | running_in_CI = "CONTINUOUS_INTEGRATION" in env_vars and "TRAVIS" in env_vars 24 | 25 | architecture = platform.architecture() 26 | if '64' in architecture[0]: 27 | arch = '64' 28 | machine = 'x86_64' 29 | elif '32' in architecture[0]: 30 | arch = '32' 31 | machine = 'i686' 32 | else: 33 | raise RuntimeError('Architecture is not known: %s' % architecture) 34 | 35 | # shortcuts 36 | path = os.path 37 | join = path.join 38 | dirname = os.path.dirname 39 | expanduser = os.path.expanduser 40 | 41 | # timeouts and pauses 42 | PAUSE_BETWEEN_SITES = 5 # pause before crawling a new site 43 | WAIT_IN_SITE = 5 # time to wait after the page loads 44 | 
PAUSE_BETWEEN_INSTANCES = 4 # pause before visiting the same site (instances) 45 | SOFT_VISIT_TIMEOUT = 120 # timeout used by selenium and dumpcap 46 | # signal based hard timeout in case soft timeout fails 47 | HARD_VISIT_TIMEOUT = SOFT_VISIT_TIMEOUT + 10 48 | # max dumpcap size in KB 49 | MAX_DUMP_SIZE = 30000 50 | # max filename length 51 | MAX_FNAME_LENGTH = 200 52 | 53 | DISABLE_RANDOMIZEDPIPELINENING = False # use with caution! 54 | STREAM_CLOSE_TIMEOUT = 20 # wait 20 seconds before raising an alarm signal 55 | # otherwise we had many cases where get_streams hanged 56 | 57 | XVFB_W = 1280 58 | XVFB_H = 720 59 | 60 | # Tor browser version suffixes 61 | # The version used by Wang & Goldberg 62 | TBB_V_2_4_7_A1 = "2.4.7-alpha-1" 63 | TBB_WANG_AND_GOLDBERG = TBB_V_2_4_7_A1 64 | 65 | TBB_V_3_5 = "3.5" 66 | TBB_V_4_0_8 = "4.0.8" 67 | TBB_V_6_5_2 = "6.5.2" 68 | TBB_DEFAULT_VERSION = TBB_V_6_5_2 69 | 70 | TBB_KNOWN_VERSIONS = [TBB_V_2_4_7_A1, TBB_V_3_5, TBB_V_4_0_8, TBB_V_6_5_2] 71 | 72 | # Default paths 73 | BASE_DIR = path.abspath(os.path.dirname(__file__)) 74 | DATASET_DIR = join(BASE_DIR, "datasets") 75 | ALEXA_DIR = join(DATASET_DIR, "alexa") 76 | TEST_DIR = join(BASE_DIR, 'test') 77 | TEST_FILES_DIR = join(TEST_DIR, 'files') 78 | DUMMY_TEST_DIR = join(TEST_FILES_DIR, 'dummy') 79 | DUMMY_TEST_DIR_TARGZIPPED = DUMMY_TEST_DIR + ".tar.gz" 80 | TBB_TEST_TARBALL = join(TEST_FILES_DIR, 81 | 'tor-browser-linux64-4.0.99_en-US.tar.xz') 82 | TBB_TEST_TARBALL_EXTRACTED = join(TEST_FILES_DIR, 83 | 'tor-browser-linux64-4.0.99_en-US') 84 | RESULTS_DIR = join(BASE_DIR, 'results') 85 | ETC_DIR = join(BASE_DIR, 'etc') 86 | PERMISSIONS_DB = join(ETC_DIR, 'permissions.sqlite') 87 | HOME_PATH = expanduser('~') 88 | TBB_BASE_DIR = join(BASE_DIR, 'tbb') 89 | 90 | # Top URLs localized (DE) to prevent the effect of localization 91 | LOCALIZED_DATASET = join(ETC_DIR, "localized-urls-100-top.csv") 92 | 93 | # Experiment type determines what to do during the visits 94 | 
EXP_TYPE_WANG_AND_GOLDBERG = "wang_and_goldberg" # setting from WPES'13 paper 95 | EXP_TYPE_MULTITAB_ALEXA = "multitab_alexa" # open Alexa sites in multiple tabs 96 | 97 | # Tor ports 98 | SOCKS_PORT = 9050 99 | CONTROLLER_PORT = 9051 100 | MAX_ENTRY_GUARDS = "1" 101 | 102 | # defaults for batch and instance numbers 103 | NUM_BATCHES = 2 104 | NUM_INSTANCES = 4 105 | MAX_SITES_PER_TOR_PROCESS = 100 # reset tor process after crawling 100 sites 106 | 107 | # torrc dictionaries 108 | TORRC_DEFAULT = {'SocksPort': str(SOCKS_PORT), 109 | 'ControlPort': str(CONTROLLER_PORT)} 110 | 111 | TORRC_WANG_AND_GOLDBERG = {'SocksPort': str(SOCKS_PORT), 112 | 'ControlPort': str(CONTROLLER_PORT), 113 | 'MaxCircuitDirtiness': '600000', 114 | 'UseEntryGuards': '0' 115 | } 116 | 117 | # Directory structure and paths depend on TBB versions 118 | # Path to Firefox binary in TBB dir 119 | TBB_V2_FF_BIN_PATH = join('App', 'Firefox', 'firefox') 120 | TBB_V3_FF_BIN_PATH = join('Browser', 'firefox') 121 | TBB_V4_FF_BIN_PATH = join('Browser', 'firefox') 122 | TBB_V6_FF_BIN_PATH = TBB_V4_FF_BIN_PATH 123 | 124 | TBB_FF_BIN_PATH_DICT = {"2": TBB_V2_FF_BIN_PATH, 125 | "3": TBB_V3_FF_BIN_PATH, 126 | "4": TBB_V4_FF_BIN_PATH, 127 | "6": TBB_V6_FF_BIN_PATH, 128 | } 129 | 130 | # Path to Firefox profile in TBB dir 131 | TBB_V2_PROFILE_PATH = join('Data', 'profile') 132 | TBB_V3_PROFILE_PATH = join('Data', 'Browser', 'profile.default') 133 | TBB_V4_PROFILE_PATH = join('Browser', 'TorBrowser', 'Data', 134 | 'Browser', 'profile.default') 135 | TBB_V6_PROFILE_PATH = TBB_V4_PROFILE_PATH 136 | 137 | TBB_PROFILE_DIR_DICT = {"2": TBB_V2_PROFILE_PATH, 138 | "3": TBB_V3_PROFILE_PATH, 139 | "4": TBB_V4_PROFILE_PATH, 140 | "6": TBB_V6_PROFILE_PATH, 141 | } 142 | 143 | # Path to Tor binary in TBB dir 144 | TOR_V2_BINARY_PATH = join('App', 'tor') 145 | TOR_V3_BINARY_PATH = join('Tor', 'tor') 146 | TOR_V4_BINARY_PATH = join('Browser', 'TorBrowser', 'Tor', 'tor') 147 | TOR_V6_BINARY_PATH = TOR_V4_BINARY_PATH 148 | 149 
| TOR_BINARY_PATH_DICT = {"2": TOR_V2_BINARY_PATH, 150 | "3": TOR_V3_BINARY_PATH, 151 | "4": TOR_V4_BINARY_PATH, 152 | "6": TOR_V6_BINARY_PATH, 153 | } 154 | # Path to Tor data directory in TBB dir 155 | TOR_V2_DATA_DIR = join('Data', 'Tor') 156 | TOR_V3_DATA_DIR = join('Data', 'Tor') 157 | TOR_V4_DATA_DIR = join('Browser', 'TorBrowser', 'Data', 'Tor') 158 | TOR_V6_DATA_DIR = join('Browser', 'TorBrowser', 'Data', 'Tor') 159 | 160 | TOR_DATA_DIR_DICT = {"2": TOR_V2_DATA_DIR, 161 | "3": TOR_V3_DATA_DIR, 162 | "4": TOR_V4_DATA_DIR, 163 | "6": TOR_V6_DATA_DIR, 164 | } 165 | 166 | 167 | def get_tbb_major_version(version): 168 | """Return major version of TBB.""" 169 | return version.split(".")[0] 170 | 171 | 172 | def get_tbb_dirname(version, os_name="linux", lang="en-US"): 173 | """Return directory name of Tor Browser Bundle for given version and bits.""" 174 | return "tor-browser-%s%s-%s_%s" % (os_name, arch, version, lang) 175 | 176 | 177 | def get_tbb_path(version, os_name="linux", lang="en-US"): 178 | """Return path for Tor Browser Bundle for given version and bits.""" 179 | dirname = get_tbb_dirname(version, os_name, lang) 180 | return join(TBB_BASE_DIR, dirname) 181 | 182 | 183 | def get_tb_bin_path(version, os_name="linux", lang="en-US"): 184 | """Return a binary path for Tor Browser.""" 185 | major = get_tbb_major_version(version) 186 | # bin_path = TBB_V3_FF_BIN_PATH if major is "3" else TBB_V2_FF_BIN_PATH 187 | bin_path = TBB_FF_BIN_PATH_DICT[major] 188 | dir_path = get_tbb_path(version, os_name, lang) 189 | return join(dir_path, bin_path) 190 | 191 | 192 | def get_tor_bin_path(version, os_name="linux", lang="en-US"): 193 | """Return a binary path for Tor.""" 194 | major = get_tbb_major_version(version) 195 | bin_path = TOR_BINARY_PATH_DICT[major] 196 | dir_path = get_tbb_path(version, os_name, lang) 197 | return join(dir_path, bin_path) 198 | 199 | 200 | def get_tbb_profile_path(version, os_name="linux", lang="en-US"): 201 | """Return profile path for Tor Browser Bundle.""" 
202 | major = get_tbb_major_version(version) 203 | profile = TBB_PROFILE_DIR_DICT[major] 204 | dir_path = get_tbb_path(version, os_name, lang) 205 | return join(dir_path, profile) 206 | 207 | 208 | def get_tor_data_path(version, os_name="linux", lang="en-US"): 209 | """Return the path for Data dir of Tor.""" 210 | major = get_tbb_major_version(version) 211 | data_path = TOR_DATA_DIR_DICT[major] 212 | tbb_path = get_tbb_path(version, os_name, lang) 213 | return join(tbb_path, data_path) 214 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/common.py~: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | 4 | 5 | class TBBTarballVerificationError(Exception): 6 | pass 7 | 8 | 9 | class TBBSigningKeyImportError(Exception): 10 | pass 11 | 12 | 13 | class TBBGetRecommendedVersionError(Exception): 14 | pass 15 | 16 | 17 | class DumpcapTimeoutError(Exception): 18 | pass 19 | 20 | 21 | env_vars = os.environ 22 | # whether we're running on Travis CI or not 23 | running_in_CI = "CONTINUOUS_INTEGRATION" in env_vars and "TRAVIS" in env_vars 24 | 25 | architecture = platform.architecture() 26 | if '64' in architecture[0]: 27 | arch = '64' 28 | machine = 'x86_64' 29 | elif '32' in architecture[0]: 30 | arch = '32' 31 | machine = 'i686' 32 | else: 33 | raise RuntimeError('Architecture is not known: %s' % architecture) 34 | 35 | # shortcuts 36 | path = os.path 37 | join = path.join 38 | dirname = os.path.dirname 39 | expanduser = os.path.expanduser 40 | 41 | # timeouts and pauses 42 | PAUSE_BETWEEN_SITES = 5 # pause before crawling a new site 43 | WAIT_IN_SITE = 5 # time to wait after the page loads 44 | PAUSE_BETWEEN_INSTANCES = 4 # pause before visiting the same site (instances) 45 | SOFT_VISIT_TIMEOUT = 120 # timeout used by selenium and dumpcap 46 | # signal based hard timeout in case soft timeout fails 47 | HARD_VISIT_TIMEOUT = SOFT_VISIT_TIMEOUT + 
10 48 | # max dumpcap size in KB 49 | MAX_DUMP_SIZE = 30000 50 | # max filename length 51 | MAX_FNAME_LENGTH = 200 52 | 53 | DISABLE_RANDOMIZEDPIPELINENING = False # use with caution! 54 | STREAM_CLOSE_TIMEOUT = 20 # wait 20 seconds before raising an alarm signal 55 | # otherwise we had many cases where get_streams hanged 56 | 57 | XVFB_W = 1280 58 | XVFB_H = 720 59 | 60 | # Tor browser version suffixes 61 | # The version used by Wang & Goldberg 62 | TBB_V_2_4_7_A1 = "2.4.7-alpha-1" 63 | TBB_WANG_AND_GOLDBERG = TBB_V_2_4_7_A1 64 | 65 | TBB_V_3_5 = "3.5" 66 | TBB_V_4_0_8 = "4.0.8" 67 | TBB_V_6_5_2 = "6.5.2" 68 | TBB_DEFAULT_VERSION = TBB_V_6_5_2 69 | 70 | TBB_KNOWN_VERSIONS = [TBB_V_2_4_7_A1, TBB_V_3_5, TBB_V_4_0_8, TBB_V_6_5_2] 71 | 72 | # Default paths 73 | BASE_DIR = path.abspath(os.path.dirname(__file__)) 74 | DATASET_DIR = join(BASE_DIR, "datasets") 75 | ALEXA_DIR = join(DATASET_DIR, "alexa") 76 | TEST_DIR = join(BASE_DIR, 'test') 77 | TEST_FILES_DIR = join(TEST_DIR, 'files') 78 | DUMMY_TEST_DIR = join(TEST_FILES_DIR, 'dummy') 79 | DUMMY_TEST_DIR_TARGZIPPED = DUMMY_TEST_DIR + ".tar.gz" 80 | TBB_TEST_TARBALL = join(TEST_FILES_DIR, 81 | 'tor-browser-linux64-4.0.99_en-US.tar.xz') 82 | TBB_TEST_TARBALL_EXTRACTED = join(TEST_FILES_DIR, 83 | 'tor-browser-linux64-4.0.99_en-US') 84 | RESULTS_DIR = join(BASE_DIR, 'results') 85 | ETC_DIR = join(BASE_DIR, 'etc') 86 | PERMISSIONS_DB = join(ETC_DIR, 'permissions.sqlite') 87 | HOME_PATH = expanduser('~') 88 | TBB_BASE_DIR = join(BASE_DIR, 'tbb') 89 | 90 | # Top URLs localized (DE) to prevent the effect of localization 91 | LOCALIZED_DATASET = join(ETC_DIR, "localized-urls-100-top.csv") 92 | 93 | # Experiment type determines what to do during the visits 94 | EXP_TYPE_WANG_AND_GOLDBERG = "wang_and_goldberg" # setting from WPES'13 paper 95 | EXP_TYPE_MULTITAB_ALEXA = "multitab_alexa" # open Alexa sites in multiple tabs 96 | 97 | # Tor ports 98 | SOCKS_PORT = 9050 99 | CONTROLLER_PORT = 9051 100 | MAX_ENTRY_GUARDS = "1" 101 | 
102 | # defaults for batch and instance numbers 103 | NUM_BATCHES = 10 104 | NUM_INSTANCES = 4 105 | MAX_SITES_PER_TOR_PROCESS = 100 # reset tor process after crawling 100 sites 106 | 107 | # torrc dictionaries 108 | TORRC_DEFAULT = {'SocksPort': str(SOCKS_PORT), 109 | 'ControlPort': str(CONTROLLER_PORT)} 110 | 111 | TORRC_WANG_AND_GOLDBERG = {'SocksPort': str(SOCKS_PORT), 112 | 'ControlPort': str(CONTROLLER_PORT), 113 | 'MaxCircuitDirtiness': '600000', 114 | 'UseEntryGuards': '0' 115 | } 116 | 117 | # Directory structure and paths depend on TBB versions 118 | # Path to Firefox binary in TBB dir 119 | TBB_V2_FF_BIN_PATH = join('App', 'Firefox', 'firefox') 120 | TBB_V3_FF_BIN_PATH = join('Browser', 'firefox') 121 | TBB_V4_FF_BIN_PATH = join('Browser', 'firefox') 122 | 123 | TBB_FF_BIN_PATH_DICT = {"2": TBB_V2_FF_BIN_PATH, 124 | "3": TBB_V3_FF_BIN_PATH, 125 | "4": TBB_V4_FF_BIN_PATH, 126 | } 127 | 128 | # Path to Firefox profile in TBB dir 129 | TBB_V2_PROFILE_PATH = join('Data', 'profile') 130 | TBB_V3_PROFILE_PATH = join('Data', 'Browser', 'profile.default') 131 | TBB_V4_PROFILE_PATH = join('Browser', 'TorBrowser', 'Data', 132 | 'Browser', 'profile.default') 133 | TBB_V6_PROFILE_PATH = TBB_V4_PROFILE_PATH 134 | 135 | TBB_PROFILE_DIR_DICT = {"2": TBB_V2_PROFILE_PATH, 136 | "3": TBB_V3_PROFILE_PATH, 137 | "4": TBB_V4_PROFILE_PATH, 138 | "6": TBB_V6_PROFILE_PATH, 139 | } 140 | 141 | # Path to Tor binary in TBB dir 142 | TOR_V2_BINARY_PATH = join('App', 'tor') 143 | TOR_V3_BINARY_PATH = join('Tor', 'tor') 144 | TOR_V4_BINARY_PATH = join('Browser', 'TorBrowser', 'Tor', 'tor') 145 | 146 | TOR_BINARY_PATH_DICT = {"2": TOR_V2_BINARY_PATH, 147 | "3": TOR_V3_BINARY_PATH, 148 | "4": TOR_V4_BINARY_PATH, 149 | } 150 | # Path to Tor binary in TBB dir 151 | TOR_V2_DATA_DIR = join('Data', 'Tor') 152 | TOR_V3_DATA_DIR = join('Data', 'Tor') 153 | TOR_V4_DATA_DIR = join('Browser', 'TorBrowser', 'Data', 'Tor') 154 | TOR_V6_DATA_DIR = join('Browser', 'TorBrowser', 'Data', 'Tor') 155 | 
# Tor data directory (relative to the TBB dir), keyed by TBB major version.
TOR_DATA_DIR_DICT = {"2": TOR_V2_DATA_DIR,
                     "3": TOR_V3_DATA_DIR,
                     "4": TOR_V4_DATA_DIR,
                     "6": TOR_V6_DATA_DIR,
                     }


def get_tbb_major_version(version):
    """Return major version of TBB, i.e. the text before the first dot."""
    return version.split(".")[0]


def get_tbb_dirname(version, os_name="linux", lang="en-US"):
    """Return the directory name of the Tor Browser Bundle for a version.

    The name has the form ``tor-browser-<os_name><arch>-<version>_<lang>``.
    """
    # NOTE(review): `arch` is not a parameter of this function; it is
    # presumably a module-level value defined earlier in this file.
    return "tor-browser-%s%s-%s_%s" % (os_name, arch, version, lang)


def get_tbb_path(version, os_name="linux", lang="en-US"):
    """Return path for Tor Browser Bundle for given version and bits."""
    dirname = get_tbb_dirname(version, os_name, lang)
    return join(TBB_BASE_DIR, dirname)


def get_tb_bin_path(version, os_name="linux", lang="en-US"):
    """Return a binary path for Tor Browser.

    Raises KeyError if the major version has no entry in
    TBB_FF_BIN_PATH_DICT.
    """
    major = get_tbb_major_version(version)
    bin_path = TBB_FF_BIN_PATH_DICT[major]
    dir_path = get_tbb_path(version, os_name, lang)
    return join(dir_path, bin_path)


def get_tor_bin_path(version, os_name="linux", lang="en-US"):
    """Return a binary path for Tor.

    Raises KeyError if the major version has no entry in
    TOR_BINARY_PATH_DICT.
    """
    major = get_tbb_major_version(version)
    bin_path = TOR_BINARY_PATH_DICT[major]
    dir_path = get_tbb_path(version, os_name, lang)
    return join(dir_path, bin_path)


def get_tbb_profile_path(version, os_name="linux", lang="en-US"):
    """Return profile path for Tor Browser Bundle.

    Raises KeyError if the major version has no entry in
    TBB_PROFILE_DIR_DICT.
    """
    major = get_tbb_major_version(version)
    profile = TBB_PROFILE_DIR_DICT[major]
    dir_path = get_tbb_path(version, os_name, lang)
    return join(dir_path, profile)


def get_tor_data_path(version, os_name="linux", lang="en-US"):
    """Return the path for Data dir of Tor.

    Raises KeyError if the major version has no entry in
    TOR_DATA_DIR_DICT.
    """
    major = get_tbb_major_version(version)
    data_path = TOR_DATA_DIR_DICT[major]
    tbb_path = get_tbb_path(version, os_name, lang)
    return join(tbb_path, data_path)

# --------------------------------------------------------------------------------
# /tor-browser-crawler-webfp-paper/datacollection/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/pankajb64/webfp-crawler-phantomjs/51e195ce73c5c8bf620941c85f4fed0b92e98f98/tor-browser-crawler-webfp-paper/datacollection/__init__.py
# --------------------------------------------------------------------------------
# /tor-browser-crawler-webfp-paper/datacollection/crawler.py:
# --------------------------------------------------------------------------------
from log import wl_log, add_log_file_handler, add_symlink
from random import choice
from selenium.common.exceptions import TimeoutException
from torutils import TorController
from visit import Visit
import common as cm
import os
import time
import utils as ut


class Crawler(object):
    """Provides methods to collect traffic traces."""

    def __init__(self, torrc_dict, url_list, tbb_version,
                 experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False,
                 capture_screen=True):
        """Set up crawl directories, logging and the Tor controller.

        torrc_dict     -- Tor configuration passed on to TorController
        url_list       -- URLs to visit in every batch
        tbb_version    -- TBB version string passed to TorController/Visit
        experiment     -- experiment type (one of the cm.EXP_TYPE_* values)
        xvfb           -- passed on to every Visit
        capture_screen -- passed on to every Visit
        """
        # Create instance of Tor controller and sniffer used for the crawler
        self.crawl_dir = None
        self.crawl_logs_dir = None
        self.visit = None
        self.urls = url_list  # keep list of urls we'll visit
        # initializes crawl_dir, crawl_logs_dir and log_file
        self.init_crawl_dirs()
        self.tor_log = os.path.join(self.crawl_logs_dir, "tor.log")
        linkname = os.path.join(cm.RESULTS_DIR, 'latest_tor_log')
        add_symlink(linkname, self.tor_log)
        self.tbb_version = tbb_version
        self.experiment = experiment
        self.tor_controller = TorController(torrc_dict, tbb_version,
                                            self.tor_log)
        self.tor_process = None
        self.tb_driver = None
        self.capture_screen = capture_screen
        self.xvfb = xvfb
        add_log_file_handler(wl_log, self.log_file)
        linkname = os.path.join(cm.RESULTS_DIR, 'latest_crawl_log')
        add_symlink(linkname, self.log_file)  # add a symbolic link

    def crawl(self, num_batches=cm.NUM_BATCHES,
              num_instances=cm.NUM_INSTANCES, start_line=0):
        """Visit every URL `num_instances` times in each of `num_batches`.

        Tor is restarted at the start of every batch and again whenever
        cm.MAX_SITES_PER_TOR_PROCESS sites have been crawled with the same
        process. `start_line` is only the number given to the first site
        of a batch (used for naming); it does not skip any URL here.

        Timeouts and other exceptions raised by a visit are logged and the
        visit is cleaned up; KeyboardInterrupt is propagated to the caller.
        """
        wl_log.info("Crawl configuration: batches: %s, instances: %s,"
                    " tbb_version: %s, experiment: %s, no of URLs: %s, "
                    "crawl dir: %s, XVFB: %s, screenshot: %s"
                    % (num_batches, num_instances, self.tbb_version,
                       self.experiment, len(self.urls), self.crawl_dir,
                       self.xvfb, self.capture_screen))
        # for each batch
        for batch_num in xrange(num_batches):
            wl_log.info("********** Starting batch %s **********" % batch_num)
            site_num = start_line
            bg_site = None
            batch_dir = ut.create_dir(os.path.join(self.crawl_dir,
                                                   str(batch_num)))
            # init/reset tor process to have a different circuit.
            # make sure that we're not using the same guard node again
            wl_log.info("********** Restarting Tor Before Batch **********")
            self.tor_controller.restart_tor()
            sites_crawled_with_same_proc = 0

            # for each site
            for page_url in self.urls:
                sites_crawled_with_same_proc += 1
                if sites_crawled_with_same_proc > cm.MAX_SITES_PER_TOR_PROCESS:
                    wl_log.info("********** Restarting Tor Process **********")
                    self.tor_controller.restart_tor()
                    # The current site is crawled with the new process, so
                    # it is that process's first site. Resetting to 0 here
                    # would let every later process serve one site too many.
                    sites_crawled_with_same_proc = 1

                wl_log.info("********** Crawling %s **********" % page_url)
                # NOTE(review): the truncated URL is used both for the
                # directory name and as the URL that is actually visited.
                page_url = page_url[:cm.MAX_FNAME_LENGTH]
                site_dir = ut.create_dir(os.path.join(
                    batch_dir, ut.get_filename_from_url(page_url, site_num)))

                if self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
                    bg_site = choice(self.urls)
                # for each visit
                for instance_num in range(num_instances):
                    wl_log.info("********** Visit #%s to %s **********" %
                                (instance_num, page_url))
                    # cleared first so a failing Visit() constructor does
                    # not make us clean up the previous visit again
                    self.visit = None
                    try:
                        self.visit = Visit(batch_num, site_num,
                                           instance_num, page_url,
                                           site_dir, self.tbb_version,
                                           self.tor_controller, bg_site,
                                           self.experiment, self.xvfb,
                                           self.capture_screen)

                        self.visit.get()
                    except KeyboardInterrupt:  # CTRL + C
                        # re-raise the original exception, keeping its
                        # traceback, so the caller can stop the crawl
                        raise
                    except (ut.TimeExceededError, TimeoutException) as exc:
                        wl_log.critical("Visit to %s timed out! %s %s" %
                                        (page_url, exc, type(exc)))
                        if self.visit:
                            self.visit.cleanup_visit()
                    except Exception:
                        wl_log.critical("Exception crawling %s" % page_url,
                                        exc_info=True)
                        if self.visit:
                            self.visit.cleanup_visit()
                    # END - for each visit
                site_num += 1
                time.sleep(cm.PAUSE_BETWEEN_SITES)

    def init_crawl_dirs(self):
        """Creates results and logs directories for this crawl."""
        self.crawl_dir, self.crawl_logs_dir = self.create_crawl_dir()
        sym_link = os.path.join(cm.RESULTS_DIR, 'latest')
        add_symlink(sym_link, self.crawl_dir)  # add a symbolic link
        # Create crawl log
        self.log_file = os.path.join(self.crawl_logs_dir, "crawl.log")

    def init_logger(self):
        """Configure logging for crawler."""
        add_log_file_handler(wl_log, self.log_file)

    def stop_crawl(self, pack_results=True):
        """Clean up the crawl and kill the tor process in case it's running.

        When `pack_results` is true the crawl directory is handed to
        ut.pack_crawl_data afterwards.
        """
        wl_log.info("Stopping crawl...")
        if self.visit:
            self.visit.cleanup_visit()
        self.tor_controller.kill_tor_proc()
        if pack_results:
            ut.pack_crawl_data(self.crawl_dir)

    def create_crawl_dir(self):
        """Create a timestamped crawl directory with a `logs` subdirectory.

        Returns a (crawl_dir, crawl_logs_dir) tuple.
        """
        ut.create_dir(cm.RESULTS_DIR)  # ensure that we've a results dir
        crawl_dir_wo_ts = os.path.join(cm.RESULTS_DIR, 'crawl')
        crawl_dir = ut.create_dir(ut.append_timestamp(crawl_dir_wo_ts))
        crawl_logs_dir = os.path.join(crawl_dir, 'logs')
        ut.create_dir(crawl_logs_dir)
        return crawl_dir, crawl_logs_dir

# --------------------------------------------------------------------------------
/tor-browser-crawler-webfp-paper/datacollection/dumputils.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from log import wl_log 3 | import os 4 | import common as cm 5 | import utils as ut 6 | import time 7 | 8 | DUMPCAP_START_TIMEOUT = 10.0 9 | 10 | 11 | class Sniffer(object): 12 | """Capture network traffic using dumpcap.""" 13 | 14 | def __init__(self): 15 | self.pcap_file = '/dev/null' # uggh, make sure we set a path 16 | self.pcap_filter = '' 17 | self.p0 = None 18 | self.is_recording = False 19 | 20 | def set_pcap_path(self, pcap_filename): 21 | """Set filename and filter options for capture.""" 22 | self.pcap_file = pcap_filename 23 | 24 | def set_capture_filter(self, _filter): 25 | self.pcap_filter = _filter 26 | 27 | def get_pcap_path(self): 28 | """Return capture (pcap) filename.""" 29 | return self.pcap_file 30 | 31 | def get_capture_filter(self): 32 | """Return capture filter.""" 33 | return self.pcap_filter 34 | 35 | def start_capture(self, pcap_path=None, pcap_filter=""): 36 | """Start capture. 
Configure sniffer if arguments are given.""" 37 | if cm.running_in_CI: 38 | wl_log.debug("CI run: will not run dumpcap") 39 | return False 40 | if pcap_filter: 41 | self.set_capture_filter(pcap_filter) 42 | 43 | if pcap_path: 44 | self.set_pcap_path(pcap_path) 45 | prefix = "" 46 | if cm.running_in_CI: 47 | prefix = "sudo " # run as sudo in Travis CI since we cannot setcap 48 | command = '{}dumpcap -a duration:{} -a filesize:{} -i any -s 0 -f \'{}\' -w {}'\ 49 | .format(prefix, cm.SOFT_VISIT_TIMEOUT, cm.MAX_DUMP_SIZE, 50 | self.pcap_filter, self.pcap_file) 51 | wl_log.info(command) 52 | self.p0 = subprocess.Popen(command, stdout=subprocess.PIPE, 53 | stderr=subprocess.PIPE, shell=True) 54 | timeout = DUMPCAP_START_TIMEOUT # in seconds 55 | while timeout > 0 and not self.is_dumpcap_running(): 56 | time.sleep(0.1) 57 | timeout -= 0.1 58 | if timeout < 0: 59 | raise cm.DumpcapTimeoutError() 60 | else: 61 | wl_log.debug("dumpcap started in %s seconds" % 62 | (DUMPCAP_START_TIMEOUT - timeout)) 63 | 64 | self.is_recording = True 65 | 66 | def is_dumpcap_running(self): 67 | for proc in ut.gen_all_children_procs(self.p0.pid): 68 | if "dumpcap" in proc.cmdline(): 69 | return True 70 | return False 71 | 72 | def stop_capture(self): 73 | """Kill the dumpcap process.""" 74 | ut.kill_all_children(self.p0.pid) # self.p0.pid is the shell pid 75 | self.p0.kill() 76 | self.is_recording = False 77 | if os.path.isfile(self.pcap_file): 78 | wl_log.info('Dumpcap killed. 
Capture size: %s Bytes %s' % 79 | (os.path.getsize(self.pcap_file), self.pcap_file)) 80 | else: 81 | wl_log.warning('Dumpcap killed but cannot find capture file: %s' 82 | % self.pcap_file) 83 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/datacollection/torutils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from selenium import webdriver 3 | from selenium.common.exceptions import WebDriverException 4 | from selenium.webdriver import DesiredCapabilities 5 | from selenium.webdriver import firefox 6 | from selenium.webdriver.firefox.firefox_binary import FirefoxBinary 7 | import shutil 8 | import socket 9 | from stem.control import Controller 10 | import stem.process 11 | from stem.util import term 12 | import sqlite3 13 | import sys 14 | from httplib import CannotSendRequest 15 | from tld import get_tld 16 | import common as cm 17 | from log import wl_log 18 | from utils import clone_dir_with_timestap 19 | import utils as ut 20 | 21 | 22 | class TorController(object): 23 | def __init__(self, torrc_dict, tbb_version, tor_log='/dev/null'): 24 | self.torrc_dict = torrc_dict 25 | self.controller = None 26 | self.tbb_version = tbb_version 27 | self.tmp_tor_data_dir = None 28 | self.tor_process = None 29 | self.log_file = tor_log 30 | 31 | def tor_log_handler(self, line): 32 | wl_log.info(term.format(line)) 33 | 34 | def restart_tor(self): 35 | """Kill current Tor process and run a new one.""" 36 | self.kill_tor_proc() 37 | self.launch_tor_service(self.log_file) 38 | 39 | def kill_tor_proc(self): 40 | """Kill Tor process.""" 41 | if self.tor_process: 42 | wl_log.info("Killing tor process") 43 | self.tor_process.kill() 44 | if self.tmp_tor_data_dir and os.path.isdir(self.tmp_tor_data_dir): 45 | wl_log.info("Removing tmp tor data dir") 46 | shutil.rmtree(self.tmp_tor_data_dir) 47 | 48 | def launch_tor_service(self, logfile='/dev/null'): 49 | """Launch 
Tor service and return the process.""" 50 | self.log_file = logfile 51 | self.tmp_tor_data_dir = ut.clone_dir_with_timestap( 52 | cm.get_tor_data_path(self.tbb_version)) 53 | 54 | self.torrc_dict.update({'DataDirectory': self.tmp_tor_data_dir, 55 | 'Log': ['INFO file %s' % logfile]}) 56 | 57 | wl_log.debug("Tor config: %s" % self.torrc_dict) 58 | try: 59 | self.tor_process = stem.process.launch_tor_with_config( 60 | config=self.torrc_dict, 61 | init_msg_handler=self.tor_log_handler, 62 | tor_cmd=cm.get_tor_bin_path(self.tbb_version), 63 | timeout=270000 64 | ) 65 | self.controller = Controller.from_port() 66 | self.controller.authenticate() 67 | return self.tor_process 68 | 69 | except stem.SocketError as exc: 70 | wl_log.critical("Unable to connect to tor on port %s: %s" % 71 | (cm.SOCKS_PORT, exc)) 72 | sys.exit(1) 73 | except: 74 | # most of the time this is due to another instance of 75 | # tor running on the system 76 | wl_log.critical("Error launching Tor", exc_info=True) 77 | sys.exit(1) 78 | 79 | wl_log.info("Tor running at port {0} & controller port {1}." 
80 | .format(cm.SOCKS_PORT, cm.CONTROLLER_PORT)) 81 | return self.tor_process 82 | 83 | def close_all_streams(self): 84 | """Close all streams of a controller.""" 85 | wl_log.debug("Closing all streams") 86 | try: 87 | ut.timeout(cm.STREAM_CLOSE_TIMEOUT) 88 | for stream in self.controller.get_streams(): 89 | wl_log.debug("Closing stream %s %s %s " % 90 | (stream.id, stream.purpose, 91 | stream.target_address)) 92 | self.controller.close_stream(stream.id) # MISC reason 93 | except ut.TimeExceededError: 94 | wl_log.critical("Closing streams timed out!") 95 | except: 96 | wl_log.debug("Exception closing stream") 97 | finally: 98 | ut.cancel_timeout() 99 | 100 | 101 | class TorBrowserDriver(webdriver.Firefox, firefox.webdriver.RemoteWebDriver): 102 | def __init__(self, tbb_binary_path=None, tbb_profile_dir=None, 103 | tbb_logfile_path=None, 104 | tbb_version=cm.TBB_DEFAULT_VERSION, page_url="", 105 | capture_screen=True): 106 | #self.sessionId = None 107 | self.is_running = False 108 | self.tbb_version = tbb_version 109 | self.export_lib_path() 110 | # Initialize Tor Browser's profile 111 | self.page_url = page_url 112 | self.capture_screen = capture_screen 113 | self.profile = self.init_tbb_profile(tbb_version) 114 | # set homepage to a blank tab 115 | self.profile.set_preference('browser.startup.page', "0") 116 | self.profile.set_preference('browser.startup.homepage', 'about:newtab') 117 | 118 | # configure Firefox to use Tor SOCKS proxy 119 | self.profile.set_preference('network.proxy.type', 1) 120 | self.profile.set_preference('network.proxy.socks', '127.0.0.1') 121 | self.profile.set_preference('network.proxy.socks_port', cm.SOCKS_PORT) 122 | if cm.DISABLE_RANDOMIZEDPIPELINENING: 123 | self.profile.set_preference( 124 | 'network.http.pipelining.max-optimistic-requests', 5000) 125 | self.profile.set_preference( 126 | 'network.http.pipelining.maxrequests', 15000) 127 | self.profile.set_preference('network.http.pipelining', False) 128 | 129 | 
self.profile.set_preference( 130 | 'extensions.torlauncher.prompt_at_startup', 131 | 0) 132 | 133 | # Disable cache - Wang & Goldberg's setting 134 | self.profile.set_preference('network.http.use-cache', False) 135 | 136 | # http://www.w3.org/TR/webdriver/#page-load-strategies-1 137 | # wait for all frames to load and make sure there's no 138 | # outstanding http requests (except AJAX) 139 | # https://code.google.com/p/selenium/wiki/DesiredCapabilities 140 | self.profile.set_preference('webdriver.load.strategy', 'conservative') 141 | # Note that W3C doesn't mention "conservative", this may change in the 142 | # upcoming versions of the Firefox Webdriver 143 | # https://w3c.github.io/webdriver/webdriver-spec.html#the-page-load-strategy 144 | 145 | # prevent Tor Browser running it's own Tor process 146 | self.profile.set_preference('extensions.torlauncher.start_tor', False) 147 | self.profile.set_preference( 148 | 'extensions.torbutton.versioncheck_enabled', False) 149 | self.profile.set_preference('permissions.memory_only', False) 150 | self.profile.update_preferences() 151 | # Initialize Tor Browser's binary 152 | self.binary = self.get_tbb_binary(tbb_version=self.tbb_version, 153 | logfile=tbb_logfile_path) 154 | 155 | # Initialize capabilities 156 | self.capabilities = DesiredCapabilities.FIREFOX 157 | #self.capabilities.update({'handlesAlerts': True, 158 | # 'databaseEnabled': True, 159 | # 'javascriptEnabled': True, 160 | # 'browserConnectionEnabled': True}) 161 | 162 | try: 163 | super(TorBrowserDriver, self)\ 164 | .__init__(firefox_profile=self.profile, 165 | firefox_binary=self.binary, 166 | capabilities=self.capabilities) 167 | self.is_running = True 168 | except WebDriverException as error: 169 | wl_log.error("WebDriverException while connecting to Webdriver %s" 170 | % error) 171 | except socket.error as skterr: 172 | wl_log.error("Error connecting to Webdriver", exc_info=True) 173 | wl_log.error(skterr.message) 174 | except Exception as e: 175 | 
wl_log.error("Error connecting to Webdriver: %s" % e, 176 | exc_info=True) 177 | 178 | def export_lib_path(self): 179 | os.environ["LD_LIBRARY_PATH"] = os.path.dirname( 180 | cm.get_tor_bin_path(self.tbb_version)) 181 | 182 | def get_tbb_binary(self, tbb_version, binary=None, logfile=None): 183 | """Return FirefoxBinary pointing to the TBB's firefox binary.""" 184 | tbb_logfile = None 185 | if not binary: 186 | binary = cm.get_tb_bin_path(tbb_version) 187 | if logfile: 188 | tbb_logfile = open(logfile, 'a+') 189 | 190 | # in case you get an error for the unknown log_file, make sure your 191 | # Selenium version is compatible with the Firefox version in TBB. 192 | tbb_binary = FirefoxBinary(firefox_path=binary, 193 | log_file=tbb_logfile) 194 | return tbb_binary 195 | 196 | def add_canvas_permission(self): 197 | """Create a permission db (permissions.sqlite) and add 198 | 199 | exception for the canvas image extraction. Otherwise screenshots 200 | taken by Selenium will be just blank images due to canvas 201 | fingerprinting defense in TBB.""" 202 | 203 | connect_to_db = sqlite3.connect # @UndefinedVariable 204 | perm_db = connect_to_db(os.path.join(self.prof_dir_path, 205 | "permissions.sqlite")) 206 | cursor = perm_db.cursor() 207 | # http://mxr.mozilla.org/mozilla-esr31/source/build/automation.py.in 208 | cursor.execute("PRAGMA user_version=3") 209 | cursor.execute("""CREATE TABLE IF NOT EXISTS moz_hosts ( 210 | id INTEGER PRIMARY KEY, 211 | host TEXT, 212 | type TEXT, 213 | permission INTEGER, 214 | expireType INTEGER, 215 | expireTime INTEGER, 216 | appId INTEGER, 217 | isInBrowserElement INTEGER)""") 218 | 219 | domain = get_tld(self.page_url) 220 | wl_log.debug("Adding canvas/extractData permission for %s" % domain) 221 | qry = """INSERT INTO 'moz_hosts' 222 | VALUES(NULL,'%s','canvas/extractData',1,0,0,0,0);""" % domain 223 | cursor.execute(qry) 224 | perm_db.commit() 225 | cursor.close() 226 | 227 | def init_tbb_profile(self, version): 228 | 
profile_directory = cm.get_tbb_profile_path(version) 229 | self.prof_dir_path = clone_dir_with_timestap(profile_directory) 230 | if self.capture_screen and self.page_url: 231 | self.add_canvas_permission() 232 | try: 233 | tbb_profile = webdriver.FirefoxProfile(self.prof_dir_path) 234 | except Exception: 235 | wl_log.error("Error creating the TB profile", exc_info=True) 236 | else: 237 | return tbb_profile 238 | 239 | def quit(self): 240 | """ 241 | Overrides the base class method cleaning the timestamped profile. 242 | 243 | """ 244 | self.is_running = False 245 | try: 246 | wl_log.info("Quit: Removing profile dir") 247 | shutil.rmtree(self.prof_dir_path) 248 | super(TorBrowserDriver, self).quit() 249 | except CannotSendRequest: 250 | wl_log.error("CannotSendRequest while quitting TorBrowserDriver", 251 | exc_info=False) 252 | # following is copied from webdriver.firefox.webdriver.quit() which 253 | # was interrupted due to an unhandled CannotSendRequest exception. 254 | 255 | # kill the browser 256 | self.binary.kill() 257 | # remove the profile folder 258 | try: 259 | shutil.rmtree(self.profile.path) 260 | if self.profile.tempfolder is not None: 261 | shutil.rmtree(self.profile.tempfolder) 262 | except Exception as e: 263 | print(str(e)) 264 | except Exception: 265 | wl_log.error("Exception while quitting TorBrowserDriver", 266 | exc_info=True) 267 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/datacollection/torutils_pj.py: -------------------------------------------------------------------------------- 1 | import os 2 | from selenium import webdriver 3 | from selenium.common.exceptions import WebDriverException 4 | from selenium.webdriver import DesiredCapabilities 5 | from selenium.webdriver import firefox 6 | from selenium.webdriver.firefox.firefox_binary import FirefoxBinary 7 | from selenium.webdriver import phantomjs 8 | import shutil 9 | import socket 10 | from stem.control import 
Controller 11 | import stem.process 12 | from stem.util import term 13 | import sqlite3 14 | import sys 15 | from httplib import CannotSendRequest 16 | from tld import get_tld 17 | import common as cm 18 | from log import wl_log 19 | from utils import clone_dir_with_timestap 20 | import utils as ut 21 | 22 | 23 | class TorController(object): 24 | def __init__(self, torrc_dict, tbb_version, tor_log='/dev/null'): 25 | self.torrc_dict = torrc_dict 26 | self.controller = None 27 | self.tbb_version = tbb_version 28 | self.tmp_tor_data_dir = None 29 | self.tor_process = None 30 | self.log_file = tor_log 31 | 32 | def tor_log_handler(self, line): 33 | wl_log.info(term.format(line)) 34 | 35 | def restart_tor(self): 36 | """Kill current Tor process and run a new one.""" 37 | self.kill_tor_proc() 38 | self.launch_tor_service(self.log_file) 39 | 40 | def kill_tor_proc(self): 41 | """Kill Tor process.""" 42 | if self.tor_process: 43 | wl_log.info("Killing tor process") 44 | self.tor_process.kill() 45 | if self.tmp_tor_data_dir and os.path.isdir(self.tmp_tor_data_dir): 46 | wl_log.info("Removing tmp tor data dir") 47 | shutil.rmtree(self.tmp_tor_data_dir) 48 | 49 | def launch_tor_service(self, logfile='/dev/null'): 50 | """Launch Tor service and return the process.""" 51 | self.log_file = logfile 52 | self.tmp_tor_data_dir = ut.clone_dir_with_timestap( 53 | cm.get_tor_data_path(self.tbb_version)) 54 | 55 | self.torrc_dict.update({'DataDirectory': self.tmp_tor_data_dir, 56 | 'Log': ['INFO file %s' % logfile]}) 57 | 58 | wl_log.debug("Tor config: %s" % self.torrc_dict) 59 | try: 60 | self.tor_process = stem.process.launch_tor_with_config( 61 | config=self.torrc_dict, 62 | init_msg_handler=self.tor_log_handler, 63 | tor_cmd=cm.get_tor_bin_path(self.tbb_version), 64 | timeout=270 65 | ) 66 | self.controller = Controller.from_port() 67 | self.controller.authenticate() 68 | return self.tor_process 69 | 70 | except stem.SocketError as exc: 71 | wl_log.critical("Unable to connect to 
tor on port %s: %s" % 72 | (cm.SOCKS_PORT, exc)) 73 | sys.exit(1) 74 | except: 75 | # most of the time this is due to another instance of 76 | # tor running on the system 77 | wl_log.critical("Error launching Tor", exc_info=True) 78 | sys.exit(1) 79 | 80 | wl_log.info("Tor running at port {0} & controller port {1}." 81 | .format(cm.SOCKS_PORT, cm.CONTROLLER_PORT)) 82 | return self.tor_process 83 | 84 | def close_all_streams(self): 85 | """Close all streams of a controller.""" 86 | wl_log.debug("Closing all streams") 87 | try: 88 | ut.timeout(cm.STREAM_CLOSE_TIMEOUT) 89 | for stream in self.controller.get_streams(): 90 | wl_log.debug("Closing stream %s %s %s " % 91 | (stream.id, stream.purpose, 92 | stream.target_address)) 93 | self.controller.close_stream(stream.id) # MISC reason 94 | except ut.TimeExceededError: 95 | wl_log.critical("Closing streams timed out!") 96 | except: 97 | wl_log.debug("Exception closing stream") 98 | finally: 99 | ut.cancel_timeout() 100 | 101 | 102 | class TorBrowserDriver(webdriver.PhantomJS, phantomjs.webdriver.RemoteWebDriver): 103 | def __init__(self, tbb_binary_path=None, tbb_profile_dir=None, 104 | tbb_logfile_path=None, 105 | tbb_version=cm.TBB_DEFAULT_VERSION, page_url="", 106 | capture_screen=True): 107 | #self.session_id = None 108 | self.is_running = False 109 | self.tbb_version = tbb_version 110 | self.export_lib_path() 111 | # Initialize Tor Browser's profile 112 | self.page_url = page_url 113 | self.capture_screen = capture_screen 114 | self.profile = self.init_tbb_profile(tbb_version) 115 | # set homepage to a blank tab 116 | self.profile.set_preference('browser.startup.page', "0") 117 | self.profile.set_preference('browser.startup.homepage', 'about:newtab') 118 | 119 | # configure Firefox to use Tor SOCKS proxy 120 | self.profile.set_preference('network.proxy.type', 1) 121 | self.profile.set_preference('network.proxy.socks', '127.0.0.1') 122 | self.profile.set_preference('network.proxy.socks_port', cm.SOCKS_PORT) 123 | if 
cm.DISABLE_RANDOMIZEDPIPELINENING: 124 | self.profile.set_preference( 125 | 'network.http.pipelining.max-optimistic-requests', 5000) 126 | self.profile.set_preference( 127 | 'network.http.pipelining.maxrequests', 15000) 128 | self.profile.set_preference('network.http.pipelining', False) 129 | 130 | self.profile.set_preference( 131 | 'extensions.torlauncher.prompt_at_startup', 132 | 0) 133 | 134 | # Disable cache - Wang & Goldberg's setting 135 | self.profile.set_preference('network.http.use-cache', False) 136 | 137 | # http://www.w3.org/TR/webdriver/#page-load-strategies-1 138 | # wait for all frames to load and make sure there's no 139 | # outstanding http requests (except AJAX) 140 | # https://code.google.com/p/selenium/wiki/DesiredCapabilities 141 | self.profile.set_preference('webdriver.load.strategy', 'conservative') 142 | # Note that W3C doesn't mention "conservative", this may change in the 143 | # upcoming versions of the Firefox Webdriver 144 | # https://w3c.github.io/webdriver/webdriver-spec.html#the-page-load-strategy 145 | 146 | # prevent Tor Browser running it's own Tor process 147 | self.profile.set_preference('extensions.torlauncher.start_tor', False) 148 | self.profile.set_preference( 149 | 'extensions.torbutton.versioncheck_enabled', False) 150 | self.profile.set_preference('permissions.memory_only', False) 151 | self.profile.update_preferences() 152 | # Initialize Tor Browser's binary 153 | self.binary = self.get_tbb_binary(tbb_version=self.tbb_version, 154 | logfile=tbb_logfile_path) 155 | 156 | # Initialize capabilities 157 | self.capabilities = DesiredCapabilities.PHANTOMJS 158 | self.capabilities.update({'handlesAlerts': True,'databaseEnabled': True, 'browserConnectionEnabled': True, 'javascriptEnabled': True}) 159 | # 'javascriptEnabled': True}) #,'handlesAlerts': True,'databaseEnabled': True, 'browserConnectionEnabled': True 160 | service_args = [ '--proxy=127.0.0.1:%s'%(cm.SOCKS_PORT), '--proxy-type=socks5',] 161 | 162 | try: 163 | 
super(TorBrowserDriver, self)\ 164 | .__init__(executable_path="/usr/bin/phantomjs", 165 | desired_capabilities=self.capabilities, service_args=service_args) 166 | self.is_running = True 167 | except WebDriverException as error: 168 | wl_log.error("WebDriverException while connecting to Webdriver %s" 169 | % error) 170 | except socket.error as skterr: 171 | wl_log.error("Error connecting to Webdriver", exc_info=True) 172 | wl_log.error(skterr.message) 173 | except Exception as e: 174 | wl_log.error("Error connecting to Webdriver: %s" % e, 175 | exc_info=True) 176 | 177 | def export_lib_path(self): 178 | os.environ["LD_LIBRARY_PATH"] = os.path.dirname( 179 | cm.get_tor_bin_path(self.tbb_version)) 180 | 181 | def get_tbb_binary(self, tbb_version, binary=None, logfile=None): 182 | """Return FirefoxBinary pointing to the TBB's firefox binary.""" 183 | tbb_logfile = None 184 | if not binary: 185 | binary = cm.get_tb_bin_path(tbb_version) 186 | if logfile: 187 | tbb_logfile = open(logfile, 'a+') 188 | 189 | # in case you get an error for the unknown log_file, make sure your 190 | # Selenium version is compatible with the Firefox version in TBB. 191 | tbb_binary = FirefoxBinary(firefox_path=binary, 192 | log_file=tbb_logfile) 193 | return tbb_binary 194 | 195 | def add_canvas_permission(self): 196 | """Create a permission db (permissions.sqlite) and add 197 | 198 | exception for the canvas image extraction. 
Otherwise screenshots 199 | taken by Selenium will be just blank images due to canvas 200 | fingerprinting defense in TBB.""" 201 | 202 | connect_to_db = sqlite3.connect # @UndefinedVariable 203 | perm_db = connect_to_db(os.path.join(self.prof_dir_path, 204 | "permissions.sqlite")) 205 | cursor = perm_db.cursor() 206 | # http://mxr.mozilla.org/mozilla-esr31/source/build/automation.py.in 207 | cursor.execute("PRAGMA user_version=3") 208 | cursor.execute("""CREATE TABLE IF NOT EXISTS moz_hosts ( 209 | id INTEGER PRIMARY KEY, 210 | host TEXT, 211 | type TEXT, 212 | permission INTEGER, 213 | expireType INTEGER, 214 | expireTime INTEGER, 215 | appId INTEGER, 216 | isInBrowserElement INTEGER)""") 217 | 218 | domain = get_tld(self.page_url) 219 | wl_log.debug("Adding canvas/extractData permission for %s" % domain) 220 | qry = """INSERT INTO 'moz_hosts' 221 | VALUES(NULL,'%s','canvas/extractData',1,0,0,0,0);""" % domain 222 | cursor.execute(qry) 223 | perm_db.commit() 224 | cursor.close() 225 | 226 | def init_tbb_profile(self, version): 227 | profile_directory = cm.get_tbb_profile_path(version) 228 | self.prof_dir_path = clone_dir_with_timestap(profile_directory) 229 | if self.capture_screen and self.page_url: 230 | self.add_canvas_permission() 231 | try: 232 | tbb_profile = webdriver.FirefoxProfile(self.prof_dir_path) 233 | except Exception: 234 | wl_log.error("Error creating the TB profile", exc_info=True) 235 | else: 236 | return tbb_profile 237 | 238 | def quit(self): 239 | """ 240 | Overrides the base class method cleaning the timestamped profile. 
class Visit(object):
    """Hold info about a particular visit to a page.

    A visit creates its own results/log directories, optionally starts an
    Xvfb virtual display, and owns a ``TorBrowserDriver`` and a ``Sniffer``.
    Call :meth:`get` to run the visit for the configured experiment type;
    :meth:`cleanup_visit` releases everything the visit started.
    """

    def __init__(self, batch_num, site_num, instance_num, page_url,
                 base_dir, tbb_version, tor_controller, bg_site=None,
                 experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False,
                 capture_screen=True):
        """Set up directories, the virtual display, the browser and sniffer.

        batch_num, site_num, instance_num -- indices used to name the pcap.
        page_url -- URL to be visited.
        base_dir -- directory under which the per-instance dir is created.
        tbb_version -- Tor Browser Bundle version handed to the driver.
        tor_controller -- object whose ``close_all_streams()`` is called
            during cleanup.
        bg_site -- background site typed into the first tab by the
            multitab experiment.
        experiment -- one of the ``cm.EXP_TYPE_*`` constants.
        xvfb -- start an Xvfb display (skipped when running in CI).
        capture_screen -- take a screenshot after the page has loaded.

        Any exception raised while creating the browser driver is
        re-raised after the virtual display has been stopped.
        """
        self.batch_num = batch_num
        self.site_num = site_num
        self.instance_num = instance_num
        self.page_url = page_url
        self.bg_site = bg_site
        self.experiment = experiment
        self.base_dir = base_dir
        self.visit_dir = None
        self.visit_log_dir = None
        self.tbb_version = tbb_version
        self.capture_screen = capture_screen
        self.tor_controller = tor_controller
        self.xvfb = xvfb
        # Always defined so that cleanup can tell what was actually started.
        self.vdisplay = None
        self.tb_driver = None
        self.sniffer = None
        self.init_visit_dir()
        self.pcap_path = os.path.join(
            self.visit_dir, "{}.pcap".format(self.get_instance_name()))

        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Starting XVFBm %sX%s" % (cm.XVFB_W, cm.XVFB_H))
            self.vdisplay = Xvfb(width=cm.XVFB_W, height=cm.XVFB_H)
            self.vdisplay.start()

        # Create new instance of TorBrowser driver
        try:
            self.tb_driver = TorBrowserDriver(
                tbb_logfile_path=os.path.join(
                    self.visit_log_dir, "firefox.log"),
                tbb_version=tbb_version,
                page_url=page_url)
        except BaseException:
            # The caller never receives this object when construction
            # fails, so it cannot call cleanup_visit(); stop the display
            # here to avoid leaking it.
            self._stop_vdisplay()
            raise

        self.sniffer = Sniffer()  # sniffer to capture the network traffic

    def init_visit_dir(self):
        """Create results and logs directories for this visit."""
        visit_name = str(self.instance_num)
        self.visit_dir = os.path.join(self.base_dir, visit_name)
        ut.create_dir(self.visit_dir)
        self.visit_log_dir = os.path.join(self.visit_dir, 'logs')
        ut.create_dir(self.visit_log_dir)

    def get_instance_name(self):
        """Construct and return a filename for the instance.

        The name has the form ``<batch_num>_<site_num>_<instance_num>``.
        """
        return '{}_{}_{}'.format(
            self.batch_num, self.site_num, self.instance_num)

    def _stop_vdisplay(self):
        """Stop the virtual display if this visit started one.

        Clearing the reference afterwards makes repeated cleanup calls
        harmless instead of stopping the same display twice.
        """
        if self.vdisplay is not None:
            self.vdisplay.stop()
            self.vdisplay = None

    def cleanup_visit(self):
        """Kill sniffer and Tor browser if they're running."""
        wl_log.info("Cleaning up visit.")
        wl_log.info("Cancelling timeout")
        ut.cancel_timeout()

        if self.sniffer and self.sniffer.is_recording:
            wl_log.info("Stopping sniffer...")
            self.sniffer.stop_capture()
        if self.tb_driver and self.tb_driver.is_running:
            wl_log.info("Quitting selenium driver...")
            self.tb_driver.quit()

        # close all open streams to prevent pollution
        self.tor_controller.close_all_streams()
        self._stop_vdisplay()

    def take_screenshot(self):
        """Save a screenshot of the current page into the visit directory.

        Best effort: a failure is logged (with traceback) and not raised.
        """
        try:
            out_png = os.path.join(self.visit_dir, 'screenshot.png')
            wl_log.info("Taking screenshot of %s to %s" % (self.page_url,
                                                           out_png))
            self.tb_driver.get_screenshot_as_file(out_png)
            if cm.running_in_CI:
                wl_log.debug("Screenshot data:image/png;base64,%s"
                             % self.tb_driver.get_screenshot_as_base64())
        except Exception:
            # NOTE(review): if the hard-timeout error raised through
            # ut.timeout() derives from Exception it is swallowed here, as
            # it was before -- confirm that this is intended.
            wl_log.info("Exception while taking screenshot of: %s"
                        % self.page_url, exc_info=True)

    def _start_capture(self):
        """Start sniffing into ``self.pcap_path``.

        Both experiments share this filter so that traffic to the
        VirtualBox gateway, localhost and the LXC gateway is excluded
        consistently from every capture.
        """
        self.sniffer.start_capture(
            self.pcap_path,
            'tcp and not host %s and not host %s and not host %s'
            % (VBOX_GATEWAY_IP, LOCALHOST_IP, LXC_GATEWAY_IP))

    def _set_page_load_timeout(self):
        """Apply the soft page-load timeout; failures are only logged."""
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except Exception:
            wl_log.info("Exception setting a timeout {}".format(self.page_url),
                        exc_info=True)

    def _load_page_and_finish(self):
        """Load ``page_url``, log the load time, screenshot and clean up."""
        t1 = time.time()
        self.tb_driver.get(self.page_url)
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get_wang_and_goldberg(self):
        """Visit the site according to Wang and Goldberg (WPES'13) settings."""
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to stop the visit

        self._start_capture()

        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        self._set_page_load_timeout()

        wl_log.info("Crawling URL: {}".format(self.page_url))
        self._load_page_and_finish()

    def get_multitab(self):
        """Open two tab, use one to load a background site and the other to
        load the real site."""
        PAUSE_BETWEEN_TAB_OPENINGS = 0.5
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to kill running procs
        # load a blank page - a page is needed to send keys to the browser
        self.tb_driver.get(BAREBONE_HOME_PAGE)
        self._start_capture()

        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        self._set_page_load_timeout()

        wl_log.info("Crawling URL: {} with {} in the background".
                    format(self.page_url, self.bg_site))

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab
        # now that the focus is on the address bar, load the background
        # site by "typing" it to the address bar and "pressing" ENTER (\n)
        # simulated by send_keys function
        body.send_keys('%s\n' % self.bg_site)

        # the delay between the loading of background and real sites
        time.sleep(PAUSE_BETWEEN_TAB_OPENINGS)

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab

        # load the real site in the 2nd tab
        self._load_page_and_finish()

    def get(self):
        """Call the specific visit function depending on the experiment.

        Raises ValueError when ``self.experiment`` is not a known type.
        """
        if self.experiment == cm.EXP_TYPE_WANG_AND_GOLDBERG:
            self.get_wang_and_goldberg()
        elif self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
            self.get_multitab()
        else:
            raise ValueError("Cannot determine experiment type")
-------------------------------------------------------------------------------- 1 | 2 | 3 | TEST 4 | 5 | 6 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/etc/localized-urls-100-top.csv: -------------------------------------------------------------------------------- 1 | http://www.google.de 2 | http://www.facebook.com 3 | http://www.youtube.com 4 | http://de.yahoo.com 5 | http://www.baidu.com 6 | http://www.wikipedia.org 7 | http://www.qq.com 8 | http://www.linkedin.com 9 | http://www.live.com 10 | http://www.twitter.com 11 | http://www.amazon.com 12 | http://www.taobao.com 13 | http://www.blogspot.com 14 | http://www.sina.com.cn 15 | http://www.wordpress.com 16 | http://www.yandex.ru 17 | http://www.bing.com 18 | http://www.ebay.com 19 | http://www.hao123.com 20 | http://www.vk.com 21 | http://www.163.com 22 | http://www.tumblr.com 23 | http://www.pinterest.com 24 | http://www.ask.com 25 | http://de.msn.com 26 | http://www.microsoft.com/de-de/default.aspx 27 | http://www.mail.ru 28 | http://www.weibo.com 29 | http://www.apple.com 30 | http://www.paypal.com 31 | http://www.tmall.com 32 | http://www.instagram.com 33 | http://www.xvideos.com 34 | http://www.imdb.com 35 | http://www.sohu.com 36 | http://www.craigslist.org 37 | http://www.360.cn 38 | http://www.soso.com 39 | http://www.go.com 40 | http://www.xhamster.com 41 | http://www.bbc.co.uk 42 | http://www.stackoverflow.com 43 | http://www.neobux.com 44 | http://www.fc2.com 45 | http://www.imgur.com 46 | http://www.alibaba.com 47 | http://www.cnn.com 48 | http://www.adcash.com 49 | http://www.wordpress.org 50 | http://www.espn.go.com 51 | http://www.flickr.com 52 | http://www.huffingtonpost.com 53 | http://www.odnoklassniki.ru 54 | http://www.vube.com 55 | http://www.conduit.com 56 | http://www.adobe.com 57 | http://www.gmw.cn 58 | http://www.aliexpress.com 59 | http://www.reddit.com 60 | http://www.pornhub.com 61 | http://www.about.com 62 | 
http://www.youku.com 63 | http://www.godaddy.com 64 | http://www.rakuten.co.jp 65 | http://www.xinhuanet.com 66 | http://www.ku6.com 67 | http://www.dailymotion.com/de 68 | http://www.ifeng.com 69 | http://www.cnet.com 70 | http://www.netflix.com 71 | http://www.vimeo.com 72 | http://www.uol.com.br 73 | http://www.dailymail.co.uk 74 | http://www.youporn.com 75 | http://www.kickass.to 76 | http://www.adf.ly 77 | http://www.aol.com 78 | http://www.redtube.com 79 | http://www.themeforest.net 80 | http://www.dropbox.com 81 | http://www.sogou.com 82 | http://www.livejasmin.com 83 | http://www.indiatimes.com 84 | http://www.amazonaws.com 85 | http://www.globo.com 86 | http://www.people.com.cn 87 | http://www.xnxx.com 88 | http://www.slideshare.net 89 | http://www.nytimes.com 90 | http://www.directrev.com 91 | http://www.pixnet.net 92 | http://www.avg.com/de-de/ 93 | http://www.alipay.com 94 | http://www.wikimedia.org 95 | http://www.fiverr.com 96 | http://www.ameblo.jp 97 | http://www.deviantart.com 98 | http://www.hootsuite.com 99 | http://www.livedoor.com 100 | http://www.yelp.com 101 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/etc/urls-100-10-mon.csv: -------------------------------------------------------------------------------- 1 | http://www.bongacams.com 2 | http://www.chaturbate.com 3 | http://www.livejasmin.com 4 | http://www.porn555.com 5 | http://www.pornhub.com 6 | http://www.thepiratebay.org 7 | http://www.txxx.com 8 | http://www.xhamster.com 9 | http://www.xnxx.com 10 | http://www.xvideos.com 11 | http://www.bongacams.com 12 | http://www.chaturbate.com 13 | http://www.livejasmin.com 14 | http://www.porn555.com 15 | http://www.pornhub.com 16 | http://www.thepiratebay.org 17 | http://www.txxx.com 18 | http://www.xhamster.com 19 | http://www.xnxx.com 20 | http://www.xvideos.com 21 | http://www.bongacams.com 22 | http://www.chaturbate.com 23 | http://www.livejasmin.com 24 | 
http://www.porn555.com 25 | http://www.pornhub.com 26 | http://www.thepiratebay.org 27 | http://www.txxx.com 28 | http://www.xhamster.com 29 | http://www.xnxx.com 30 | http://www.xvideos.com 31 | http://www.bongacams.com 32 | http://www.chaturbate.com 33 | http://www.livejasmin.com 34 | http://www.porn555.com 35 | http://www.pornhub.com 36 | http://www.thepiratebay.org 37 | http://www.txxx.com 38 | http://www.xhamster.com 39 | http://www.xnxx.com 40 | http://www.xvideos.com 41 | http://www.bongacams.com 42 | http://www.chaturbate.com 43 | http://www.livejasmin.com 44 | http://www.porn555.com 45 | http://www.pornhub.com 46 | http://www.thepiratebay.org 47 | http://www.txxx.com 48 | http://www.xhamster.com 49 | http://www.xnxx.com 50 | http://www.xvideos.com 51 | http://www.bongacams.com 52 | http://www.chaturbate.com 53 | http://www.livejasmin.com 54 | http://www.porn555.com 55 | http://www.pornhub.com 56 | http://www.thepiratebay.org 57 | http://www.txxx.com 58 | http://www.xhamster.com 59 | http://www.xnxx.com 60 | http://www.xvideos.com 61 | http://www.bongacams.com 62 | http://www.chaturbate.com 63 | http://www.livejasmin.com 64 | http://www.porn555.com 65 | http://www.pornhub.com 66 | http://www.thepiratebay.org 67 | http://www.txxx.com 68 | http://www.xhamster.com 69 | http://www.xnxx.com 70 | http://www.xvideos.com 71 | http://www.bongacams.com 72 | http://www.chaturbate.com 73 | http://www.livejasmin.com 74 | http://www.porn555.com 75 | http://www.pornhub.com 76 | http://www.thepiratebay.org 77 | http://www.txxx.com 78 | http://www.xhamster.com 79 | http://www.xnxx.com 80 | http://www.xvideos.com 81 | http://www.bongacams.com 82 | http://www.chaturbate.com 83 | http://www.livejasmin.com 84 | http://www.porn555.com 85 | http://www.pornhub.com 86 | http://www.thepiratebay.org 87 | http://www.txxx.com 88 | http://www.xhamster.com 89 | http://www.xnxx.com 90 | http://www.xvideos.com 91 | http://www.bongacams.com 92 | http://www.chaturbate.com 93 | 
http://www.livejasmin.com 94 | http://www.porn555.com 95 | http://www.pornhub.com 96 | http://www.thepiratebay.org 97 | http://www.txxx.com 98 | http://www.xhamster.com 99 | http://www.xnxx.com 100 | http://www.xvideos.com 101 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/etc/urls-100-google.csv: -------------------------------------------------------------------------------- 1 | http://www.google.com 2 | http://www.google.com 3 | http://www.google.com 4 | http://www.google.com 5 | http://www.google.com 6 | http://www.google.com 7 | http://www.google.com 8 | http://www.google.com 9 | http://www.google.com 10 | http://www.google.com 11 | http://www.google.com 12 | http://www.google.com 13 | http://www.google.com 14 | http://www.google.com 15 | http://www.google.com 16 | http://www.google.com 17 | http://www.google.com 18 | http://www.google.com 19 | http://www.google.com 20 | http://www.google.com 21 | http://www.google.com 22 | http://www.google.com 23 | http://www.google.com 24 | http://www.google.com 25 | http://www.google.com 26 | http://www.google.com 27 | http://www.google.com 28 | http://www.google.com 29 | http://www.google.com 30 | http://www.google.com 31 | http://www.google.com 32 | http://www.google.com 33 | http://www.google.com 34 | http://www.google.com 35 | http://www.google.com 36 | http://www.google.com 37 | http://www.google.com 38 | http://www.google.com 39 | http://www.google.com 40 | http://www.google.com 41 | http://www.google.com 42 | http://www.google.com 43 | http://www.google.com 44 | http://www.google.com 45 | http://www.google.com 46 | http://www.google.com 47 | http://www.google.com 48 | http://www.google.com 49 | http://www.google.com 50 | http://www.google.com 51 | http://www.google.com 52 | http://www.google.com 53 | http://www.google.com 54 | http://www.google.com 55 | http://www.google.com 56 | http://www.google.com 57 | http://www.google.com 58 | 
http://www.google.com 59 | http://www.google.com 60 | http://www.google.com 61 | http://www.google.com 62 | http://www.google.com 63 | http://www.google.com 64 | http://www.google.com 65 | http://www.google.com 66 | http://www.google.com 67 | http://www.google.com 68 | http://www.google.com 69 | http://www.google.com 70 | http://www.google.com 71 | http://www.google.com 72 | http://www.google.com 73 | http://www.google.com 74 | http://www.google.com 75 | http://www.google.com 76 | http://www.google.com 77 | http://www.google.com 78 | http://www.google.com 79 | http://www.google.com 80 | http://www.google.com 81 | http://www.google.com 82 | http://www.google.com 83 | http://www.google.com 84 | http://www.google.com 85 | http://www.google.com 86 | http://www.google.com 87 | http://www.google.com 88 | http://www.google.com 89 | http://www.google.com 90 | http://www.google.com 91 | http://www.google.com 92 | http://www.google.com 93 | http://www.google.com 94 | http://www.google.com 95 | http://www.google.com 96 | http://www.google.com 97 | http://www.google.com 98 | http://www.google.com 99 | http://www.google.com 100 | http://www.google.com 101 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/etc/urls-100-top.csv: -------------------------------------------------------------------------------- 1 | http://www.google.com 2 | http://www.youtube.com 3 | http://www.facebook.com 4 | http://www.baidu.com 5 | http://www.wikipedia.org 6 | http://www.yahoo.com 7 | http://www.qq.com 8 | http://www.reddit.com 9 | http://www.taobao.com 10 | http://www.twitter.com 11 | http://www.amazon.com 12 | http://www.tmall.com 13 | http://www.sohu.com 14 | http://www.live.com 15 | http://www.vk.com 16 | http://www.instagram.com 17 | http://www.jd.com 18 | http://www.weibo.com 19 | http://www.360.cn 20 | http://www.linkedin.com 21 | http://www.netflix.com 22 | http://www.ebay.com 23 | http://www.imgur.com 24 | 
http://www.pornhub.com 25 | http://www.bing.com 26 | http://www.onclkds.com 27 | http://www.msn.com 28 | http://www.microsoft.com 29 | http://www.livejasmin.com 30 | http://www.twitch.tv 31 | http://www.hao123.com 32 | http://www.alipay.com 33 | http://www.wordpress.com 34 | http://www.aliexpress.com 35 | http://www.xvideos.com 36 | http://www.stackoverflow.com 37 | http://www.imdb.com 38 | http://www.blogspot.com 39 | http://www.pinterest.com 40 | http://www.github.com 41 | http://www.office.com 42 | http://www.apple.com 43 | http://www.csdn.net 44 | http://www.popads.net 45 | http://www.microsoftonline.com 46 | http://www.wikia.com 47 | http://www.whatsapp.com 48 | http://www.diply.com 49 | http://www.paypal.com 50 | http://www.xhamster.com 51 | http://www.ntd.tv 52 | http://www.adobe.com 53 | http://www.coccoc.com 54 | http://www.bongacams.com 55 | http://www.soso.com 56 | http://www.dropbox.com 57 | http://www.googleusercontent.com 58 | http://www.pixnet.net 59 | http://www.txxx.com 60 | http://www.craigslist.org 61 | http://www.so.com 62 | http://www.thepiratebay.org 63 | http://www.porn555.com 64 | http://www.bbc.com 65 | http://www.cnn.com 66 | http://www.fc2.com 67 | http://www.clicksgear.com 68 | http://www.china.com 69 | http://www.booking.com 70 | http://www.soundcloud.com 71 | http://www.quora.com 72 | http://www.naver.com 73 | http://www.uptodown.com 74 | http://www.nytimes.com 75 | http://www.ask.com 76 | http://www.ettoday.net 77 | http://www.savefrom.net 78 | http://www.dailymotion.com 79 | http://www.amazonaws.com 80 | http://www.xnxx.com 81 | http://www.blastingnews.com 82 | http://www.theguardian.com 83 | http://www.detik.com 84 | http://www.espn.com 85 | http://www.vice.com 86 | http://www.blogger.com 87 | http://www.fbcdn.net 88 | http://www.onlinesbi.com 89 | http://www.tribunnews.com 90 | http://www.stackexchange.com 91 | http://www.vimeo.com 92 | http://www.salesforce.com 93 | http://www.flipkart.com 94 | http://www.chaturbate.com 95 | 
http://www.ladbible.com 96 | http://www.spotify.com 97 | http://www.steamcommunity.com 98 | http://www.buzzfeed.com 99 | http://www.daikynguyenvn.com 100 | http://www.chase.com -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/geckodriver.log: -------------------------------------------------------------------------------- 1 | 1500328561044 geckodriver INFO geckodriver 0.18.0 2 | 1500328561047 geckodriver INFO Listening on 127.0.0.1:59677 3 | 1500328562588 geckodriver::marionette INFO Starting browser /home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/firefox with args ["-marionette"] 4 | 1500328562800 addons.manager DEBUG Loaded provider scope for resource://gre/modules/addons/XPIProvider.jsm: ["XPIProvider"] 5 | 1500328562800 addons.manager DEBUG Loaded provider scope for resource://gre/modules/LightweightThemeManager.jsm: ["LightweightThemeManager"] 6 | 1500328562800 addons.manager DEBUG Loaded provider scope for resource://gre/modules/addons/GMPProvider.jsm 7 | 1500328562900 addons.manager DEBUG Loaded provider scope for resource://gre/modules/addons/PluginProvider.jsm 8 | 1500328562900 addons.manager DEBUG Starting provider: XPIProvider 9 | 1500328562900 addons.xpi DEBUG startup 10 | 1500328562900 addons.xpi INFO Mapping {73a6fe31-595d-460b-a920-fcc0f8843232} to /tmp/rust_mozprofile.65WNOk7gDoFe/extensions/{73a6fe31-595d-460b-a920-fcc0f8843232}.xpi 11 | 1500328562900 addons.xpi INFO Mapping tor-launcher@torproject.org to /tmp/rust_mozprofile.65WNOk7gDoFe/extensions/tor-launcher@torproject.org.xpi 12 | 1500328562900 addons.xpi INFO Mapping torbutton@torproject.org to /tmp/rust_mozprofile.65WNOk7gDoFe/extensions/torbutton@torproject.org.xpi 13 | 1500328562900 addons.xpi INFO Mapping https-everywhere-eff@eff.org to /tmp/rust_mozprofile.65WNOk7gDoFe/extensions/https-everywhere-eff@eff.org.xpi 14 | 1500328562900 addons.xpi INFO 
SystemAddonInstallLocation directory is missing 15 | 1500328562900 addons.xpi INFO Mapping e10srollout@mozilla.org to /home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/browser/features/e10srollout@mozilla.org.xpi 16 | 1500328562900 addons.xpi INFO Mapping {972ce4c6-7e08-4474-a285-3208198ce6fd} to /home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/browser/extensions/{972ce4c6-7e08-4474-a285-3208198ce6fd}.xpi 17 | 1500328562900 addons.xpi DEBUG checkForChanges 18 | 1500328562900 addons.xpi INFO SystemAddonInstallLocation directory is missing 19 | 1500328562900 addons.xpi DEBUG Loaded add-on state from prefs: {"app-profile":{"{73a6fe31-595d-460b-a920-fcc0f8843232}":{"d":"/home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/TorBrowser/Data/Browser/profile.default/extensions/{73a6fe31-595d-460b-a920-fcc0f8843232}.xpi","e":true,"v":"5.0.5","st":1500326959000},"tor-launcher@torproject.org":{"d":"/home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/TorBrowser/Data/Browser/profile.default/extensions/tor-launcher@torproject.org.xpi","e":true,"v":"0.2.12.2","st":1500326959000},"torbutton@torproject.org":{"d":"/home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/TorBrowser/Data/Browser/profile.default/extensions/torbutton@torproject.org.xpi","e":true,"v":"1.9.7.4","st":1500326959000},"https-everywhere-eff@eff.org":{"d":"/home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/TorBrowser/Data/Browser/profile.default/extensions/https-everywhere-eff@eff.org.xpi","e":true,"v":"5.2.19","st":1500326960000}},"app-system-defaults":{"e10srollout@mozilla.org":{"d":"/home/pankaj/wf_is/webfp-crawler-
phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/browser/features/e10srollout@mozilla.org.xpi","e":true,"v":"1.10","st":1500326957000}},"app-global":{"{972ce4c6-7e08-4474-a285-3208198ce6fd}":{"d":"/home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/browser/extensions/{972ce4c6-7e08-4474-a285-3208198ce6fd}.xpi","e":true,"v":"52.2.0","st":1500326957000}}} 20 | 1500328562900 addons.xpi DEBUG getModTime: Recursive scan of {73a6fe31-595d-460b-a920-fcc0f8843232} 21 | 1500328562900 addons.xpi DEBUG Changed add-on {73a6fe31-595d-460b-a920-fcc0f8843232} in app-profile 22 | 1500328562900 addons.xpi DEBUG getModTime: Recursive scan of tor-launcher@torproject.org 23 | 1500328562900 addons.xpi DEBUG Changed add-on tor-launcher@torproject.org in app-profile 24 | 1500328562900 addons.xpi DEBUG getModTime: Recursive scan of torbutton@torproject.org 25 | 1500328562900 addons.xpi DEBUG Changed add-on torbutton@torproject.org in app-profile 26 | 1500328562900 addons.xpi DEBUG getModTime: Recursive scan of https-everywhere-eff@eff.org 27 | 1500328562900 addons.xpi DEBUG Changed add-on https-everywhere-eff@eff.org in app-profile 28 | 1500328562900 addons.xpi DEBUG getModTime: Recursive scan of e10srollout@mozilla.org 29 | 1500328562900 addons.xpi DEBUG Changed add-on e10srollout@mozilla.org in app-system-defaults 30 | 1500328562900 addons.xpi DEBUG getModTime: Recursive scan of {972ce4c6-7e08-4474-a285-3208198ce6fd} 31 | 1500328562900 addons.xpi DEBUG Changed add-on {972ce4c6-7e08-4474-a285-3208198ce6fd} in app-global 32 | 1500328562900 addons.xpi DEBUG getInstallState changed: true, state: 
{"app-profile":{"{73a6fe31-595d-460b-a920-fcc0f8843232}":{"d":"/tmp/rust_mozprofile.65WNOk7gDoFe/extensions/{73a6fe31-595d-460b-a920-fcc0f8843232}.xpi","e":true,"v":"5.0.5","st":1500328562000},"tor-launcher@torproject.org":{"d":"/tmp/rust_mozprofile.65WNOk7gDoFe/extensions/tor-launcher@torproject.org.xpi","e":true,"v":"0.2.12.2","st":1500328562000},"torbutton@torproject.org":{"d":"/tmp/rust_mozprofile.65WNOk7gDoFe/extensions/torbutton@torproject.org.xpi","e":true,"v":"1.9.7.4","st":1500328562000},"https-everywhere-eff@eff.org":{"d":"/tmp/rust_mozprofile.65WNOk7gDoFe/extensions/https-everywhere-eff@eff.org.xpi","e":true,"v":"5.2.19","st":1500328562000}},"app-system-defaults":{"e10srollout@mozilla.org":{"d":"/home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/browser/features/e10srollout@mozilla.org.xpi","e":true,"v":"1.10","st":1500326957000}},"app-global":{"{972ce4c6-7e08-4474-a285-3208198ce6fd}":{"d":"/home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/browser/extensions/{972ce4c6-7e08-4474-a285-3208198ce6fd}.xpi","e":true,"v":"52.2.0","st":1500326957000}}} 33 | 1500328562900 addons.xpi-utils DEBUG Opening XPI database /tmp/rust_mozprofile.65WNOk7gDoFe/extensions.json 34 | 1500328562900 addons.xpi-utils DEBUG Successfully read XPI database 35 | 1500328562900 addons.xpi-utils DEBUG Add-on torbutton@torproject.org modified in app-profile 36 | *** Blocklist::_loadBlocklistFromFile: blocklist is disabled 37 | 1500328562900 DeferredSave.extensions.json DEBUG Save changes 38 | 1500328562900 DeferredSave.extensions.json DEBUG Save changes 39 | 1500328562900 addons.xpi-utils DEBUG Add-on tor-launcher@torproject.org modified in app-profile 40 | 1500328562900 DeferredSave.extensions.json DEBUG Starting timer 41 | 1500328562900 DeferredSave.extensions.json DEBUG Save changes 42 | 1500328562900 DeferredSave.extensions.json DEBUG Save changes 43 
| 1500328562900 addons.xpi-utils DEBUG Add-on {73a6fe31-595d-460b-a920-fcc0f8843232} modified in app-profile 44 | 1500328562900 DeferredSave.extensions.json DEBUG Starting write 45 | 1500328562900 DeferredSave.extensions.json DEBUG Save changes 46 | 1500328562900 DeferredSave.extensions.json DEBUG Data changed while write in progress 47 | 1500328563000 DeferredSave.extensions.json DEBUG Save changes 48 | 1500328563000 addons.xpi-utils DEBUG Add-on https-everywhere-eff@eff.org modified in app-profile 49 | 1500328563000 DeferredSave.extensions.json DEBUG Write succeeded 50 | 1500328563000 addons.xpi-utils DEBUG XPI Database saved, setting schema version preference to 19 51 | 1500328563000 DeferredSave.extensions.json DEBUG Starting timer 52 | 1500328563000 DeferredSave.extensions.json DEBUG Starting write 53 | 1500328563000 DeferredSave.extensions.json DEBUG Write succeeded 54 | 1500328563100 DeferredSave.extensions.json DEBUG Save changes 55 | 1500328563100 DeferredSave.extensions.json DEBUG Save changes 56 | 1500328563100 addons.manager DEBUG Registering startup change 'changed' for torbutton@torproject.org 57 | 1500328563100 addons.xpi-utils DEBUG Make addon app-profile:torbutton@torproject.org visible 58 | 1500328563100 DeferredSave.extensions.json DEBUG Save changes 59 | 1500328563100 addons.manager DEBUG Registering startup change 'changed' for tor-launcher@torproject.org 60 | 1500328563100 addons.xpi-utils DEBUG Make addon app-profile:tor-launcher@torproject.org visible 61 | 1500328563100 DeferredSave.extensions.json DEBUG Save changes 62 | 1500328563100 addons.manager DEBUG Registering startup change 'changed' for {73a6fe31-595d-460b-a920-fcc0f8843232} 63 | 1500328563100 addons.xpi-utils DEBUG Make addon app-profile:{73a6fe31-595d-460b-a920-fcc0f8843232} visible 64 | 1500328563100 DeferredSave.extensions.json DEBUG Save changes 65 | 1500328563100 addons.manager DEBUG Registering startup change 'changed' for https-everywhere-eff@eff.org 66 | 1500328563100 
addons.xpi-utils DEBUG Make addon app-profile:https-everywhere-eff@eff.org visible 67 | 1500328563100 DeferredSave.extensions.json DEBUG Save changes 68 | 1500328563100 addons.xpi-utils DEBUG Make addon app-system-defaults:e10srollout@mozilla.org visible 69 | 1500328563100 DeferredSave.extensions.json DEBUG Save changes 70 | 1500328563100 addons.xpi-utils DEBUG Make addon app-global:{972ce4c6-7e08-4474-a285-3208198ce6fd} visible 71 | 1500328563100 DeferredSave.extensions.json DEBUG Save changes 72 | 1500328563100 addons.xpi DEBUG Updating XPIState for {"id":"torbutton@torproject.org","syncGUID":"cxvROhrGYeAB","location":"app-profile","version":"1.9.7.4","type":"extension","internalName":null,"updateURL":"data:text/plain,","updateKey":"-","optionsURL":"chrome://torbutton/content/preferences.xul","optionsType":null,"aboutURL":null,"icons":{},"iconURL":"chrome://torbutton/skin/tor.png","icon64URL":null,"defaultLocale":{"name":"Torbutton","description":null,"creator":"Mike Perry","homepageURL":"https://www.torproject.org/projects/torbrowser.html.en"},"visible":true,"active":true,"userDisabled":false,"appDisabled":false,"descriptor":"/tmp/rust_mozprofile.65WNOk7gDoFe/extensions/torbutton@torproject.org.xpi","installDate":946684800000,"updateDate":1500328562000,"applyBackgroundUpdates":1,"bootstrap":false,"skinnable":false,"size":1719042,"sourceURI":null,"releaseNotesURI":null,"softDisabled":false,"foreignInstall":true,"hasBinaryComponents":false,"strictCompatibility":false,"locales":[],"targetApplications":[{"id":"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}","minVersion":"45.0","maxVersion":"10000.0"}],"targetPlatforms":[],"multiprocessCompatible":true,"signedState":0,"seen":true,"dependencies":[],"hasEmbeddedWebExtension":false,"mpcOptedOut":false} 73 | 1500328563100 addons.xpi DEBUG Updating XPIState for 
{"id":"tor-launcher@torproject.org","syncGUID":"xUFvDe1sv_Gk","location":"app-profile","version":"0.2.12.2","type":"extension","internalName":null,"updateURL":"data:text/plain,","updateKey":"-","optionsURL":null,"optionsType":null,"aboutURL":null,"icons":{"32":"icon.png","48":"icon.png"},"iconURL":null,"icon64URL":null,"defaultLocale":{"name":"TorLauncher","description":null,"creator":"The Tor Project, Inc.","homepageURL":"https://www.torproject.org/projects/torbrowser.html","contributors":["Pearl Crescent, LLC"]},"visible":true,"active":true,"userDisabled":false,"appDisabled":false,"descriptor":"/tmp/rust_mozprofile.65WNOk7gDoFe/extensions/tor-launcher@torproject.org.xpi","installDate":946684800000,"updateDate":1500328562000,"applyBackgroundUpdates":1,"bootstrap":false,"skinnable":false,"size":1832741,"sourceURI":null,"releaseNotesURI":null,"softDisabled":false,"foreignInstall":true,"hasBinaryComponents":false,"strictCompatibility":false,"locales":[],"targetApplications":[{"id":"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}","minVersion":"24.0","maxVersion":"*.*.*"},{"id":"{3550f703-e582-4d05-9a08-453d09bdfdc6}","minVersion":"10.0","maxVersion":"*.*.*"},{"id":"{33cb9019-c295-46dd-be21-8c4936574bee}","minVersion":"1.4","maxVersion":"*.*.*"}],"targetPlatforms":[],"multiprocessCompatible":true,"signedState":0,"seen":true,"dependencies":[],"hasEmbeddedWebExtension":false,"mpcOptedOut":false} 74 | 1500328563100 addons.xpi DEBUG Updating XPIState for {"id":"{73a6fe31-595d-460b-a920-fcc0f8843232}","syncGUID":"XgY61D-FuchS","location":"app-profile","version":"5.0.5","type":"extension","internalName":null,"updateURL":null,"updateKey":null,"optionsURL":"chrome://noscript/content/noscriptOptions.xul","optionsType":null,"aboutURL":"chrome://noscript/content/about.xul","icons":{},"iconURL":"chrome://noscript/skin/icon32.png","icon64URL":"chrome://noscript/skin/icon64.png","defaultLocale":{"name":"NoScript","description":"Extra protection for your Firefox: NoScript allows JavaScript, 
Java (and other plugins) only for trusted domains of your choice (e.g. your home-banking web site). This whitelist based pre-emptive blocking approach prevents exploitation of security vulnerabilities (known and even unknown!) with no loss of functionality... Experts will agree: Firefox is really safer with NoScript :-)","creator":"Giorgio Maone","homepageURL":"https://noscript.net","contributors":["aivo (Estonian translation)","Alberto Martínez, EduLeo & Urko (Spanish translation)","Algimantas Margevičius (Lithuanian translation)","Asaf Bartov & baryoni (Hebrew translation)","Alf and Liesbeth (Dutch translation)","Alexander Sokolov, Sergei Smirnov and negodnik (Russian translation)","Batuhan Çetin and eveterinary (Turkish translation)","Baurzhan Muftakhidinov (Kazakh translation)","Beerboy & Haebaru (Japanese translation)","Carsten Winkler (Danish translation)","Chiou Po-Jung (Chinese Traditional translation)","Dario Ornelas (Portuguese translation)","drAcOniS and Petr Jirsa (Czech translation)","Drive DRKA and Dzmitry Drazdou (Belarusian translation)","Engin Yazılan, Erkan Kaplan & Fathi (Turkish translation)","Georgi Marchev (Bulgarian translation)","Håvard Mork (Norwegian bokmål translation)","Hwasung Kim (Places bookmarklet patch)","Ivan Pesic, dragan021 (Serbian translation)","Ivan Jonoski (Macedonian translation)","Jameka (Swedish translation)","Joan-Josep Bargues (Catalan translation)","Joshua Issac (Malay translation)","Khaled Hosny & Nassim Dhaher (Arabic translation)","Krcko (Croatian translation)","Ian Moody (English GB translation)","LocaLiceR (Hungarian translation)","Lukasz Biegaj & Teo (Polish translation)","Michela Venuto (inspiration)","Mika Pirinen (Finnish translation)","Mindaugas Jakutis (Lithuanian translation)","Mikes Kaszmán István (Hungarian translation)","MozUA (Ukrainian translation)","Pedram Veisi (Persian translation)","Peter Bradley (Welsh translation)","Raryel Costa Souza (Brazilian Portuguese)","regfreak (Indonesian 
translation)","roebek (Galician translation)","Qen (Thai translation)","seaousak (Korean translation)","SlovakSoft (Slovak translation)","Sonickydon (Greek translation)","Stiepan A. Kovac (Croatian translation)","swarnava (Bengali translation)","Tomaz Macus (Slovenian translation)","Thomas Weber & Volker Hable (German translation)","tonynguyen and loveleeyoungae (Vietnamese translation)","x10firefox and Ultravioletu (Romanian translation)","Xavier Robin & BlackJack (French translation)","X.F Mao & George C. Tsoi (Simplified Chinese)"]},"visible":true,"active":true,"userDisabled":false,"appDisabled":false,"descriptor":"/tmp/rust_mozprofile.65WNOk7gDoFe/extensions/{73a6fe31-595d-460b-a920-fcc0f8843232}.xpi","installDate":946684800000,"updateDate":1500328562000,"applyBackgroundUpdates":1,"bootstrap":false,"skinnable":false,"size":2129125,"sourceURI":null,"releaseNotesURI":null,"softDisabled":false,"foreignInstall":true,"hasBinaryComponents":false,"strictCompatibility":false,"locales":[],"targetApplications":[{"id":"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}","minVersion":"45.0","maxVersion":"*"},{"id":"{92650c4d-4b8e-4d2a-b7eb-24ecf4f6b63a}","minVersion":"2.42","maxVersion":"*"},{"id":"{8de7fcbb-c55c-4fbe-bfc5-fc555c87dbc4}","minVersion":"27.0.2","maxVersion":"*"}],"targetPlatforms":[],"multiprocessCompatible":true,"signedState":2,"seen":true,"dependencies":[],"hasEmbeddedWebExtension":false,"mpcOptedOut":false} 75 | 1500328563100 addons.xpi DEBUG Updating XPIState for 
{"id":"https-everywhere-eff@eff.org","syncGUID":"_DyCflCNgzyb","location":"app-profile","version":"5.2.19","type":"extension","internalName":null,"updateURL":"https://www.eff.org/files/https-everywhere-eff-update-2048.rdf","updateKey":"MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA6MR8W/galdxnpGqBsYbqOzQb2eyW15YFjDDEMI0ZOzt8f504obNs920lDnpPD2/KqgsfjOgw2K7xWDJIj/18xUvWPk3LDkrnokNiRkA3KOx3W6fHycKL+zID7zy+xZYBuh2fLyQtWV1VGQ45iNRp9+Zo7rH86cdfgkdnWTlNSHyTLW9NbXvyv/E12bppPcEvgCTAQXgnDVJ0/sqmeiijn9tTFh03aM+R2V/21h8aTraAS24qiPCz6gkmYGC8yr6mglcnNoYbsLNYZ69zF1XHcXPduCPdPdfLlzVlKK1/U7hkA28eG3BIAMh6uJYBRJTpiGgaGdPd7YekUB8S6cy+CQIDAQAB","optionsURL":"chrome://https-everywhere/content/observatory-preferences.xul","optionsType":null,"aboutURL":"chrome://https-everywhere/content/about.xul","icons":{},"iconURL":"chrome://https-everywhere/skin/icon-active-48.png","icon64URL":null,"defaultLocale":{"name":"HTTPS Everywhere","description":"Encrypt the Web! Automatically use HTTPS security on many sites.","creator":"EFF 
Technologists","homepageURL":"https://www.eff.org/https-everywhere"},"visible":true,"active":true,"userDisabled":false,"appDisabled":false,"descriptor":"/tmp/rust_mozprofile.65WNOk7gDoFe/extensions/https-everywhere-eff@eff.org.xpi","installDate":946684800000,"updateDate":1500328562000,"applyBackgroundUpdates":1,"bootstrap":false,"skinnable":false,"size":10786399,"sourceURI":null,"releaseNotesURI":null,"softDisabled":false,"foreignInstall":true,"hasBinaryComponents":false,"strictCompatibility":false,"locales":[],"targetApplications":[{"id":"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}","minVersion":"26.0","maxVersion":"40.0"},{"id":"{92650c4d-4b8e-4d2a-b7eb-24ecf4f6b63a}","minVersion":"2.17","maxVersion":"2.32"},{"id":"{3550f703-e582-4d05-9a08-453d09bdfdc6}","minVersion":"17.*","maxVersion":"35.*"},{"id":"{a79fe89b-6662-4ff4-8e88-09950ad4dfde}","minVersion":"0.1","maxVersion":"40.0"},{"id":"{aa3c5121-dab2-40e2-81ca-7ea25febc110}","minVersion":"26.0","maxVersion":"40.0"}],"targetPlatforms":[],"multiprocessCompatible":true,"signedState":2,"seen":true,"dependencies":[],"hasEmbeddedWebExtension":false,"mpcOptedOut":false} 76 | 1500328563100 addons.xpi DEBUG Updating XPIState for {"id":"e10srollout@mozilla.org","syncGUID":"{cae736cf-ae80-4149-a7b9-cc8900c200e4}","location":"app-system-defaults","version":"1.10","type":"extension","internalName":null,"updateURL":null,"updateKey":null,"optionsURL":null,"optionsType":null,"aboutURL":null,"icons":{},"iconURL":null,"icon64URL":null,"defaultLocale":{"name":"Multi-process staged rollout","description":"Staged rollout of Firefox multi-process 
feature.","creator":null,"homepageURL":null},"visible":true,"active":true,"userDisabled":false,"appDisabled":false,"descriptor":"/home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/browser/features/e10srollout@mozilla.org.xpi","installDate":1500326957000,"updateDate":1500326957000,"applyBackgroundUpdates":1,"bootstrap":true,"skinnable":false,"size":8265,"sourceURI":null,"releaseNotesURI":null,"softDisabled":false,"foreignInstall":false,"hasBinaryComponents":false,"strictCompatibility":false,"locales":[],"targetApplications":[{"id":"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}","minVersion":"52.2.0","maxVersion":"52.*"}],"targetPlatforms":[],"multiprocessCompatible":true,"seen":true,"dependencies":[],"hasEmbeddedWebExtension":false,"mpcOptedOut":false} 77 | 1500328563100 addons.xpi DEBUG Updating XPIState for {"id":"{972ce4c6-7e08-4474-a285-3208198ce6fd}","syncGUID":"0KIGzoXsj43B","location":"app-global","version":"52.2.0","type":"theme","internalName":"classic/1.0","updateURL":null,"updateKey":null,"optionsURL":null,"optionsType":null,"aboutURL":null,"icons":{"32":"icon.png","48":"icon.png"},"iconURL":null,"icon64URL":null,"defaultLocale":{"name":"Default","description":"The default theme.","creator":"Mozilla","homepageURL":null,"contributors":["Mozilla 
Contributors"]},"visible":true,"active":true,"userDisabled":false,"appDisabled":false,"descriptor":"/home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/browser/extensions/{972ce4c6-7e08-4474-a285-3208198ce6fd}.xpi","installDate":946684800000,"updateDate":1500326957000,"applyBackgroundUpdates":1,"skinnable":true,"size":4932,"sourceURI":null,"releaseNotesURI":null,"softDisabled":false,"foreignInstall":false,"hasBinaryComponents":false,"strictCompatibility":true,"locales":[],"targetApplications":[{"id":"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}","minVersion":"52.2.0","maxVersion":"52.2.0"}],"targetPlatforms":[],"seen":true,"dependencies":[],"hasEmbeddedWebExtension":false} 78 | 1500328563100 DeferredSave.extensions.json DEBUG Save changes 79 | 1500328563100 addons.xpi DEBUG Updating database with changes to installed add-ons 80 | 1500328563100 addons.xpi-utils DEBUG Updating add-on states 81 | 1500328563100 addons.xpi-utils DEBUG Writing add-ons list 82 | 1500328563100 addons.xpi DEBUG Registering manifest for /home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/browser/features/e10srollout@mozilla.org.xpi 83 | 1500328563100 addons.xpi DEBUG Loading bootstrap scope from /home/pankaj/wf_is/webfp-crawler-phantomjs/tor-browser-crawler-webfp-paper/tbb/tor-browser-linux64-6.5.2_en-US/Browser/browser/features/e10srollout@mozilla.org.xpi 84 | 1500328563100 addons.xpi DEBUG Calling bootstrap method startup on e10srollout@mozilla.org version 1.10 85 | 1500328563100 addons.manager DEBUG Registering shutdown blocker for XPIProvider 86 | 1500328563100 addons.manager DEBUG Provider finished startup: XPIProvider 87 | 1500328563100 addons.manager DEBUG Starting provider: LightweightThemeManager 88 | 1500328563100 addons.manager DEBUG Registering shutdown blocker for LightweightThemeManager 89 | 1500328563100 addons.manager DEBUG Provider finished startup: 
LightweightThemeManager 90 | 1500328563100 addons.manager DEBUG Starting provider: GMPProvider 91 | 1500328563100 addons.manager DEBUG Registering shutdown blocker for GMPProvider 92 | 1500328563100 addons.manager DEBUG Provider finished startup: GMPProvider 93 | 1500328563100 addons.manager DEBUG Starting provider: PluginProvider 94 | 1500328563100 addons.manager DEBUG Registering shutdown blocker for PluginProvider 95 | 1500328563100 addons.manager DEBUG Provider finished startup: PluginProvider 96 | 1500328563100 addons.manager DEBUG Completed startup sequence 97 | 1500328563800 Marionette INFO Listening on port 40791 98 | 1500328563800 DeferredSave.extensions.json DEBUG Starting timer 99 | 1500328563900 addons.manager DEBUG Starting provider: PreviousExperimentProvider 100 | 1500328563900 addons.manager DEBUG Registering shutdown blocker for PreviousExperimentProvider 101 | 1500328563900 addons.manager DEBUG Provider finished startup: PreviousExperimentProvider 102 | 1500328563900 DeferredSave.extensions.json DEBUG Starting write 103 | 1500328564000 Marionette WARN TLS certificate errors will be ignored for this session 104 | 1500328564000 DeferredSave.extensions.json DEBUG Write succeeded 105 | JavaScript error: chrome://torbutton/content/tor-circuit-display.js, line 363: TypeError: myController is null 106 | 1500328565200 addons.manager DEBUG Starting provider: 107 | 1500328565200 addons.manager DEBUG Registering shutdown blocker for 108 | 1500328565200 addons.manager DEBUG Provider finished startup: 109 | 1500328584700 addons.xpi DEBUG Calling bootstrap method shutdown on e10srollout@mozilla.org version 1.10 110 | [Parent 9720] WARNING: pipe error: Broken pipe: file /home/debian/build/tor-browser/ipc/chromium/src/chrome/common/ipc_channel_posix.cc, line 685 111 | JavaScript error: chrome://noscript/content/Main.js, line 148: TypeError: this._disposeE10s is not a function 112 | 1500328585000 addons.manager DEBUG shutdown 113 | 1500328585000 addons.manager 
DEBUG Calling shutdown blocker for XPIProvider 114 | 1500328585000 addons.xpi DEBUG shutdown 115 | 1500328585000 addons.xpi-utils DEBUG shutdown 116 | 1500328585000 addons.manager DEBUG Calling shutdown blocker for LightweightThemeManager 117 | 1500328585000 addons.manager DEBUG Calling shutdown blocker for GMPProvider 118 | 1500328585000 addons.manager DEBUG Calling shutdown blocker for PluginProvider 119 | 1500328585000 addons.manager DEBUG Calling shutdown blocker for PreviousExperimentProvider 120 | 1500328585000 addons.manager DEBUG Calling shutdown blocker for 121 | 1500328585000 addons.xpi DEBUG Notifying XPI shutdown observers 122 | 1500328585000 addons.manager DEBUG Async provider shutdown done 123 | 1500328640789 geckodriver INFO geckodriver 0.18.0 124 | 1500328640792 geckodriver INFO Listening on 127.0.0.1:46727 125 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/log.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | LOG_PREFIX = 'webfp' 5 | 6 | 7 | def reset_logger(logger): 8 | """Remove all the handlers for a logger.""" 9 | for handler in logger.handlers: 10 | if isinstance(handler, logging.FileHandler): 11 | handler.close() 12 | 13 | elif isinstance(handler, logging.StreamHandler): 14 | handler.flush() 15 | 16 | # print "****handler removed****" 17 | logger.removeHandler(handler) 18 | 19 | 20 | def add_log_file_handler(logger, filename): 21 | fh = logging.FileHandler(filename) 22 | frmt = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 23 | log_level = logger.getEffectiveLevel() # get global log level 24 | init_log_handler(fh, logger, log_level, frmt) 25 | 26 | 27 | def init_log_handler(handler, logger, level, frmt): 28 | """ Initialize log handler.""" 29 | handler.setLevel(level) 30 | handler.setFormatter(frmt) 31 | logger.addHandler(handler) 32 | 33 | 34 | def get_logger(logname, logtype='fc', 
level=logging.DEBUG, 35 | frmt=None, filename=''): 36 | """Create and return a logger with the given name. 37 | 38 | logtype f: file, c: console, fc: both 39 | 40 | """ 41 | logger = logging.getLogger(logname) 42 | logger.setLevel(level) 43 | frmt = frmt or logging.Formatter('%(asctime)s - \ 44 | %(levelname)s - %(message)s') 45 | 46 | if 'f' in logtype: 47 | log_filename = filename if filename else 'crawl.log' 48 | fh = logging.FileHandler(log_filename) 49 | 50 | init_log_handler(fh, logger, level, frmt) 51 | 52 | if 'c' in logtype: 53 | ch = logging.StreamHandler() 54 | init_log_handler(ch, logger, level, frmt) 55 | 56 | return logger 57 | 58 | 59 | def add_symlink(linkname, src_file): 60 | """Create a symbolic link pointing to src_file""" 61 | if os.path.lexists(linkname): # check and remove if link exists 62 | try: 63 | os.unlink(linkname) 64 | except: 65 | pass 66 | try: 67 | os.symlink(src_file, linkname) 68 | except: 69 | print "Cannot create symlink!" 70 | 71 | wl_log = get_logger(LOG_PREFIX, logtype='c') 72 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import traceback 3 | import logging 4 | import common as cm 5 | import sys 6 | from log import wl_log 7 | import utils as ut 8 | from datacollection.crawler import Crawler 9 | 10 | 11 | if __name__ == '__main__': 12 | # Parse arguments 13 | parser = argparse.ArgumentParser(description='Crawl a list of URLs \ 14 | in several batches.') 15 | # list of urls to be crawled 16 | parser.add_argument('-u', '--url-list', help='URL list file path') 17 | parser.add_argument('-b', '--browser-version', help="Tor browser's version" 18 | "used to crawl, possible values are: " 19 | "'wang_and_goldberg' (%s) or 'last_stable' " 20 | "(default: last_stable (%s))" 21 | % (cm.TBB_WANG_AND_GOLDBERG, cm.TBB_DEFAULT_VERSION), 22 | 
default=cm.TBB_DEFAULT_VERSION) 23 | parser.add_argument('-v', '--verbose', help='increase output verbosity', 24 | action='store_true') 25 | parser.add_argument("-e", "--experiment", help="Experiment type. Possible" 26 | " values are: 'wang_and_goldberg', 'multitab_alexa'") 27 | 28 | # For understanding batch and instance parameters please refer to Wang and 29 | # Goldberg WPES'13 paper, Section 4.1.4 30 | parser.add_argument('--batch', help='Number of batches (default: %s)' 31 | % cm.NUM_BATCHES, default=cm.NUM_BATCHES) 32 | parser.add_argument('--instance', help='Number of instances (default: %s)' 33 | % cm.NUM_INSTANCES, default=cm.NUM_INSTANCES) 34 | 35 | parser.add_argument('--start', help='Crawl URLs after this line (1)') 36 | parser.add_argument('--stop', help='Crawl URLs until this line') 37 | parser.add_argument('--action', help='Type of action: crawl, pack_data') 38 | parser.add_argument('-i', '--input', help='Input data (crawl dir, etc. )') 39 | parser.add_argument('-x', '--xvfb', help='Use XVFB (for headless testing)', 40 | action='store_true', default=False) 41 | parser.add_argument('-c', '--capture-screen', 42 | help='Capture page screenshots', 43 | action='store_true', default=False) 44 | 45 | args = parser.parse_args() 46 | action = args.action 47 | if action == "pack_data": 48 | path = args.input 49 | ut.pack_crawl_data(path) 50 | sys.exit(0) 51 | 52 | url_list_path = args.url_list 53 | verbose = args.verbose 54 | tbb_version = args.browser_version 55 | experiment = args.experiment 56 | no_of_batches = int(args.batch) 57 | no_of_instances = int(args.instance) 58 | start_line = int(args.start) if args.start else 1 59 | stop_line = int(args.stop) if args.stop else 999999999999 60 | xvfb = args.xvfb 61 | capture_screen = args.capture_screen 62 | if verbose: 63 | wl_log.setLevel(logging.DEBUG) 64 | else: 65 | wl_log.setLevel(logging.INFO) 66 | 67 | # Validate the given arguments 68 | # Read urls 69 | url_list = [] 70 | import os 71 | if not 
url_list_path or not os.path.isfile(url_list_path): 72 | ut.die("ERROR: No URL list given!" 73 | "Run the following to get help: python main --help") 74 | else: 75 | try: 76 | with open(url_list_path) as f: 77 | url_list = f.read().splitlines()[start_line - 1:stop_line] 78 | except Exception as e: 79 | ut.die("Error opening file: {} \n{}" 80 | .format(e, traceback.format_exc())) 81 | 82 | if experiment == cm.EXP_TYPE_WANG_AND_GOLDBERG: 83 | torrc_dict = cm.TORRC_WANG_AND_GOLDBERG 84 | elif experiment == cm.EXP_TYPE_MULTITAB_ALEXA: 85 | torrc_dict = cm.TORRC_DEFAULT 86 | else: 87 | ut.die("Experiment type is not recognized." 88 | " Use --help to see the possible values.") 89 | 90 | if not tbb_version: 91 | # Assign the last stable version of TBB 92 | tbb_version = cm.TBB_DEFAULT_VERSION 93 | elif tbb_version not in cm.TBB_KNOWN_VERSIONS: 94 | ut.die("Version of Tor browser is not recognized." 95 | " Use --help to see which are the accepted values.") 96 | 97 | crawler = Crawler(torrc_dict, url_list, tbb_version, 98 | experiment, xvfb, capture_screen) 99 | wl_log.info("Command line parameters: %s" % sys.argv) 100 | 101 | # Run the crawl 102 | try: 103 | crawler.crawl(no_of_batches, no_of_instances, 104 | start_line=start_line - 1) 105 | except KeyboardInterrupt: 106 | wl_log.warning("Keyboard interrupt! 
Quitting...") 107 | except Exception as e: 108 | wl_log.error("Exception: \n%s" 109 | % (traceback.format_exc())) 110 | finally: 111 | crawler.stop_crawl(pack_results=False) 112 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | stem 3 | psutil 4 | xvfbwrapper 5 | selenium 6 | tld 7 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import common as cm 3 | import subprocess 4 | import utils as ut 5 | from log import wl_log 6 | 7 | 8 | def get_tbb_filename(tbb_ver): 9 | if int(tbb_ver.split(".")[0]) <= 2: 10 | file_name = 'tor-browser-gnu-linux-%s-%s-dev-en-US.tar.gz' %\ 11 | (cm.machine, tbb_ver) 12 | else: 13 | file_name = 'tor-browser-linux%s-%s_en-US.tar.xz' % (cm.arch, tbb_ver) 14 | return file_name 15 | 16 | 17 | def get_tbb_base_url(tbb_ver): 18 | archive_url = "https://archive.torproject.org/tor-package-archive/torbrowser/" # noqa 19 | if int(tbb_ver.split(".")[0]) <= 2: 20 | base_url = "%slinux/" % (archive_url) 21 | else: 22 | base_url = "%s%s/" % (archive_url, tbb_ver) 23 | return base_url 24 | 25 | 26 | def get_url_by_tbb_ver(tbb_ver): 27 | base_url = get_tbb_base_url(tbb_ver) 28 | tbb_tarball_url = "%s%s" % (base_url, get_tbb_filename(tbb_ver)) 29 | return tbb_tarball_url 30 | 31 | 32 | def download_tbb_tarball(tbb_ver, dl_dir=""): 33 | tbb_url = get_url_by_tbb_ver(tbb_ver) 34 | base_dir = dl_dir if dl_dir else cm.TBB_BASE_DIR 35 | tarball_path = os.path.join(base_dir, get_tbb_filename(tbb_ver)) 36 | if not os.path.isfile(tarball_path): 37 | wl_log.info("Will download %s to %s" % (tbb_url, tarball_path)) 38 | ut.download_file(tbb_url, tarball_path) 39 | ut.extract_tbb_tarball(tarball_path) 40 | if 
verify_tbb_tarball(tbb_ver, tarball_path, tbb_url): 41 | return tarball_path 42 | # we cannot verify the integrity of the downloaded tarball 43 | raise cm.TBBTarballVerificationError("Cannot verify the integrity of %s" 44 | % tarball_path) 45 | 46 | 47 | def verify_tbb_sig(sig_file): 48 | """Verify the given signature file using gpg --verify.""" 49 | ret_code = subprocess.Popen(['/usr/bin/gpg', 50 | '--verify', sig_file]).wait() 51 | return True if ret_code == 0 else False 52 | 53 | 54 | def verify_tbb_tarball(tbb_ver, tarball_path, tbb_url): 55 | tarball_filename = get_tbb_filename(tbb_ver) 56 | tarball_sha_sum = ut.sha_256_sum_file(tarball_path).lower() 57 | sha_sum_url = "%s%s" % (get_tbb_base_url(tbb_ver), "sha256sums.txt") 58 | sha_sum_path = "%s%s" % (tarball_path, ".sha256sums.txt") 59 | sha_sum_sig_url = "%s%s" % (sha_sum_url, ".asc") 60 | sha_sum_sig_path = "%s%s" % (sha_sum_path, ".asc") 61 | if not os.path.isfile(sha_sum_path): 62 | ut.download_file(sha_sum_url, sha_sum_path) 63 | if not os.path.isfile(sha_sum_sig_path): 64 | ut.download_file(sha_sum_sig_url, sha_sum_sig_path) 65 | 66 | if not verify_tbb_sig(sha_sum_sig_path): 67 | return False 68 | 69 | # https://github.com/micahflee/torbrowser-launcher/blob/3f1146e1a084c4e8021da968104cbc2877ae01e6/torbrowser_launcher/launcher.py#L560 70 | for line in ut.gen_read_lines(sha_sum_path): 71 | if tarball_sha_sum in line.lower() and tarball_filename in line: 72 | return True 73 | return False 74 | 75 | 76 | def import_gpg_key(key_fp): 77 | """Import GPG key with the given fingerprint.""" 78 | wl_log.info("Will import the GPG key %s" % key_fp) 79 | # https://www.torproject.org/docs/verifying-signatures.html.en 80 | ret_code = subprocess.Popen(['/usr/bin/gpg', '--keyserver', 81 | 'x-hkp://pool.sks-keyservers.net', 82 | '--recv-keys', key_fp]).wait() 83 | return True if ret_code == 0 else False 84 | 85 | 86 | def import_tbb_signing_keys(): 87 | """Import signing GPG keys for TBB.""" 88 | tbb_devs_key = '0x4E2C6E8793298290' 89 | erinns_key = 
'0x416F061063FEE659' # old key 90 | if import_gpg_key(tbb_devs_key) and import_gpg_key(erinns_key): 91 | return True 92 | else: 93 | raise cm.TBBSigningKeyImportError("Cannot import TBB signing keys") 94 | 95 | 96 | def get_recommended_tbb_version(): 97 | """Get the recommended TBB version from RecommendedTBBVersions file.""" 98 | tbb_versions_url = "https://www.torproject.org/projects/torbrowser/RecommendedTBBVersions" # noqa 99 | versions = ut.read_url(tbb_versions_url) 100 | for line in versions.split(): 101 | if "Linux" in line: 102 | return line.split("-")[0].lstrip('"') 103 | raise cm.TBBGetRecommendedVersionError() 104 | 105 | 106 | def setup_env(): 107 | """Initialize the tbb directory and import TBB signing keys. 108 | 109 | Download recommended TBB version and verify it. 110 | """ 111 | import_tbb_signing_keys() 112 | ut.create_dir(cm.TBB_BASE_DIR) 113 | tbb_rec_ver = get_recommended_tbb_version() 114 | download_tbb_tarball(tbb_rec_ver) 115 | 116 | if __name__ == '__main__': 117 | setup_env() 118 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/start.sh: -------------------------------------------------------------------------------- 1 | #Set the MTU to 1500 2 | sudo ifconfig eth0 mtu 1500 3 | 4 | #Disable offloads 5 | sudo ethtool -K eth0 tx off rx off tso off gso off gro off lro off 6 | 7 | #Run the main script 8 | python ./main.py -u ./etc/urls-100-top.csv -e multitab_alexa 9 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/test/common_test.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname, realpath, join 2 | import sys 3 | import unittest 4 | sys.path.append(dirname(dirname(realpath(__file__)))) 5 | import common as cm 6 | 7 | 8 | class Test(unittest.TestCase): 9 | 10 | def test_get_tbb_major_version(self): 11 | ver_dict = {"2.3.25-15": "2", 12 | "3.5": 
"3", 13 | "4.0.8": "4", 14 | "10.0.8": "10" 15 | } 16 | for version, major_v in ver_dict.iteritems(): 17 | self.assert_(cm.get_tbb_major_version(version) == major_v) 18 | 19 | def test_get_tbb_dirname(self): 20 | self.assert_(cm.get_tbb_dirname("2.3.25-16") == 21 | "tor-browser-linux%s-2.3.25-16_en-US" % cm.arch) 22 | self.assert_(cm.get_tbb_dirname(cm.TBB_V_3_5) == 23 | "tor-browser-linux%s-3.5_en-US" % cm.arch) 24 | self.assert_(cm.get_tbb_dirname(cm.TBB_V_3_5, "linux") == 25 | "tor-browser-linux%s-3.5_en-US" % cm.arch) 26 | self.assert_(cm.get_tbb_dirname(cm.TBB_V_3_5, lang="en-US") == 27 | "tor-browser-linux%s-3.5_en-US" % cm.arch) 28 | 29 | def test_get_tb_bin_path(self): 30 | ver_str = "2.3.25-16" 31 | tb_bin_path_v2_3_25_16 = cm.get_tb_bin_path(ver_str) 32 | self.assert_(ver_str in tb_bin_path_v2_3_25_16) 33 | self.assert_(join('App', 'Firefox', 'firefox') in 34 | tb_bin_path_v2_3_25_16) 35 | 36 | tb_bin_path_v3_5 = cm.get_tb_bin_path(cm.TBB_V_3_5) 37 | self.assert_(cm.TBB_V_3_5 in tb_bin_path_v3_5) 38 | self.assert_(join('Browser', 'firefox') in tb_bin_path_v3_5) 39 | 40 | tb_bin_path_V_4_0_8 = cm.get_tb_bin_path(cm.TBB_V_4_0_8) 41 | self.assert_(cm.TBB_V_4_0_8 in tb_bin_path_V_4_0_8) 42 | self.assert_(join('Browser', 'firefox') in tb_bin_path_V_4_0_8) 43 | 44 | self.assert_(cm.TBB_BASE_DIR in tb_bin_path_v2_3_25_16) 45 | self.assert_(cm.TBB_BASE_DIR in tb_bin_path_v3_5) 46 | self.assert_(cm.TBB_BASE_DIR in tb_bin_path_V_4_0_8) 47 | 48 | def test_get_tbb_profile_path(self): 49 | ver_str = "2.3.25-16" 50 | tbb_prof_path_v2_3_25_16 = cm.get_tbb_profile_path(ver_str) 51 | self.assert_(cm.TBB_V2_PROFILE_PATH in tbb_prof_path_v2_3_25_16) 52 | self.assert_(cm.get_tbb_dirname(ver_str) in 53 | tbb_prof_path_v2_3_25_16) 54 | self.assert_(cm.TBB_BASE_DIR in tbb_prof_path_v2_3_25_16) 55 | 56 | tbb_prof_path_v3_5 = cm.get_tbb_profile_path(cm.TBB_V_3_5) 57 | self.assert_(cm.TBB_V3_PROFILE_PATH in tbb_prof_path_v3_5) 58 | self.assert_(cm.get_tbb_dirname(cm.TBB_V_3_5) in 
59 | tbb_prof_path_v3_5) 60 | self.assert_(cm.TBB_BASE_DIR in tbb_prof_path_v3_5) 61 | 62 | tbb_prof_path_v4_0_8 = cm.get_tbb_profile_path(cm.TBB_V_4_0_8) 63 | self.assert_(cm.TBB_V3_PROFILE_PATH in tbb_prof_path_v4_0_8) 64 | self.assert_(cm.get_tbb_dirname(cm.TBB_V_4_0_8) in 65 | tbb_prof_path_v4_0_8) 66 | self.assert_(cm.TBB_BASE_DIR in tbb_prof_path_v4_0_8) 67 | 68 | def test_get_tbb_data_dir_path(self): 69 | ver_str = "2.3.25-16" 70 | tor_data_path_v2_3_25_16 = cm.get_tor_data_path(ver_str) 71 | self.assert_(ver_str in tor_data_path_v2_3_25_16) 72 | self.assert_(join('Data', 'Tor') in 73 | tor_data_path_v2_3_25_16) 74 | self.assert_(cm.TBB_BASE_DIR in tor_data_path_v2_3_25_16) 75 | self.assert_(cm.get_tbb_dirname(ver_str) in 76 | tor_data_path_v2_3_25_16) 77 | 78 | tor_data_path_v3_5 = cm.get_tor_data_path(cm.TBB_V_3_5) 79 | self.assert_(cm.TBB_V_3_5 in tor_data_path_v3_5) 80 | self.assert_(join('Data', 'Tor') in 81 | tor_data_path_v3_5) 82 | self.assert_(cm.TBB_BASE_DIR in tor_data_path_v3_5) 83 | self.assert_(cm.get_tbb_dirname(cm.TBB_V_3_5) in 84 | tor_data_path_v3_5) 85 | 86 | tor_data_path_v4_0_8 = cm.get_tor_data_path(cm.TBB_V_4_0_8) 87 | self.assert_(cm.TBB_V_4_0_8 in tor_data_path_v4_0_8) 88 | self.assert_(join('Browser', 'TorBrowser', 'Data', 'Tor') in 89 | tor_data_path_v4_0_8) 90 | self.assert_(cm.TBB_BASE_DIR in tor_data_path_v4_0_8) 91 | self.assert_(cm.get_tbb_dirname(cm.TBB_V_4_0_8) in 92 | tor_data_path_v4_0_8) 93 | 94 | def test_get_tor_bin_path(self): 95 | ver_str = "2.3.25-16" 96 | tor_bin_path_v2_3_25_16 = cm.get_tor_bin_path(ver_str) 97 | self.assert_(cm.TBB_BASE_DIR in tor_bin_path_v2_3_25_16) 98 | self.assert_(cm.TOR_V2_BINARY_PATH in tor_bin_path_v2_3_25_16) 99 | self.assert_(cm.get_tbb_dirname(ver_str) in 100 | tor_bin_path_v2_3_25_16) 101 | 102 | tor_bin_path_v3_5 = cm.get_tor_bin_path(cm.TBB_V_3_5) 103 | self.assert_(cm.TBB_BASE_DIR in tor_bin_path_v3_5) 104 | self.assert_(cm.TOR_V3_BINARY_PATH in tor_bin_path_v3_5) 105 | 
self.assert_(cm.get_tbb_dirname(cm.TBB_V_3_5) in 106 | tor_bin_path_v3_5) 107 | 108 | tor_bin_path_v4_0_8 = cm.get_tor_bin_path(cm.TBB_V_4_0_8) 109 | self.assert_(cm.TBB_BASE_DIR in tor_bin_path_v4_0_8) 110 | self.assert_(cm.TOR_V4_BINARY_PATH in tor_bin_path_v4_0_8) 111 | self.assert_(cm.get_tbb_dirname(cm.TBB_V_4_0_8) in 112 | tor_bin_path_v4_0_8) 113 | 114 | if __name__ == "__main__": 115 | unittest.main() 116 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/test/crawler_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | import shutil 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 6 | from datacollection.crawler import Crawler 7 | import common as cm 8 | 9 | TEST_URL_LIST = ['https://www.google.de', 10 | 'https://torproject.org', 11 | 'https://firstlook.org/theintercept/'] 12 | 13 | 14 | class Test(unittest.TestCase): 15 | 16 | def test_crawl(self): 17 | # this test takes at least a few minutes to finish 18 | crawler = Crawler(cm.TORRC_WANG_AND_GOLDBERG, TEST_URL_LIST, 19 | cm.TBB_DEFAULT_VERSION, capture_screen=True) 20 | try: 21 | crawler.crawl(1, 1) # we can pass batch and instance numbers 22 | except Exception as e: 23 | self.fail("It raised an exception: %s" % e) 24 | self.assertTrue(os.path.isdir(crawler.crawl_dir)) 25 | self.assertTrue(os.path.isdir(crawler.crawl_logs_dir)) 26 | self.assertTrue(os.path.isfile(crawler.log_file)) 27 | self.assertTrue(os.path.isfile(crawler.tor_log)) 28 | self.assertEqual(crawler.experiment, cm.EXP_TYPE_WANG_AND_GOLDBERG) 29 | self.assertListEqual(crawler.urls, TEST_URL_LIST) 30 | self.assertEqual(crawler.tbb_version, cm.TBB_DEFAULT_VERSION) 31 | self.assertFalse(crawler.xvfb) 32 | crawler.stop_crawl(pack_results=True) 33 | tar_gz_crawl_data = crawler.crawl_dir + ".tar.gz" 34 | self.assertTrue(os.path.isfile(tar_gz_crawl_data)) 35 | 
shutil.rmtree(crawler.crawl_dir) 36 | os.remove(tar_gz_crawl_data) 37 | 38 | if __name__ == "__main__": 39 | unittest.main() 40 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/test/dumputils_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | from urllib2 import urlopen 5 | import time 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 7 | from datacollection.dumputils import Sniffer 8 | import common as cm 9 | 10 | TEST_CAP_FILTER = 'host 255.255.255.255' 11 | TEST_PCAP_PATH = os.path.join(cm.TEST_FILES_DIR, 'test.pcap') 12 | 13 | 14 | class SnifferTest(unittest.TestCase): 15 | 16 | def setUp(self): 17 | self.snf = Sniffer() 18 | 19 | def tearDown(self): 20 | pass 21 | 22 | def test_is_dumpcap_running(self): 23 | self.snf.set_pcap_path(TEST_PCAP_PATH) 24 | self.snf.start_capture() 25 | self.assertTrue(self.snf.is_dumpcap_running()) 26 | self.snf.stop_capture() 27 | if os.path.isfile(TEST_PCAP_PATH): 28 | os.remove(TEST_PCAP_PATH) 29 | 30 | def test_default_cap_filter(self): 31 | self.assertTrue(self.snf.get_capture_filter() == '') 32 | 33 | def test_default_pcap_path(self): 34 | self.assertTrue(self.snf.get_pcap_path() == '/dev/null') 35 | 36 | def test_set_pcap_path(self): 37 | self.snf.set_pcap_path(TEST_PCAP_PATH) 38 | self.assertTrue(TEST_PCAP_PATH == self.snf.get_pcap_path(), 39 | "Sniffer pcap path cannot be set %s %s" 40 | % (TEST_PCAP_PATH, self.snf.get_pcap_path())) 41 | 42 | def test_set_capture_filter(self): 43 | self.snf.set_capture_filter(TEST_CAP_FILTER) 44 | self.assertTrue(TEST_CAP_FILTER == self.snf.get_capture_filter(), 45 | "Sniffer filter cannot be set %s %s" 46 | % (TEST_CAP_FILTER, self.snf.get_capture_filter())) 47 | 48 | def test_start_capture(self): 49 | if os.path.isfile(TEST_PCAP_PATH): 50 | os.remove(TEST_PCAP_PATH) 51 | 
self.snf.set_pcap_path(TEST_PCAP_PATH) 52 | self.snf.start_capture() 53 | time.sleep(1) 54 | f = urlopen("https://torproject.org/", timeout=10) 55 | self.assertTrue(f) 56 | self.snf.stop_capture() 57 | # TODO investigate why we cannot capture on CI 58 | if not cm.running_in_CI: 59 | self.assertTrue(os.path.isfile(TEST_PCAP_PATH), 60 | "Cannot find pcap file") 61 | self.assertGreater(os.path.getsize(TEST_PCAP_PATH), 0) 62 | os.remove(TEST_PCAP_PATH) 63 | 64 | if __name__ == "__main__": 65 | unittest.main() 66 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/test/env_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | import commands 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 6 | import common as cm 7 | 8 | 9 | class Test(unittest.TestCase): 10 | def assert_py_pkg_installed(self, pkg_name): 11 | try: 12 | __import__(pkg_name) 13 | except: 14 | self.fail('Cannot find python package.\ 15 | Install it by sudo pip install %s' % pkg_name) 16 | 17 | def run_cmd(self, cmd): 18 | return commands.getstatusoutput('%s ' % cmd) 19 | 20 | def assert_installed(self, pkg_name, msg=""): 21 | cmd = 'which %s' % pkg_name 22 | status, _ = self.run_cmd(cmd) 23 | self.assertFalse(status, "%s is not installed." 
24 | "Install it with sudo apt-get install %s" % 25 | (pkg_name, pkg_name)) 26 | 27 | def test_dumpcap(self): 28 | self.assert_installed('dumpcap') 29 | 30 | def test_xvfb(self): 31 | self.assert_installed('Xvfb') 32 | 33 | def test_stem(self): 34 | self.assert_py_pkg_installed('stem') 35 | 36 | def test_psutil(self): 37 | self.assert_py_pkg_installed('psutil') 38 | 39 | def test_xvfbwrapper(self): 40 | self.assert_py_pkg_installed('xvfbwrapper') 41 | 42 | def test_argparse(self): 43 | self.assert_py_pkg_installed('argparse') 44 | 45 | def test_requests(self): 46 | self.assert_py_pkg_installed('requests') 47 | 48 | def test_webfp_path(self): 49 | self.assertTrue(os.path.isdir(cm.BASE_DIR), 50 | 'Cannot find base dir path %s' % cm.BASE_DIR) 51 | 52 | def test_tb_bin_path(self): 53 | tb_bin_path = cm.get_tb_bin_path(version=cm.TBB_DEFAULT_VERSION) 54 | self.assertTrue(os.path.isfile(tb_bin_path), 55 | 'Cannot find Tor Browser binary path %s' 56 | % tb_bin_path) 57 | 58 | def test_tbb_profile_path(self): 59 | tbb_profile_path = cm.get_tbb_profile_path(cm.TBB_DEFAULT_VERSION) 60 | self.assertTrue(os.path.isdir(tbb_profile_path), 61 | 'Cannot find Tor Browser profile dir %s' 62 | % tbb_profile_path) 63 | 64 | def test_selenium(self): 65 | self.assert_py_pkg_installed('selenium') 66 | 67 | def test_py_selenium_version(self): 68 | import selenium 69 | pkg_ver = selenium.__version__ 70 | err_msg = "Python Selenium package should be greater than 2.45.0" 71 | min_v = 2 72 | min_minor_v = 45 73 | min_micro_v = 0 74 | version, minor_v, micro_v = pkg_ver.split('.') 75 | self.assertGreaterEqual(version, min_v, err_msg) 76 | if version == min_v: 77 | self.assertGreaterEqual(minor_v, min_minor_v, err_msg) 78 | if minor_v == min_minor_v: 79 | self.assertGreaterEqual(micro_v, min_micro_v, err_msg) 80 | 81 | 82 | if __name__ == "__main__": 83 | unittest.main() 84 | -------------------------------------------------------------------------------- 
/tor-browser-crawler-webfp-paper/test/setup_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import sys 4 | sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 5 | import setup as se 6 | import common as cm 7 | arch = cm.arch 8 | machine = cm.machine 9 | from shutil import rmtree 10 | 11 | 12 | class Test(unittest.TestCase): 13 | 14 | def setUp(self): 15 | pass 16 | 17 | def tearDown(self): 18 | pass 19 | 20 | def test_get_url_tbb_ver(self): 21 | wang_and_goldberg_url = "https://archive.torproject.org/tor-package-archive/torbrowser/linux/tor-browser-gnu-linux-%s-2.4.7-alpha-1-dev-en-US.tar.gz" % machine # noqa 22 | self.assertEqual(se.get_url_by_tbb_ver(cm.TBB_WANG_AND_GOLDBERG), 23 | wang_and_goldberg_url) 24 | 25 | tbb_3_5_url = "https://archive.torproject.org/tor-package-archive/torbrowser/3.5/tor-browser-linux%s-3.5_en-US.tar.xz" % arch # noqa 26 | self.assertEqual(se.get_url_by_tbb_ver(cm.TBB_V_3_5), 27 | tbb_3_5_url) 28 | tbb_4_0_8_url = "https://archive.torproject.org/tor-package-archive/torbrowser/4.0.8/tor-browser-linux%s-4.0.8_en-US.tar.xz" % arch # noqa 29 | self.assertEqual(se.get_url_by_tbb_ver(cm.TBB_V_4_0_8), 30 | tbb_4_0_8_url) 31 | 32 | def test_get_tbb_filename(self): 33 | tbb_2_4_7A1 = "tor-browser-gnu-linux-%s-2.4.7-alpha-1-dev-en-US.tar.gz"\ 34 | % machine 35 | self.assertEqual(se.get_tbb_filename(cm.TBB_WANG_AND_GOLDBERG), 36 | tbb_2_4_7A1) 37 | self.assertEqual(se.get_tbb_filename(cm.TBB_V_3_5), 38 | "tor-browser-linux%s-3.5_en-US.tar.xz" % arch) 39 | self.assertEqual(se.get_tbb_filename(cm.TBB_V_4_0_8), 40 | "tor-browser-linux%s-4.0.8_en-US.tar.xz" % arch) 41 | 42 | def test_download_tbb_tarball(self): 43 | try: 44 | tbb_4_0_8_path = se.download_tbb_tarball(cm.TBB_V_4_0_8, 45 | dl_dir=cm.TEST_FILES_DIR) 46 | except cm.TBBTarballVerificationError as e: 47 | self.fail(e.message) 48 | os.path.isfile(tbb_4_0_8_path) 49 | 
tbb_4_0_8_path.endswith("tar.xz") 50 | sha_sum_path = "%s.%s" % (tbb_4_0_8_path, "sha256sums.txt") 51 | sha_sum_sig_path = "%s%s" % (sha_sum_path, ".asc") 52 | os.remove(tbb_4_0_8_path) 53 | os.remove(sha_sum_sig_path) 54 | os.remove(sha_sum_path) 55 | rmtree(tbb_4_0_8_path.split(".tar")[0]) 56 | 57 | def test_get_recommended_tbb_version(self): 58 | rec_ver = se.get_recommended_tbb_version() 59 | self.assertGreaterEqual(rec_ver.split('.')[0], 4) 60 | 61 | def test_import_tbb_signing_keys(self): 62 | try: 63 | se.import_tbb_signing_keys() 64 | except cm.TBBSigningKeyImportError as e: 65 | self.fail(e.message) 66 | 67 | def test_import_gpg_key(self): 68 | self.assertFalse(se.import_gpg_key("0xNONHEXADECIMAL")) 69 | 70 | def test_verify_tbb_signature(self): 71 | GOOD_SIG = "tor-browser-linux64-4.0.99_en-US.tar.xz.sha256sums.txt.asc" 72 | BAD_SIG = "bad_sig_tor-browser-linux64-4.0.99_en-US.tar.xz.sha256sums.txt.asc" # noqa 73 | self.assertTrue(se.verify_tbb_sig(os.path.join(cm.TEST_FILES_DIR, GOOD_SIG))) # noqa 74 | self.assertFalse(se.verify_tbb_sig(os.path.join(cm.TEST_FILES_DIR, BAD_SIG))) # noqa 75 | 76 | if __name__ == "__main__": 77 | unittest.main() 78 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/test/torutils_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import unittest 5 | from selenium import webdriver 6 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 7 | from selenium.webdriver.support.ui import WebDriverWait 8 | from selenium.webdriver.support import expected_conditions as EC 9 | sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 10 | import common as cm 11 | from utils import get_hash_of_directory 12 | from datacollection.torutils import TorBrowserDriver 13 | from datacollection.torutils import TorController 14 | 15 | # Test URLs are 
taken from the TBB test suit 16 | # https://gitweb.torproject.org/boklm/tor-browser-bundle-testsuite.git/tree/mozmill-tests/tbb-tests/https-everywhere.js 17 | HTTP_URL = "http://httpbin.org/" 18 | HTTPS_URL = "https://httpbin.org/" 19 | 20 | 21 | class TestTorUtils(unittest.TestCase): 22 | 23 | @classmethod 24 | def setUpClass(cls): 25 | cls.tor_controller = TorController(cm.TORRC_WANG_AND_GOLDBERG, 26 | cm.TBB_DEFAULT_VERSION) 27 | cls.tor_process = cls.tor_controller.launch_tor_service() 28 | 29 | def test_launch_tor_service(self): 30 | self.tor_process.kill() 31 | self.tor_process = self.tor_controller.launch_tor_service() 32 | self.assertTrue(self.tor_process, 'Cannot launch Tor process') 33 | 34 | def test_tb_orig_profile_not_modified(self): 35 | """Visiting a site should not modify the original profile contents.""" 36 | tbb_profile_dir = cm.get_tbb_profile_path(cm.TBB_DEFAULT_VERSION) 37 | profile_hash_before = get_hash_of_directory(tbb_profile_dir) 38 | tb_driver = TorBrowserDriver() 39 | tb_driver.get("http://check.torproject.org") 40 | tb_driver.quit() 41 | profile_hash_after = get_hash_of_directory(tbb_profile_dir) 42 | assert(profile_hash_after == profile_hash_before) 43 | 44 | def test_tb_drv_simple_visit(self): 45 | tb_driver = TorBrowserDriver() 46 | tb_driver.get("http://check.torproject.org") 47 | tb_driver.implicitly_wait(60) 48 | h1_on = tb_driver.find_element_by_css_selector("h1.on") 49 | self.assertTrue(h1_on) 50 | tb_driver.quit() 51 | 52 | def test_tb_extensions(self): 53 | tb_driver = TorBrowserDriver() 54 | # test HTTPS Everywhere 55 | tb_driver.get(HTTP_URL) 56 | time.sleep(1) 57 | try: 58 | WebDriverWait(tb_driver, 60).until( 59 | EC.title_contains("httpbin") 60 | ) 61 | except TimeoutException: 62 | self.fail("The title should contain httpbin") 63 | self.assertEqual(tb_driver.current_url, HTTPS_URL) 64 | # NoScript should disable WebGL 65 | webgl_test_url = "https://developer.mozilla.org/samples/webgl/sample1/index.html" # noqa 66 | 
tb_driver.get(webgl_test_url) 67 | try: 68 | WebDriverWait(tb_driver, 60).until( 69 | EC.alert_is_present() 70 | ) 71 | except TimeoutException: 72 | self.fail("WebGL error alert should be present") 73 | tb_driver.switch_to_alert().dismiss() 74 | tb_driver.implicitly_wait(30) 75 | el = tb_driver.find_element_by_class_name("__noscriptPlaceholder__") 76 | self.assertTrue(el) 77 | # sanity check for the above test 78 | self.assertRaises(NoSuchElementException, 79 | tb_driver.find_element_by_class_name, 80 | "__nosuch_class_exist") 81 | tb_driver.quit() 82 | 83 | def test_https_everywhere_disabled(self): 84 | """Test to make sure the HTTP->HTTPS redirection observed in the 85 | 86 | previous test (test_tb_extensions) is really due to HTTPSEverywhere - 87 | but not because the site is HTTPS by default. See, the following: 88 | https://gitweb.torproject.org/boklm/tor-browser-bundle-testsuite.git/tree/mozmill-tests/tbb-tests/https-everywhere-disabled.js 89 | """ 90 | 91 | ff_driver = webdriver.Firefox() 92 | ff_driver.get(HTTP_URL) 93 | time.sleep(1) 94 | # make sure it doesn't redirect to https 95 | self.assertEqual(ff_driver.current_url, HTTP_URL) 96 | ff_driver.quit() 97 | 98 | def test_close_all_streams(self): 99 | streams_open = False 100 | new_tb_drv = TorBrowserDriver() 101 | new_tb_drv.get('http://www.google.com') 102 | time.sleep(cm.WAIT_IN_SITE) 103 | self.tor_controller.close_all_streams() 104 | for stream in self.tor_controller.controller.get_streams(): 105 | print stream.id, stream.purpose, stream.target_address, "open!" 
106 | streams_open = True 107 | new_tb_drv.quit() 108 | self.assertFalse(streams_open, 'Could not close all streams.') 109 | 110 | @classmethod 111 | def tearDownClass(cls): 112 | # cls.tor_process.kill() 113 | cls.tor_controller.kill_tor_proc() 114 | 115 | if __name__ == "__main__": 116 | unittest.main() 117 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/test/utils_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | from time import sleep 5 | import commands as cmds 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 7 | import utils as ut 8 | import common as cm 9 | from shutil import rmtree 10 | from tld import get_tld 11 | 12 | 13 | class UtilsTest(unittest.TestCase): 14 | 15 | def test_get_filename_from_url(self): 16 | filename = ut.get_filename_from_url("http://google.com", 0) 17 | self.assertEqual("0-google.com", filename) 18 | filename = ut.get_filename_from_url("https://yahoo.com", 99) 19 | self.assertEqual("99-yahoo.com", filename) 20 | filename = ut.get_filename_from_url("https://123abc.com/somepath", 999) 21 | self.assertEqual("999-123abc.com-somepath", filename) 22 | filename = ut.get_filename_from_url( 23 | "https://123abc.com/somepath/", 123) 24 | self.assertEqual("123-123abc.com-somepath-", filename) 25 | filename = ut.get_filename_from_url( 26 | "https://123abc.com/somepath/q=query&q2=q2", 234) 27 | self.assertEqual("234-123abc.com-somepath-q-query-q2-q2", filename) 28 | 29 | def test_timeout(self): 30 | ut.timeout(1) 31 | try: 32 | sleep(1.1) 33 | except ut.TimeExceededError: 34 | pass # this is what we want 35 | else: 36 | self.fail("Cannot set timeout") 37 | 38 | def test_cancel_timeout(self): 39 | ut.timeout(1) 40 | ut.cancel_timeout() 41 | try: 42 | sleep(1.1) 43 | except ut.TimeExceededError: 44 | self.fail("Cannot cancel timeout") 45 | 46 | def 
test_pack_crawl_data(self): 47 | self.assertTrue(ut.pack_crawl_data(cm.DUMMY_TEST_DIR)) 48 | self.assertTrue(os.path.isfile(cm.DUMMY_TEST_DIR_TARGZIPPED)) 49 | 50 | cmd = 'file "%s"' % cm.DUMMY_TEST_DIR_TARGZIPPED # linux file command 51 | status, cmd_out = cmds.getstatusoutput(cmd) 52 | if not status: # command executed successfully 53 | if 'gzip compressed data' not in cmd_out: 54 | self.fail("Cannot confirm file type") 55 | 56 | self.failIf(ut.is_targz_archive_corrupt(cm.DUMMY_TEST_DIR_TARGZIPPED)) 57 | os.remove(cm.DUMMY_TEST_DIR_TARGZIPPED) 58 | 59 | def test_get_public_suffix(self): 60 | urls = ('http://www.foo.org', 61 | 'https://www.foo.org', 62 | 'http://www.subdomain.foo.org', 63 | 'http://www.subdomain.foo.org:80/subfolder', 64 | 'https://www.subdomain.foo.org:80/subfolder?p1=4545&p2=54545', 65 | 'https://www.subdomain.foo.org:80/subfolder/baefasd==/65') 66 | for pub_suf_test_url in urls: 67 | self.assertEqual(get_tld(pub_suf_test_url), 68 | "foo.org") 69 | 70 | def test_extract_archive(self): 71 | ut.extract_tbb_tarball(cm.TBB_TEST_TARBALL) 72 | self.assertTrue(os.path.isdir(cm.TBB_TEST_TARBALL_EXTRACTED)) 73 | self.assertTrue(os.path.isfile( 74 | os.path.join(cm.TBB_TEST_TARBALL_EXTRACTED, "dummy.txt"))) 75 | rmtree(cm.TBB_TEST_TARBALL_EXTRACTED) 76 | 77 | if __name__ == "__main__": 78 | unittest.main() 79 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/test/visit_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | import shutil 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 6 | import common as cm 7 | import utils as ut 8 | import datacollection.visit as vi 9 | from datacollection.torutils import TorController 10 | join = os.path.join 11 | TEST_URL = "https://torproject.org" 12 | 13 | 14 | class VisitTest(unittest.TestCase): 15 | 16 | @classmethod 17 | def 
setUpClass(cls): 18 | cls.tor_controller = TorController(cm.TORRC_WANG_AND_GOLDBERG, 19 | cm.TBB_DEFAULT_VERSION) 20 | cls.tor_process = cls.tor_controller.launch_tor_service() 21 | 22 | def check_expected_visit_dirs_and_files(self, visit): 23 | self.assertTrue(os.path.isdir(visit.base_dir)) 24 | inst_dir = join(visit.base_dir, str(visit.instance_num)) 25 | self.assertTrue(os.path.isdir(inst_dir)) 26 | 27 | pcap_path = join(inst_dir, "%s.pcap" % visit.get_instance_name()) 28 | # TODO investigate why the we cannot capture on CI 29 | if not cm.running_in_CI: 30 | self.assertTrue(os.path.isfile(pcap_path)) 31 | # expect the capture to be > 10K 32 | self.assertGreater(os.path.getsize(pcap_path), 10000) 33 | 34 | self.screenshot_path = join(inst_dir, "screenshot.png") 35 | if visit.capture_screen: 36 | self.assertTrue(os.path.isfile(self.screenshot_path)) 37 | self.assertGreater(os.path.getsize(self.screenshot_path), 0) 38 | else: 39 | self.assertFalse(os.path.isfile(self.screenshot_path)) 40 | 41 | instance_ff_log_dir = join(inst_dir, "logs") 42 | self.assertTrue(os.path.isdir(instance_ff_log_dir)) 43 | instance_ff_log = join(instance_ff_log_dir, "firefox.log") 44 | self.assertTrue(os.path.isfile(instance_ff_log)) 45 | self.assertGreater(os.path.getsize(instance_ff_log), 0) 46 | 47 | def setup_crawl_dirs(self, test_url=TEST_URL): 48 | crawl_name = ut.append_timestamp("crawl") 49 | self.crawl_dir = ut.create_dir(join(cm.TEST_FILES_DIR, crawl_name)) 50 | batch_dir = ut.create_dir(join(self.crawl_dir, str(self.batch_num))) 51 | self.site_dir = ut.create_dir(join(batch_dir, 52 | ut.get_filename_from_url(test_url, 53 | self.site_num))) 54 | 55 | def test_visit_with_defaults(self): 56 | self.setup_crawl_dirs() 57 | visit = vi.Visit(self.batch_num, self.site_num, 58 | self.instance_num, TEST_URL, 59 | self.site_dir, cm.TBB_DEFAULT_VERSION, 60 | self.tor_controller) 61 | self.run_visit(visit) 62 | 63 | def test_visit_noxvfb(self): 64 | self.setup_crawl_dirs() 65 | visit = 
vi.Visit(self.batch_num, self.site_num, 66 | self.instance_num, TEST_URL, 67 | self.site_dir, cm.TBB_DEFAULT_VERSION, 68 | self.tor_controller, bg_site=None, 69 | experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False, 70 | capture_screen=True) 71 | self.run_visit(visit) 72 | 73 | def test_screen_capture(self): 74 | cap_test_url = "https://check.torproject.org/" 75 | self.setup_crawl_dirs(cap_test_url) 76 | visit = vi.Visit(self.batch_num, self.site_num, 77 | self.instance_num, cap_test_url, 78 | self.site_dir, cm.TBB_DEFAULT_VERSION, 79 | self.tor_controller, bg_site=None, 80 | experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False, 81 | capture_screen=True) 82 | self.run_visit(visit) 83 | # A blank page for https://check.torproject.org/ amounts to ~4.8KB. 84 | # A real screen capture on the other hand, is ~57KB. If the capture 85 | # is not blank it should be at least greater than 30KB. 86 | self.assertGreater(os.path.getsize(self.screenshot_path), 30000) 87 | 88 | def test_visit_multitab_exp(self): 89 | self.setup_crawl_dirs() 90 | visit = vi.Visit(self.batch_num, self.site_num, 91 | self.instance_num, TEST_URL, 92 | self.site_dir, cm.TBB_DEFAULT_VERSION, 93 | self.tor_controller, bg_site="https://google.com", 94 | experiment=cm.EXP_TYPE_MULTITAB_ALEXA, xvfb=False, 95 | capture_screen=True) 96 | self.run_visit(visit) 97 | 98 | def run_visit(self, visit): 99 | visit.get() 100 | self.check_expected_visit_dirs_and_files(visit) 101 | 102 | def setUp(self): 103 | self.site_num = 0 104 | self.batch_num = 0 105 | self.instance_num = 0 106 | 107 | def tearDown(self): 108 | shutil.rmtree(self.crawl_dir) 109 | 110 | @classmethod 111 | def tearDownClass(cls): 112 | # cls.tor_process.kill() 113 | cls.tor_controller.kill_tor_proc() 114 | 115 | if __name__ == "__main__": 116 | unittest.main() 117 | -------------------------------------------------------------------------------- /tor-browser-crawler-webfp-paper/utils.py: 
"""General-purpose helpers: file and directory I/O, SIGALRM based timeouts,
tar packing/extraction, process cleanup and simple downloads."""
import commands
import distutils.dir_util as du
import os
import re
import signal
import sys
from contextlib import closing
from hashlib import md5, sha256
from time import strftime
from urllib2 import urlopen

try:
    from shlex import quote as shell_quote  # Python 3.3+
except ImportError:
    from pipes import quote as shell_quote  # Python 2

import psutil

from log import wl_log


class TimeExceededError(Exception):
    """Raised by the SIGALRM handler installed by timeout()."""
    pass


def get_hash_of_directory(path):
    """Return the raw MD5 digest of the contents of all files under path.

    Only file contents are hashed; file and directory names are not.
    Directories and files are visited in sorted order so that the digest
    does not depend on the order in which the filesystem lists entries.
    """
    m = md5()
    for root, dirs, files in os.walk(path):
        dirs.sort()  # in-place: makes os.walk descend in a stable order
        for f in sorted(files):
            full_path = os.path.join(root, f)
            # Binary mode and chunked reads: profile files may be large
            # binaries, and the handle must be closed after hashing.
            with open(full_path, 'rb') as fobj:
                for chunk in iter(lambda: fobj.read(65536), b''):
                    m.update(chunk)
    return m.digest()


def create_dir(dir_path):
    """Create a directory if it doesn't exist and return its path."""
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    return dir_path


def append_timestamp(_str=''):
    """Append a timestamp (yymmdd_HHMMSS) to a string and return it."""
    return _str + strftime('%y%m%d_%H%M%S')


def clone_dir_with_timestap(orig_dir_path):
    """Copy a folder into the same directory and append a timestamp.

    Copy errors are logged, not raised; the path of the (possibly
    incomplete) clone is returned in either case.
    """
    new_dir = create_dir(append_timestamp(orig_dir_path))
    try:
        du.copy_tree(orig_dir_path, new_dir)
    except Exception as e:
        wl_log.error("Error while cloning the dir with timestamp" + str(e))
    # Returning after the try block (rather than from a finally clause)
    # lets KeyboardInterrupt and SystemExit propagate to the caller.
    return new_dir


def raise_signal(signum, frame):
    """Signal handler that raises TimeExceededError."""
    raise TimeExceededError


def timeout(duration):
    """Raise TimeExceededError after duration seconds unless cancelled."""
    # SIGALRM is only available on Unix.
    signal.signal(signal.SIGALRM, raise_signal)
    signal.alarm(duration)  # alarm after X seconds


def cancel_timeout():
    """Cancel a running alarm."""
    signal.alarm(0)


def get_filename_from_url(url, prefix):
    """Return base filename for the url.

    The scheme and every 'www.' are removed, each run of characters
    outside [A-Za-z0-9._] becomes a single dash, and the result is
    prefixed with '<prefix>-'.
    """
    url = url.replace('https://', '')
    url = url.replace('http://', '')
    url = url.replace('www.', '')
    dashed = re.sub(r'[^A-Za-z0-9._]', '-', url)
    return '%s-%s' % (prefix, re.sub(r'-+', '-', dashed))


def is_targz_archive_corrupt(arc_path):
    """Return a truthy exit status if the tar.gz cannot be listed.

    Returns False when the archive passes the check.
    """
    # http://stackoverflow.com/a/2001749/3104416
    # The path is quoted so that spaces or shell metacharacters in it
    # cannot break (or inject into) the pipeline.
    tar_gz_check_cmd = ("gunzip -c %s | tar t > /dev/null"
                        % shell_quote(arc_path))
    tar_status, tar_txt = commands.getstatusoutput(tar_gz_check_cmd)
    if tar_status:
        wl_log.critical("Tar check failed: %s tar_status: %s tar_txt: %s"
                        % (tar_gz_check_cmd, tar_status, tar_txt))
        return tar_status
    return False  # no error


def pack_crawl_data(crawl_dir):
    """Compress the crawl dir into a tar.gz archive next to it.

    Side effect: the working directory is changed to the directory that
    contains crawl_dir. Returns True on success, False otherwise.
    """
    if not os.path.isdir(crawl_dir):
        wl_log.critical("Cannot find the crawl dir: %s" % crawl_dir)
        return False
    # Drop all trailing separators; otherwise basename() would be empty.
    crawl_dir = crawl_dir.rstrip(os.path.sep)
    crawl_name = os.path.basename(crawl_dir)
    # dirname() is '' for a bare relative name and os.chdir('') raises.
    containing_dir = os.path.dirname(crawl_dir) or os.curdir
    os.chdir(containing_dir)
    arc_path = "%s.tar.gz" % crawl_name
    tar_cmd = "tar czvf %s %s" % (shell_quote(arc_path),
                                  shell_quote(crawl_name))
    wl_log.debug("Packing the crawl dir with cmd: %s" % tar_cmd)
    status, txt = commands.getstatusoutput(tar_cmd)
    if status or is_targz_archive_corrupt(arc_path):
        wl_log.critical("Tar command failed or archive is corrupt: "
                        "%s \nSt: %s txt: %s" % (tar_cmd, status, txt))
        return False
    else:
        return True


def gen_all_children_procs(parent_pid):
    """Yield every descendant process of the given parent pid."""
    parent = psutil.Process(parent_pid)
    for child in parent.children(recursive=True):
        yield child


def kill_all_children(parent_pid):
    """Kill all child process of a given parent."""
    for child in gen_all_children_procs(parent_pid):
        try:
            child.kill()
        except psutil.NoSuchProcess:
            # The child exited between enumeration and kill; the goal
            # (child not running) is already met.
            pass


def die(last_words="Unknown problem, quitting!"):
    """Log the message as an error and exit with status 1."""
    wl_log.error(last_words)
    sys.exit(1)


def read_file(path, binary=False):
    """Read and return the file content."""
    options = 'rb' if binary else 'rU'
    with open(path, options) as f:
        return f.read()


def sha_256_sum_file(path, binary=True):
    """Return the SHA-256 sum of the file."""
    return sha256(read_file(path, binary=binary)).hexdigest()


def gen_read_lines(path):
    """Generator for reading the lines in a file."""
    with open(path, 'rU') as f:
        for line in f:
            yield line


def read_url(uri):
    """Fetch and return a URI content."""
    # closing() releases the connection even if read() raises.
    with closing(urlopen(uri)) as w:
        return w.read()


def write_to_file(file_path, data):
    """Write data to file and close."""
    # Byte strings (e.g. downloaded tarballs) are written in binary mode
    # so that no newline translation can alter them.
    mode = 'wb' if isinstance(data, bytes) else 'w'
    with open(file_path, mode) as ofile:
        ofile.write(data)


def download_file(uri, file_path):
    """Fetch uri and store its content at file_path."""
    write_to_file(file_path, read_url(uri))


def extract_tbb_tarball(archive_path):
    """Extract a TBB tarball next to itself and rename the result.

    The archive is expected to unpack into a 'tor-browser_en-US'
    directory, which is then moved to the archive path truncated at the
    first '.tar'. Returns True on success, False otherwise.
    """
    arch_dir = os.path.dirname(archive_path)
    extracted_dir = os.path.join(arch_dir, "tor-browser_en-US")
    tar_cmd = "tar xvf %s -C %s" % (shell_quote(archive_path),
                                    shell_quote(arch_dir))
    status, txt = commands.getstatusoutput(tar_cmd)
    if status or not os.path.isdir(extracted_dir):
        wl_log.error("Error extracting TBB tarball %s: (%s: %s)"
                     % (tar_cmd, status, txt))
        return False
    dest_dir = archive_path.split(".tar")[0]
    mv_cmd = "mv %s %s" % (shell_quote(extracted_dir),
                           shell_quote(dest_dir))
    status, txt = commands.getstatusoutput(mv_cmd)
    if status or not os.path.isdir(dest_dir):
        wl_log.error("Error moving extracted TBB with the command %s: (%s: %s)"
                     % (mv_cmd, status, txt))
        return False
    return True
--------------------------------------------------------------------------------