├── .gitignore ├── .travis.yml ├── CHANGELOG ├── EmulatingAndroidFonts.md ├── LICENSE ├── README.md ├── VERSION ├── __init__.py ├── automation ├── BrowserManager.py ├── CommandSequence.py ├── Commands │ ├── __init__.py │ ├── browser_commands.py │ ├── command_executor.py │ ├── profile_commands.py │ └── utils │ │ ├── XPathUtil.py │ │ ├── __init__.py │ │ ├── file_utils.py │ │ ├── firefox_profile.py │ │ ├── gen_utils.py │ │ ├── lso.py │ │ └── webdriver_extensions.py ├── DataAggregator │ ├── DataAggregator.py │ ├── LevelDBAggregator.py │ └── __init__.py ├── DeployBrowsers │ ├── __init__.py │ ├── configure_firefox.py │ ├── deploy_browser.py │ ├── deploy_firefox.py │ ├── firefox_extensions │ │ ├── adblock_plus-2.7.xpi │ │ ├── ghostery │ │ │ ├── ghostery-5.4.10.xpi │ │ │ └── store.json │ │ └── https_everywhere-5.1.0.xpi │ ├── screen_resolutions.txt │ └── user_agent_strings.txt ├── Errors.py ├── Extension │ └── firefox │ │ ├── data │ │ ├── content.js │ │ ├── create_content_policy_table.sql │ │ ├── create_cookies_table.sql │ │ ├── create_http_requests_table.sql │ │ ├── create_http_responses_table.sql │ │ ├── create_javascript_table.sql │ │ ├── create_pages_table.sql │ │ ├── remove_webdriver_attributes.js │ │ └── trigger_sensor_events.js │ │ ├── doc │ │ └── main.md │ │ ├── index.js │ │ ├── lib │ │ ├── content-policy-instrument.js │ │ ├── cookie-instrument.js │ │ ├── http-instrument.js │ │ ├── http-post-parser.js │ │ ├── javascript-instrument.js │ │ ├── loggingdb.js │ │ ├── page-manager.js │ │ └── socket.js │ │ ├── node_modules │ │ └── bufferpack │ │ │ ├── .npmignore │ │ │ ├── CHANGELOG │ │ │ ├── LICENSE │ │ │ ├── bufferpack.js │ │ │ └── package.json │ │ ├── package.json │ │ └── test │ │ └── test-main.js ├── MPLogger.py ├── Proxy │ ├── MITMProxy.py │ ├── __init__.py │ ├── cert │ │ ├── mitmproxy-ca-cert.cer │ │ ├── mitmproxy-ca-cert.p12 │ │ ├── mitmproxy-ca-cert.pem │ │ ├── mitmproxy-ca.pem │ │ └── mitmproxy-dhparam.pem │ ├── cert8.db │ ├── deploy_mitm_proxy.py │ ├── key3.db │ └── mitm_commands.py ├── SocketInterface.py ├── TaskManager.py ├── __init__.py ├── default_browser_params.json ├── default_manager_params.json ├── schema.sql └── utilities │ ├── Cookie.py │ ├── __init__.py │ ├── build_cookie_table.py │ ├── db_utils.py │ ├── domain_utils.py │ └── platform_utils.py ├── clustering └── Clustering_JS_scripts.ipynb ├── demo.py ├── feature_extraction ├── SensorAccesByRankPlot.ipynb ├── __init__.py ├── extract_features.py └── utils.py ├── install-analysis.sh ├── install.sh ├── mobile_sensor_crawl.py ├── requirements.txt └── test ├── __init__.py ├── conftest.py ├── expected.py ├── manual_test.py ├── openwpmtest.py ├── test_adblock_plus.py ├── test_crawl.py ├── test_custom_function_command.py ├── test_disable_webdriver_self_id.py ├── test_env.py ├── test_extension.py ├── test_http_instrumentation.py ├── test_js_instrument.py ├── test_pages ├── abp │ ├── adblock_plus_test.html │ ├── adspot │ │ └── 1.js │ ├── adsystem │ │ └── 3.js │ └── bannerads │ │ └── 2.js ├── audio_fingerprinting.html ├── battery_fingerprinting.html ├── canvas_fingerprinting.html ├── expected_source.html ├── http_stacktrace.html ├── http_test_page.html ├── http_test_page_2.html ├── instrument_object.html ├── js_call_stack.html ├── js_cookie.html ├── lso │ ├── FlashCookie.swf │ ├── flash-cookie.js │ └── setlso.html ├── post_file_upload.html ├── post_request.html ├── post_request_ajax.html ├── property_enumeration.html ├── sensor_value_test.html ├── sensors.html ├── shared │ ├── test_favicon.ico │ ├── test_image.png │ ├── test_image_2.png 
│ ├── test_script.js │ ├── test_script_2.js │ ├── test_style.css │ └── utils.js ├── simple_a.html ├── simple_b.html ├── simple_c.html ├── simple_d.html ├── stack.js └── webrtc_localip.html ├── test_profile.py ├── test_sensors.py ├── test_simple_commands.py ├── test_storage_vectors.py ├── test_trigger_sensor_events.py └── utilities.py /.gitignore: -------------------------------------------------------------------------------- 1 | # firefox directories 2 | firefox-bin/ 3 | 4 | # VIM tmp files 5 | *~ 6 | .*.sw* 7 | 8 | # A bug in selenium creates this on unix systems 9 | C:\\nppdf32Log\\debuglog.txt 10 | 11 | # PyCharm 12 | .idea/* 13 | *.idea 14 | */idea 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | 20 | # C extensions 21 | *.so 22 | 23 | # Distribution / packaging 24 | .Python 25 | env/ 26 | bin/ 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | eggs/ 31 | #lib/ 32 | #lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | 52 | # Translations 53 | *.mo 54 | 55 | # Mr Developer 56 | .mr.developer.cfg 57 | .project 58 | .pydevproject 59 | 60 | # Rope 61 | .ropeproject 62 | 63 | # Django stuff: 64 | *.log 65 | *.pot 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: python 3 | os: linux 4 | dist: trusty 5 | group: deprecated-2017Q4 6 | env: 7 | # See, https://docs.travis-ci.com/user/speeding-up-the-build/ 8 | # We need a balanced distribution of the tests 9 | # Once we add and remove tests, this distribution may become unbalanced. 10 | # Feel free to move tests around to make the running time of the jobs 11 | # as close as possible. 12 | - TESTS=test_[a-b,d-e]* 13 | # test_crawl.py is the longest running test. 14 | - TESTS=test_c* 15 | - TESTS=test_[f-h]* 16 | - TESTS=test_[i-z]* 17 | git: 18 | depth: 3 19 | before_install: 20 | - "export DISPLAY=:99.0" 21 | # https://github.com/npm/npm/issues/20203 22 | # !!! comment the following when the cert issue is fixed 23 | - "npm config set strict-ssl false" 24 | install: 25 | - echo "y" | ./install.sh 26 | - pip install -r requirements.txt 27 | before_script: 28 | - cd test 29 | script: 30 | - py.test -s -v --durations=10 $TESTS 31 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | v0.7.0 - 2016-11-15 2 | ====== 3 | 4 | Changes: 5 | * Bugfixes to extension instrumentation where records would be dropped when 6 | the extension was under heavy load and fail to re-enable until the browser 7 | was restarted. 
8 | * Bugfix to extension / socket interface
9 | * Add `run_custom_function` command
10 | * Using alternative serialization/parallelization with `dill` and
11 |   `multiprocess`
12 | * Better documentation
13 | * Bugfixes to install script
14 | * Add `save_screenshot` and `dump_page_source` commands
15 | * Add Audio API instrumentation
16 | * Bugfix to `browse` command
17 | * Bugfix to extension instrumentation injection to avoid Security Errors
18 |
19 | v0.6.2 - 2016-04-08
20 | ======
21 |
22 | Changes:
23 | * Bugfix to browse command. Now supports sleeping after get.
24 |
25 | v0.6.1 - 2016-04-08
26 | ======
27 |
28 | Critical:
29 | * Bugfix in LevelDBAggregator preventing data loss
30 |
31 | Changes:
32 | * Bump to Firefox 45 & Selenium 2.53.0
33 | * Updated stored certificate
34 | * Added sleep argument to `get` command
35 | * Added install script for development dependencies
36 | * Improved error handling in TaskManager and Proxy
37 | * Version bumps and bugfixes in HTTPS Everywhere, Ghostery, and ABP
38 | * Tests added!
39 | * Numerous bugfixes and improvements in Javascript Instrumentation
40 |
41 | v0.6.0 - 2015-12-22
42 | ======
43 |
44 | Changes:
45 | * Cleanup of Firefox prefs to make browsers faster and reduce phoning home
46 | * Use LevelDB for javascript file storage
47 | * Improved HTTP Cookie Parsing
48 | * Several bugfixes to extension instrumentation
49 | * Improved profile handling during shutdown and crashes
50 | * Improved handling of child Exceptions
51 | * Initial platform tests
52 | * Improvements to javascript instrumentation
53 |
54 | v0.5.1 - 2015-10-15
55 | ======
56 |
57 | Changes:
58 | * Save json serialized headers and fix cookie parsing
59 |
60 | v0.5.0 - 2015-10-14
61 | ======
62 |
63 | Changes:
64 | * Added support for saving all javascript files de-duplicated and compressed
65 | * Created two configuration dictionaries. One for individual browsers and
66 |   another for the entire infrastructure
67 | * Support for using OpenWPM as a submodule
68 | * Firefox (v39) and Selenium (v2.47.1)
69 | * Added support for launching Ghostery, HTTPS Everywhere, and AdBlock Plus
70 | * Removed Random Extension Support
71 | * Bugfix for broken profile saving
72 | * Bugfix for profile clearing when memory limits are exceeded
73 | * Numerous stability fixes
74 | * Full Logging support in all commands
75 |
76 | v0.4.0
77 | ======
78 |
79 | Changes:
80 | * Significant stability improvements for long crawls
81 | * Support for logging with logging module
82 | * A large number of bugfixes related to process handling
83 | * Prevention of a large number of stray tmp files/folders during long crawls
84 | * Process/memory watchdog to handle orphaned processes and keep memory usage
85 |   reasonable
86 | * Numerous bugfixes for extension
87 | * Failure thresholds to prevent infinite loops of browser respawns or
88 |   command execution attempts (instead, Errors are raised)
89 | * Script to install dependencies
90 | * API changes to command timeouts
91 | * Move SocketInterface from pickle to json serialization
92 |
93 | Known Issues:
94 | * Encoding issues cause a very small percentage of data to be dropped by the
95 |   extension
96 | * Malformed queries are occasionally sent to the DataAggregator and are
97 |   dropped. The cause is unknown.
98 | * Forking can be done in a more memory-efficient way
99 |
100 | 0.3.1 - Fixes #5
101 | 0.3.0 - Experimental merge of Fourthparty + framework to allow additional
102 |         javascript instrumentation.
103 | 0.2.3 - Timeout logging
104 | 0.2.2 - Browse command + better scrolling + bugfixes
105 | 0.2.1 - Support for MITMProxy v0.11 + minor bugfixes
106 | 0.2.0 - Complete re-write of HTTP Cookie parsing
107 | 0.1.1 - Simplified load of default settings, including wiki demo
108 | 0.1.0 - Initial Public Release
109 |
--------------------------------------------------------------------------------
/EmulatingAndroidFonts.md:
--------------------------------------------------------------------------------
1 | To mitigate detection of OpenWPM-Mobile by font-based fingerprinting,
2 | you may uninstall all fonts present on your crawler machine and install fonts
3 | extracted from a real Android device.
4 |
5 | ## 1 Extracting Android fonts:
6 |
7 | Connect the Android device you want to emulate in USB debugging mode.
8 | Copy the Android fonts from the phone using `adb`:
9 |
10 | ```
11 | mkdir android_fonts      # create a directory for the font files
12 | cd android_fonts
13 | adb pull /system/fonts   # copy the font files from the device
14 | ```
15 |
16 | ## 2 Adding Android fonts to the crawler machine:
17 |
18 | ```
19 | mv ~/.fonts ~/.fonts_BKP     # back up existing user-specific fonts - may or may not exist
20 | mkdir -p ~/.fonts            # create the user-specific font directory
21 | cp android_fonts/* ~/.fonts  # copy font files extracted from the Android device
22 | fc-cache -f -v               # update the font cache
23 |
24 | ```
25 |
26 |
27 | ## 3 Comment out the aliases for `MS Gothic` and `MS PGothic` fonts in `/etc/fonts/conf.avail/30-cjk-aliases.conf`
28 |
29 | ```
30 | <!-- (elided) wrap the <alias> block for "MS Gothic" in an XML
31 |      comment, like this one -->
44 | ...
45 | <!-- (elided) likewise wrap the <alias> block for "MS PGothic" -->
58 | ```
59 |
60 | ## 4 Remove existing system-wide fonts:
61 | We need to empty `/usr/share/fonts` and `/usr/local/share/fonts`
62 |
63 | ```
64 | mkdir ~/usr_share_bkp
65 | mkdir ~/usr_local_share_bkp
66 | mv /usr/share/fonts/* ~/usr_share_bkp
67 | mv /usr/local/share/fonts/* ~/usr_local_share_bkp
68 | fc-cache -f -v
69 | ```
70 |
71 | If you are using a non-Debian based distro, check `/etc/fonts/fonts.conf`
72 | for `<dir>` entries and move the font files in those dirs to a backup dir.
73 |
74 | ### Restoring old fonts after the crawl:
75 | ```
76 | mv ~/usr_share_bkp/* /usr/share/fonts/
77 | mv ~/usr_local_share_bkp/* /usr/local/share/fonts/
78 | mv ~/.fonts_BKP ~/.fonts
79 | ```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | OpenWPM-Mobile [![Build Status](https://travis-ci.org/sensor-js/OpenWPM-mobile.svg?branch=master)](https://travis-ci.org/sensor-js/OpenWPM-mobile)
2 | =======
3 |
4 | OpenWPM-Mobile is a mobile web privacy measurement framework based on
5 | [OpenWPM](https://github.com/citp/OpenWPM). OpenWPM-Mobile was developed for the paper titled "[`The Web's Sixth Sense: A Study of Scripts Accessing Smartphone Sensors`](https://sensor-js.xyz)" to measure the ecosystem of scripts accessing mobile sensors.
6 |
7 | ## Installation
8 |
9 | Run the following to install OpenWPM-Mobile.
10 |
11 | ```./install.sh```
12 |
13 | To install the analysis-related packages and files:
14 |
15 | ```./install-analysis.sh```
16 |
17 |
18 | ## Basic usage
19 |
20 | Edit [`mobile_sensor_crawl.py`](https://github.com/sensor-js/OpenWPM_mobile/blob/mobile_sensors/mobile_sensor_crawl.py) to change the crawl parameters, such as the number of sites to crawl and the number of browsers to run in parallel.
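For orientation, the crawl script follows the standard OpenWPM `TaskManager`/`CommandSequence` pattern (both modules appear under `automation/` below). The following is a minimal sketch only: the site list and parameter values are placeholders, and it assumes `TaskManager.load_default_params` reads the bundled `default_manager_params.json`/`default_browser_params.json`; the actual names in `mobile_sensor_crawl.py` may differ.

```python
# Hypothetical crawl configuration -- a sketch, not the shipped script.
from automation import CommandSequence, TaskManager

NUM_BROWSERS = 2                                      # browsers run in parallel
sites = ['http://example.com', 'http://example.org']  # sites to crawl

# load the default configuration dictionaries and start the manager
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
manager = TaskManager.TaskManager(manager_params, browser_params)

for site in sites:
    command_sequence = CommandSequence.CommandSequence(site)
    command_sequence.get(sleep=10, timeout=60)  # visit the page, then wait 10s
    manager.execute_command_sequence(command_sequence)

manager.close()  # shut down browsers and data aggregators gracefully
```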
21 |
22 | Then start a crawl by running:
23 |
24 | ```python mobile_sensor_crawl.py```
25 |
26 |
27 | ## Imitating Mobile Browser
28 | OpenWPM-Mobile takes several steps to realistically imitate Firefox for Android.
29 |
30 | This involves overriding the navigator object's user agent, platform,
31 | appVersion and appCodeName strings; matching the screen resolution,
32 | screen dimensions, pixel depth and color depth; enabling touch
33 | status; and removing plugins and supported MIME types that may indicate a desktop browser.
34 |
35 | OpenWPM-Mobile also applies the preferences used to configure Firefox
36 | for Android, such as hiding the scroll bars and disabling popup windows.
37 | We relied on the values provided in the [`mobile.js`](https://dxr.mozilla.org/mozilla-esr45/source/mobile/android/app/mobile.js) script found in the Firefox for Android source code repository.
38 |
39 | When running crawls with OpenWPM-Mobile we installed
40 | Android fonts on our crawler machines to mitigate font-based
41 | fingerprinting. You may follow the instructions provided in
42 | [EmulatingAndroidFonts.md](https://github.com/sensor-js/OpenWPM_mobile/blob/mobile_sensors/EmulatingAndroidFonts.md)
43 | to install Android fonts on your crawler machines.
44 |
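To make the preference-based part of this concrete, here is a minimal sketch of spoofing a mobile profile through Selenium-set Firefox preferences. This is not the actual `deploy_firefox.py` logic: the user-agent string and override values are illustrative examples, though the preference names themselves are standard Firefox preferences.

```python
# A minimal sketch (illustrative values, not OpenWPM-Mobile's actual code)
from selenium import webdriver

# Example Firefox for Android (Fennec) user-agent string
FENNEC_UA = ("Mozilla/5.0 (Android 4.4.4; Mobile; rv:45.0) "
             "Gecko/45.0 Firefox/45.0")

profile = webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", FENNEC_UA)
profile.set_preference("general.platform.override", "Linux armv7l")
profile.set_preference("general.appversion.override", "5.0 (Android)")
profile.set_preference("dom.w3c_touch_events.enabled", 1)     # report touch support
profile.set_preference("dom.disable_open_during_load", True)  # disable popup windows
driver = webdriver.Firefox(firefox_profile=profile)
```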
45 | ## Running tests
46 |
47 | The following will run all the tests:
48 |
49 | ```pytest test```
50 |
51 | If you don't want to run the (slow) crawling test `test_crawl.py`, execute the following:
52 |
53 | ```pytest test -m "not slow"```
54 |
55 | ## Data Analysis
56 |
57 | Consult the [OpenWPM repository](https://github.com/citp/OpenWPM#instrumentation-and-data-access) for details of the data format.
58 |
59 | ### Feature extraction and clustering
60 |
61 | Follow the steps below to extract binary script features and cluster similar scripts using the methodology described in the [paper](https://sensor-js.xyz/ccs-18-a-study-of-scripts-accessing-smartphone-sensors.pdf).
62 |
63 | 1. Run the following command to extract features for scripts discovered in the crawl:
64 |
65 |    ```python extract_features.py```
66 |
67 |    Make sure to point to the correct database containing the crawl results inside [`extract_features.py`](https://github.com/sensor-js/OpenWPM_mobile/blob/mobile_sensors/feature_extraction/extract_features.py#L813).
68 |
69 | 2. Once features are extracted, you can generate clusters from the extracted features by using the [`Clustering_JS_scripts.ipynb`](https://github.com/sensor-js/OpenWPM_mobile/blob/mobile_sensors/clustering/Clustering_JS_scripts.ipynb) Jupyter notebook.
70 |
71 |    Make sure to point to the newly generated feature file (```features.csv```) from step 1.
72 |
73 | ## Citation
74 | If you use OpenWPM-Mobile in your research, please cite our CCS 2018 paper titled [`The Web's Sixth Sense: A Study of Scripts Accessing Smartphone Sensors`](https://sensor-js.xyz/ccs-18-a-study-of-scripts-accessing-smartphone-sensors.pdf). You can use the following BibTeX.
75 |
76 | ```
77 | @inproceedings{sensor-js-2018,
78 |   author = "Anupam Das and Gunes Acar and Nikita Borisov and Amogh Pradeep",
79 |   title = "{The Web's Sixth Sense: A Study of Scripts Accessing Smartphone Sensors}",
80 |   booktitle = {Proceedings of ACM CCS 2018},
81 |   year = "2018",
82 | }
83 | ```
84 |
85 | ## License
86 |
87 | OpenWPM-Mobile is licensed under GNU GPLv3. Additional code has been included from
88 | [OpenWPM](https://github.com/citp/OpenWPM) (which OpenWPM-Mobile is based on),
89 | [FourthParty](https://github.com/fourthparty/fourthparty) and
90 | [Privacy Badger](https://github.com/EFForg/privacybadgerfirefox), all of which
91 | are licensed GPLv3+.
92 |
--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.7.0
2 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/__init__.py
--------------------------------------------------------------------------------
/automation/CommandSequence.py:
--------------------------------------------------------------------------------
1 | from Errors import CommandExecutionError
2 |
3 | class CommandSequence:
4 |     """A CommandSequence wraps a series of commands to be performed
5 |     on a visit to one top-level site into one logical
6 |     "site visit," keyed by a visit id. An example of a CommandSequence
7 |     that visits a page and dumps cookies modified on that visit would be:
8 |
9 |         sequence = CommandSequence(url)
10 |         sequence.get()
11 |         sequence.dump_profile_cookies()
12 |         task_manager.execute_command_sequence(sequence)
13 |
14 |     CommandSequence guarantees that a series of commands will be performed
15 |     by a single browser instance.
16 |
17 |     NOTE: Commands dump_profile_cookies and dump_flash_cookies will close
18 |     the current tab - any command that relies on the page still being open,
19 |     like save_screenshot, extract_links, or dump_page_source, should be
20 |     called prior to one of those two commands.
21 |     """
22 |
23 |     def __init__(self, url, reset=False, blocking=False):
24 |         """Initialize command sequence.
25 |
26 |         Parameters
27 |         ----------
28 |         url : str
29 |             url of page visit the command sequence should execute on
30 |         reset : bool
31 |             True if browser should clear state and restart after sequence
32 |         blocking : bool
33 |             True if sequence should block parent process during execution
34 |         """
35 |         self.url = url
36 |         self.reset = reset
37 |         self.blocking = blocking
38 |         self.commands_with_timeout = []
39 |         self.total_timeout = 0
40 |         self.contains_get_or_browse = False
41 |
42 |     def get(self, sleep=0, timeout=60):
43 |         """ goes to a url """
44 |         self.total_timeout += timeout
45 |         command = ('GET', self.url, sleep)
46 |         self.commands_with_timeout.append((command, timeout))
47 |         self.contains_get_or_browse = True
48 |
49 |     def browse(self, num_links=2, sleep=0, timeout=60):
50 |         """ browse a website and visit <num_links> links on the page """
51 |         self.total_timeout += timeout
52 |         command = ('BROWSE', self.url, num_links, sleep)
53 |         self.commands_with_timeout.append((command, timeout))
54 |         self.contains_get_or_browse = True
55 |
56 |     def dump_flash_cookies(self, timeout=60):
57 |         """ dumps the local storage vectors (flash, localStorage, cookies) to db
58 |         Side effect: closes the current tab."""
59 |         self.total_timeout += timeout
60 |         if not self.contains_get_or_browse:
61 |             raise CommandExecutionError("No get or browse request preceding "
62 |                                         "the dump storage vectors command", self)
63 |         command = ('DUMP_FLASH_COOKIES',)
64 |         self.commands_with_timeout.append((command, timeout))
65 |
66 |     def dump_profile_cookies(self, timeout=60):
67 |         """ dumps cookies from the profile's cookie database to the crawl DB
68 |         Side effect: closes the current tab."""
69 |         self.total_timeout += timeout
70 |         if not self.contains_get_or_browse:
71 |             raise CommandExecutionError("No get or browse request preceding "
72 |                                         "the dump storage vectors command", self)
73 |         command = ('DUMP_PROFILE_COOKIES',)
74 |         self.commands_with_timeout.append((command, timeout))
75 |
76 |     def dump_profile(self, dump_folder, close_webdriver=False, compress=True, timeout=120):
77 |         """ dumps from the profile path to a given file (absolute path) """
78 |         self.total_timeout += timeout
79 |         command = ('DUMP_PROF', dump_folder, close_webdriver, compress)
80 |         self.commands_with_timeout.append((command, timeout))
81 |
82 |     def extract_links(self, timeout=30):
83 |         """Extracts links found on web page and dumps them externally"""
84 |         self.total_timeout += timeout
85 |         if not self.contains_get_or_browse:
86 |             raise CommandExecutionError("No get or browse request preceding "
87 |                                         "the extract links command", self)
88 |         command = ('EXTRACT_LINKS',)
89 |         self.commands_with_timeout.append((command, timeout))
90 |
91 |     def save_screenshot(self, screenshot_name, timeout=30):
92 |         """Saves screenshot of page to 'screenshots' directory in data directory."""
93 |         self.total_timeout += timeout
94 |         if not self.contains_get_or_browse:
95 |             raise CommandExecutionError("No get or browse request preceding "
96 |                                         "the save screenshot command", self)
97 |         command = ('SAVE_SCREENSHOT', screenshot_name,)
98 |         self.commands_with_timeout.append((command, timeout))
99 |
100 |     def dump_page_source(self, dump_name, timeout=30):
101 |         """Dumps rendered source of current page visit to 'sources' directory."""
102 |         self.total_timeout += timeout
103 |         if not self.contains_get_or_browse:
104 |             raise CommandExecutionError("No get or browse request preceding "
105 |                                         "the dump page source command", self)
106 |         command = ('DUMP_PAGE_SOURCE', dump_name,)
107 |         self.commands_with_timeout.append((command, timeout))
108 |
109 |     def run_custom_function(self, function_handle, func_args=(), timeout=30):
110 |         """Run a custom function by passing the function handle"""
111 |         self.total_timeout += timeout
112 |         if not self.contains_get_or_browse:
113 |             raise CommandExecutionError("No get or browse request preceding "
114 |                                         "the run custom function command", self)
115 |         command = ('RUN_CUSTOM_FUNCTION', function_handle, func_args)
116 |         self.commands_with_timeout.append((command, timeout))
117 |
--------------------------------------------------------------------------------
/automation/Commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Commands/__init__.py
--------------------------------------------------------------------------------
/automation/Commands/command_executor.py:
--------------------------------------------------------------------------------
1 | import browser_commands
2 | import profile_commands
3 |
4 |
5 | def execute_command(command, webdriver, proxy_queue, browser_settings, browser_params, manager_params, extension_socket):
6 |     """
7 |     executes BrowserManager commands by passing command tuples into the necessary helper functions
8 |     commands are of form (COMMAND, ARG0, ARG1, ...)
9 |     the only imports in this file should be imports to helper libraries
10 |     """
11 |     if command[0] == 'GET':
12 |         browser_commands.get_website(url=command[1], sleep=command[2], visit_id=command[3],
13 |                                      webdriver=webdriver, proxy_queue=proxy_queue,
14 |                                      browser_params=browser_params, extension_socket=extension_socket)
15 |
16 |     if command[0] == 'BROWSE':
17 |         browser_commands.browse_website(url=command[1], num_links=command[2], sleep=command[3],
18 |                                         visit_id=command[4], webdriver=webdriver,
19 |                                         proxy_queue=proxy_queue, browser_params=browser_params,
20 |                                         manager_params=manager_params, extension_socket=extension_socket)
21 |
22 |     if command[0] == 'DUMP_FLASH_COOKIES':
23 |         browser_commands.dump_flash_cookies(start_time=command[1], visit_id=command[2],
24 |                                             webdriver=webdriver, browser_params=browser_params,
25 |                                             manager_params=manager_params)
26 |
27 |     if command[0] == 'DUMP_PROFILE_COOKIES':
28 |         browser_commands.dump_profile_cookies(start_time=command[1], visit_id=command[2],
29 |                                               webdriver=webdriver, browser_params=browser_params,
30 |                                               manager_params=manager_params)
31 |
32 |     if command[0] == 'DUMP_PROF':
33 |         profile_commands.dump_profile(browser_profile_folder=browser_params['profile_path'],
34 |                                       manager_params=manager_params,
35 |                                       browser_params=browser_params,
36 |                                       tar_location=command[1], close_webdriver=command[2],
37 |                                       webdriver=webdriver, browser_settings=browser_settings,
38 |                                       compress=command[3],
39 |                                       save_flash=browser_params['disable_flash'] is False)
40 |
41 |     if command[0] == 'EXTRACT_LINKS':
42 |         browser_commands.extract_links(webdriver, browser_params, manager_params)
43 |
44 |     if command[0] == 'SAVE_SCREENSHOT':
45 |         browser_commands.save_screenshot(screenshot_name=command[1], webdriver=webdriver,
46 |                                          browser_params=browser_params, manager_params=manager_params)
47 |
48 |     if command[0] == 'DUMP_PAGE_SOURCE':
49 |         browser_commands.dump_page_source(dump_name=command[1], webdriver=webdriver,
50 |                                           browser_params=browser_params, manager_params=manager_params)
51 |
52 |     if command[0] == 'RUN_CUSTOM_FUNCTION':
53 |         arg_dict = {"command": command,
54 |                     "driver": webdriver,
55 |                     "proxy_queue": proxy_queue,
56 |                     "browser_settings": browser_settings,
57 |                     "browser_params": browser_params,
58 |                     "manager_params": manager_params,
59 |                     "extension_socket": extension_socket}
60 |         command[1](*command[2], **arg_dict)
61 |
--------------------------------------------------------------------------------
/automation/Commands/utils/XPathUtil.py:
--------------------------------------------------------------------------------
1 | # XPathUtil.py
2 | # A collection of utilities to extract and parse
3 | # XPaths encountered while scraping.
4 | #
5 | # Steven Englehardt (github.com/englehardt)
6 | from bs4 import BeautifulSoup as bs
7 | import bs4
8 | import re
9 |
10 | def is_clickable(xpath):
11 |     #We consider any xpath that has an 'a', 'button',
12 |     #or 'input' tag to be clickable as it most likely
13 |     #contains a link. It may make sense to also check
14 |     #<div> or other tags...
15 |     index_regex = re.compile(r'\[[^\]]*\]') #match index and id brackets
16 |     #check xpath for necessary tags
17 |     temp = re.sub(index_regex,'',xpath)
18 |     temp = temp.split('/')
19 |     if 'a' in temp or 'button' in temp or 'input' in temp:
20 |         return True
21 |     return False
22 |
23 | # ExtractXPath(element, use_id)
24 | # - element: a bs4 tag node
25 | # - use_id: defaults True
26 | #
27 | # Traverses up the tag tree of a Beautiful Soup node
28 | # to return the XPath of that node.
29 | #
30 | # Use of ids is preferred when the xpath will be used
31 | # outside of BeautifulSoup. Since an id is unique to
32 | # all elements of the tree, it allows the use of a
33 | # wildcard for all parent nodes. This minimizes the
34 | # chances of incorrect indexing (which can occur if
35 | # javascript changes a page during processing).
36 |
37 | class ExtractXPathError(Exception):
38 |     def __init__(self, value):
39 |         self.value = value
40 |     def __str__(self):
41 |         return repr(self.value)
42 |
43 | def check_previous_tags(node, use_id=True):
44 |     #index of node
45 |     counter = 1
46 |     for tag in node.previous_siblings:
47 |         if type(tag) != bs4.element.Tag:
48 |             continue
49 |         elif tag.name == node.name:
50 |             counter += 1
51 |
52 |     #XPath name
53 |     if counter > 1:
54 |         xpath = node.name + '[' + str(counter) + ']'
55 |     else:
56 |         xpath = node.name
57 |
58 |     return xpath
59 |
60 | def ExtractXPath(element, use_id = True):
61 |     # Check that element is a tag node
62 |     if type(element) != bs4.element.Tag:
63 |         raise ExtractXPathError(str(type(element)) +
64 |             ' is not a supported data type. Only tag nodes from the tag tree are accepted.')
65 |
66 |     ##### Starting node
67 |     #Check id first
68 |     if use_id and element.get('id') != None:
69 |         return '//*/' + element.name + '[@id=\"' + element.get('id') + '\"]'
70 |
71 |     xpath = check_previous_tags(element)
72 |
73 |     ##### Parent Nodes
74 |     for parent in element.parents:
75 |         #End of XPath - exclude from string
76 |         if parent.name == '[document]':
77 |             break
78 |
79 |         #Check id first
80 |         if use_id and parent.get('id') != None:
81 |             return '//*/' + parent.name + '[@id=\"' + parent.get('id') + '\"]/' + xpath
82 |
83 |         xpath = check_previous_tags(parent) + '/' + xpath
84 |
85 |     xpath = '/' + xpath
86 |     return xpath
87 |
88 | # xp1_wildcard adds wildcard functionality to XPath 1.0
89 | # strings using the limited function set supported by the 1.0
90 | # implementation.
91 | #
92 | # xp1_lowercase likewise adds lowercase functionality
93 | #
94 | # Hopefully you never need these...
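# Example usage of xp1_wildcard (hypothetical pattern): searching for ids
# like "ad_*_banner", e.g. '//div' + xp1_wildcard('@id', 'ad_*_banner'),
# produces the XPath 1.0 predicate:
#   [starts-with(normalize-space(@id), 'ad_') and
#    contains(substring(normalize-space(@id),
#             string-length(normalize-space(@id))-6), '_banner')]
# i.e. a prefix check plus a substring check that emulates ends-with(),
# which XPath 1.0 lacks.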
95 | 96 | def xp1_lowercase(string): 97 | return 'translate('+ string + ", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')" 98 | 99 | # Converts a string with a wildcard in it to an XPath 1.0 100 | # compatible string *** ONLY SUPPORTS 1 WILDCARD *** 101 | # string: string w/ wildcard that you are searching for 102 | # attr: tag attribute you are searching for (e.g. 'text()' or '@id' or ...) 103 | def xp1_wildcard(attr, string, normalize=True): 104 | parts = string.split('*') 105 | 106 | if normalize: 107 | attr = 'normalize-space(' + attr + ')' 108 | 109 | if len(parts) != 2: 110 | print "ERROR: This function is meant to support 1 wildcard" 111 | return '[' + attr + '=' + string + ']' 112 | else: 113 | pt1 = '' 114 | pt2 = '' 115 | 116 | if parts[0] != '': 117 | pt1 = 'starts-with(' + attr + ', \'' + parts[0] + '\')' 118 | if parts[1] != '': 119 | pt2 = ('contains(substring(' + attr + 120 | ', string-length(' + attr + ')-'+ str(len(parts[1])-1) + 121 | '), \'' + parts[1] + '\')') 122 | 123 | if pt1 == '' and pt2 != '': 124 | return '[' + pt2 + ']' 125 | elif pt1 != '' and pt2 == '': 126 | return '[' + pt1 + ']' 127 | elif pt1 != '' and pt2 != '': 128 | return ('[' + pt1 + ' and ' + pt2 + ']') 129 | else: 130 | print "ERROR: The string is empty" 131 | return '[' + attr + '=' + string + ']' 132 | 133 | if __name__=='__main__': 134 | #Output some sample XPaths 135 | print "--- Sample XPaths ---" 136 | import urllib2 137 | import re 138 | from random import choice 139 | rsp = urllib2.urlopen('http://www.reddit.com/') 140 | if rsp.getcode() == 200: 141 | soup = bs(rsp.read(), 'lxml') 142 | elements = soup.findAll(text = re.compile('[A-Za-z0-9]{10,}')) 143 | for i in range(0,5): 144 | element = choice(elements).parent 145 | print "HTML" 146 | print element 147 | print "XPath" 148 | print ExtractXPath(element) 149 | print "**************" 150 | -------------------------------------------------------------------------------- /automation/Commands/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Commands/utils/__init__.py -------------------------------------------------------------------------------- /automation/Commands/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # A collection of file utilities 2 | import shutil 3 | import os 4 | 5 | def rmsubtree(location): 6 | """Clears all subfolders and files in location""" 7 | for root, dirs, files in os.walk(location): 8 | for f in files: 9 | os.unlink(os.path.join(root, f)) 10 | for d in dirs: 11 | shutil.rmtree(os.path.join(root, d)) 12 | -------------------------------------------------------------------------------- /automation/Commands/utils/firefox_profile.py: -------------------------------------------------------------------------------- 1 | ### This is code adapted from KU Leuven crawler code written by 2 | ### Gunes Acar and Marc Juarez 3 | from glob import glob 4 | import sqlite3 5 | import time 6 | import os 7 | 8 | def tmp_sqlite_files_exist(path): 9 | """Check if temporary sqlite files(wal, shm) exist in a given path.""" 10 | return glob(os.path.join(path, '*-wal')) or \ 11 | glob(os.path.join(path, '*-shm')) 12 | 13 | 14 | def sleep_until_sqlite_checkpoint(profile_dir, timeout=60): 15 | """ 16 | We wait until all the shm and wal files are checkpointed to DB. 17 | https://www.sqlite.org/wal.html#ckpt. 
18 | """ 19 | while (timeout > 0 and tmp_sqlite_files_exist(profile_dir)): 20 | time.sleep(1) 21 | timeout -= 1 22 | print "Waited for %s seconds for sqlite checkpointing" % (60 - timeout) 23 | 24 | 25 | def get_localStorage(profile_directory, mod_since): 26 | #TODO how to support modified since??? 27 | ff_ls_file = os.path.join(profile_directory, 'webappsstore.sqlite') 28 | if not os.path.isfile(ff_ls_file): 29 | print "Cannot find localstorage DB %s" % ff_ls_file 30 | else: 31 | conn = sqlite3.connect(ff_ls_file) 32 | with conn: 33 | cur = conn.cursor() 34 | cur.execute('SELECT scope, KEY, value \ 35 | FROM webappsstore2 \ 36 | WHERE last;') 37 | rows = cur.fetchall() 38 | return rows 39 | 40 | def get_cookies(profile_directory, mod_since): 41 | cookie_db = os.path.join(profile_directory, 'cookies.sqlite') 42 | if not os.path.isfile(cookie_db): 43 | print "cannot find cookie.db", cookie_db 44 | else: 45 | conn = sqlite3.connect(cookie_db) 46 | with conn: 47 | c = conn.cursor() 48 | c.execute('SELECT baseDomain, name, value, host, path, expiry,\ 49 | lastAccessed, creationTime, isSecure, isHttpOnly \ 50 | FROM moz_cookies \ 51 | WHERE lastAccessed > ?;',(int(mod_since*1000000),)) 52 | rows = c.fetchall() 53 | return rows 54 | -------------------------------------------------------------------------------- /automation/Commands/utils/gen_utils.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | import os 3 | from time import sleep 4 | 5 | 6 | def get_last_crawled(log_file): 7 | last_line = "" 8 | for line in open(log_file): 9 | if "EXECUTING COMMAND: ('GET'" in line: 10 | last_line = line 11 | return int(last_line.split(", ")[-1].split(")")[0]) 12 | 13 | 14 | def poll_openwpm_log(log_file="~/openwpm/openwpm.log"): 15 | POLL_LOG_FREQ = 900 # sec 16 | log_file = os.path.expanduser(log_file) 17 | while True: 18 | last_crawled = get_last_crawled(log_file) 19 | print("last_crawled %s" % last_crawled) 20 | send_alert_email("Crawled %s sites" % last_crawled) 21 | sleep(POLL_LOG_FREQ) 22 | 23 | 24 | def send_alert_email(msg="Cannot reach the phone"): 25 | fromaddr = 'appmonit@gmail.com' 26 | toaddrs = 'appmonit@gmail.com' 27 | msg = 'Subject: %s\n\n%s' % ("[appmonit-alert]", msg) 28 | # Credentials (if needed) 29 | username = 'appmonit' 30 | password = 'appmonit1' # TODO change it 31 | # The actual mail send 32 | server = smtplib.SMTP('smtp.gmail.com:587') 33 | server.starttls() 34 | server.login(username, password) 35 | server.sendmail(fromaddr, toaddrs, msg) 36 | server.quit() 37 | 38 | 39 | if __name__ == '__main__': 40 | poll_openwpm_log() 41 | 42 | -------------------------------------------------------------------------------- /automation/Commands/utils/lso.py: -------------------------------------------------------------------------------- 1 | ### This is code adapted from KU Leuven crawler code written by 2 | ### Gunes Acar and Marc Juarez 3 | from pyamf import sol 4 | import fnmatch 5 | import os 6 | 7 | #TODO: Linux only 8 | FLASH_DIRS = ['~/.macromedia/Flash_Player/#SharedObjects/'] 9 | 10 | class FlashCookie(object): 11 | filename = '' 12 | domain = '' 13 | local_path = '' 14 | key = '' 15 | content = '' 16 | 17 | def gen_find_files(filepat, top): 18 | """ 19 | http://www.dabeaz.com/generators/ 20 | returns filenames that matches the given pattern under() a given dir 21 | """ 22 | for path, _, filelist in os.walk(top): 23 | for name in fnmatch.filter(filelist, filepat): 24 | yield os.path.join(path, name) 25 | 26 | def 
get_flash_cookies(mod_since=0): 27 | """Return a list of Flash cookies (Local Shared Objects).""" 28 | flash_cookies = list() 29 | for top_dir in FLASH_DIRS: 30 | top_dir = os.path.expanduser(top_dir) 31 | for lso_file in gen_find_files("*.sol", top_dir): 32 | mtime = os.path.getmtime(lso_file) 33 | if mtime > mod_since: 34 | try: 35 | flash_cookies.extend(parse_flash_cookies(lso_file)) 36 | except (KeyboardInterrupt, SystemExit): 37 | raise 38 | except Exception as e: 39 | print "Exception reading", lso_file 40 | print e 41 | pass 42 | return flash_cookies 43 | 44 | def parse_flash_cookies(lso_file): 45 | lso_dict = sol.load(lso_file) 46 | flash_cookies = list() 47 | for k, v in lso_dict.iteritems(): 48 | flash_cookie = FlashCookie() 49 | flash_cookie.local_path = lso_file.split("#SharedObjects/")[1] 50 | flash_cookie.filename = os.path.basename(lso_file) 51 | flash_cookie.domain = lso_file.split("#SharedObjects/")[1].split("/")[1] 52 | flash_cookie.key = unicode(k) 53 | try: 54 | flash_cookie.content = unicode(v) 55 | except UnicodeDecodeError: 56 | # obj is byte string 57 | ascii_text = str(v).encode('string_escape') 58 | flash_cookie.content = unicode(ascii_text) 59 | 60 | flash_cookies.append(flash_cookie) 61 | return flash_cookies 62 | -------------------------------------------------------------------------------- /automation/Commands/utils/webdriver_extensions.py: -------------------------------------------------------------------------------- 1 | # A set of extensions to the functions normally provided by the selenium 2 | # webdriver. These are primarily for parsing and searching. 3 | from selenium.webdriver.support import expected_conditions as EC 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.common.exceptions import TimeoutException 6 | from selenium.common.exceptions import ElementNotVisibleException 7 | from selenium.common.exceptions import NoSuchElementException 8 | from urlparse import urljoin 9 | import random 10 | import time 11 | 12 | from ...utilities import domain_utils as du 13 | import XPathUtil 14 | 15 | #### Basic functions 16 | def scroll_down(driver): 17 | at_bottom = False 18 | while random.random() > .20 and not at_bottom: 19 | k = str(10 + int(200*random.random())) 20 | driver.execute_script("window.scrollBy(0,"+k+")") 21 | at_bottom = driver.execute_script("return (((window.scrollY + window.innerHeight ) +100 > document.body.clientHeight ))") 22 | time.sleep(0.5 + random.random()) 23 | 24 | def scroll_to_bottom(driver): 25 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 26 | return 27 | 28 | def is_loaded(webdriver): 29 | return (webdriver.execute_script("return document.readyState") == "complete") 30 | 31 | def wait_until_loaded(webdriver, timeout, period=0.25): 32 | mustend = time.time() + timeout 33 | while time.time() < mustend: 34 | if is_loaded(webdriver): return True 35 | time.sleep(period) 36 | return False 37 | 38 | def get_intra_links(webdriver, url): 39 | ps1 = du.get_ps_plus_1(url) 40 | links = filter(lambda x: (x.get_attribute("href") and 41 | du.get_ps_plus_1(urljoin(url, x.get_attribute("href"))) == ps1), 42 | webdriver.find_elements_by_tag_name("a")) 43 | return links 44 | 45 | ##### Search/Block Functions 46 | # locator_type: a text representation of the standard 47 | # webdriver.find_element_by_* functions. You can either 48 | # import selenium.webdriver.common.by.By and use By.LINK_TEXT, etc. 49 | # or just remember the string representations. 
For example: 50 | # By.LINK_TEXT is 'link text' 51 | # By.CSS_SELECTOR is 'css selector' 52 | # By.NAME is 'name' ... and so on 53 | # locator: string that you are looking for 54 | def wait_and_find(driver, locator_type, locator, timeout=3, check_iframes=True): 55 | if is_found(driver, locator_type, locator, timeout): 56 | return driver.find_element(locator_type, locator) 57 | else: 58 | if check_iframes: #this may return the browser with an iframe active 59 | driver.switch_to_default_content() 60 | iframes = driver.find_elements_by_tag_name('iframe') 61 | 62 | for iframe in iframes: 63 | driver.switch_to_default_content() 64 | driver.switch_to_frame(iframe) 65 | if is_found(driver, locator_type, locator, timeout=0): 66 | return driver.find_element(locator_type, locator) 67 | 68 | #If we get here, search also fails in iframes 69 | driver.switch_to_default_content() 70 | raise NoSuchElementException, "Element not found during wait_and_find" 71 | 72 | def is_found(driver, locator_type, locator, timeout=3): 73 | try: 74 | w = WebDriverWait(driver, timeout) 75 | w.until(lambda d: d.find_element(locator_type, locator)) 76 | return True 77 | except TimeoutException: 78 | return False 79 | 80 | def is_visible(driver, locator_type, locator, timeout=3): 81 | try: 82 | w = WebDriverWait(driver, timeout) 83 | w.until(EC.visibility_of_element_located((locator_type, locator))) 84 | return True 85 | except TimeoutException: 86 | return False 87 | 88 | def title_is(driver, title, timeout=3): 89 | try: 90 | w = WebDriverWait(driver, timeout) 91 | w.until(EC.title_is(title)) 92 | return True 93 | except TimeoutException: 94 | return False 95 | 96 | def title_contains(driver, title, timeout=3): 97 | try: 98 | w = WebDriverWait(driver, timeout) 99 | w.until(EC.title_contains(title)) 100 | return True 101 | except TimeoutException: 102 | return False 103 | 104 | #Selenium requires an element to be visible and enabled to be 105 | #clickable. We extend that to require it to have a tag capable 106 | #of containing a link. NOTE: doesn't work 100% 107 | def is_clickable(driver, full_xpath, xpath, timeout = 1): 108 | try: 109 | w = WebDriverWait(driver, timeout) 110 | w.until(EC.element_to_be_clickable(('xpath',xpath))) 111 | return XPathUtil.is_clickable(full_xpath) 112 | except (TimeoutException, ElementNotVisibleException): 113 | return False 114 | 115 | #TODO Update this. No direct access to DB right now 116 | ''' 117 | #get and set xpaths into xpath database 118 | def get_xpath(driver, url, name): 119 | cur = self.db.cursor() 120 | cur.execute("SELECT xpath FROM xpath WHERE url = ? AND name = ?",(url, name)) 121 | response = cur.fetchone() 122 | if response == None: 123 | return None 124 | else: 125 | return response[0] 126 | 127 | def set_xpath(driver, url, name, xpath, absolute_xpath = None): 128 | cur = self.db.cursor() 129 | if self.mp_lock is not None: 130 | self.mp_lock.acquire() 131 | cur.execute("UPDATE xpath SET xpath = ?, absolute_xpath = ? \ 132 | WHERE url = ? AND name = ?", (xpath, absolute_xpath, url, name)) 133 | if cur.rowcount == 0: #occurs when record does not already exist 134 | cur.execute("INSERT INTO xpath (name, url, xpath, absolute_xpath) VALUES (?,?,?,?)", 135 | (name, url, xpath, absolute_xpath)) 136 | self.db.commit() 137 | if self.mp_lock is not None: 138 | self.mp_lock.release() 139 | return cur.lastrowid 140 | ''' 141 | 142 | #Click an xpath using javascript -- not working correctly 143 | #gets around visibility requirements of selenium. 
144 | #def click_xpath(driver, xpath):
145 | #    driver.execute_script('$(document.evaluate('+xpath+', document, null, 9, null).singleNodeValue).click();')
--------------------------------------------------------------------------------
/automation/DataAggregator/DataAggregator.py:
--------------------------------------------------------------------------------
1 | from ..SocketInterface import serversocket
2 | from ..MPLogger import loggingclient
3 | from sqlite3 import OperationalError
4 | from sqlite3 import ProgrammingError
5 | import sqlite3
6 | import time
7 | import os
8 |
9 |
10 | def DataAggregator(manager_params, status_queue, commit_batch_size=1000):
11 |     """
12 |     Receives SQL queries from other processes and writes them to the central database
13 |     Executes queries until being told to die (then it will finish work and shut down)
14 |     This process should never be terminated un-gracefully
15 |     Currently uses SQLite but may move to different platform
16 |
17 |     <manager_params> TaskManager configuration parameters
18 |     <status_queue> is a queue connected to the TaskManager used for communication
19 |     <commit_batch_size> is the number of execution statements that should be made before a commit (used for speedup)
20 |     """
21 |
22 |     # sets up DB connection
23 |     db_path = manager_params['database_name']
24 |     db = sqlite3.connect(db_path, check_same_thread=False)
25 |     curr = db.cursor()
26 |
27 |     # sets up logging connection
28 |     logger = loggingclient(*manager_params['logger_address'])
29 |
30 |     # sets up the serversocket to start accepting connections
31 |     sock = serversocket()
32 |     status_queue.put(sock.sock.getsockname())  # let TM know location
33 |     sock.start_accepting()
34 |
35 |     counter = 0  # number of executions made since last commit
36 |     commit_time = 0  # keep track of time since last commit
37 |     while True:
38 |         # received KILL command from TaskManager
39 |         if not status_queue.empty():
40 |             status_queue.get()
41 |             sock.close()
42 |             drain_queue(sock.queue, curr, logger)
43 |             break
44 |
45 |         # no command for now -> sleep to avoid pegging CPU on blocking get
46 |         if sock.queue.empty():
47 |             time.sleep(0.001)
48 |
49 |             # commit every five seconds to avoid blocking the db for too long
50 |             if counter > 0 and time.time() - commit_time > 5:
51 |                 db.commit()
52 |             continue
53 |
54 |         # process query
55 |         query = sock.queue.get()
56 |         process_query(query, curr, logger)
57 |
58 |         # batch commit if necessary
59 |         counter += 1
60 |         if counter >= commit_batch_size:
61 |             counter = 0
62 |             commit_time = time.time()
63 |             db.commit()
64 |
65 |     # finishes work and gracefully stops
66 |     db.commit()
67 |     db.close()
68 |
69 |
70 | def process_query(query, curr, logger):
71 |     """
72 |     executes a query of form (template_string, arguments) against cursor <curr>
73 |     unsupported queries are logged and skipped
74 |     """
75 |     if len(query) != 2:
76 |         print "ERROR: Query is not the correct length"
77 |         return
78 |     statement = query[0]
79 |     args = list(query[1])
80 |     for i in range(len(args)):
81 |         if type(args[i]) == str:
82 |             args[i] = unicode(args[i], errors='ignore')
83 |         elif callable(args[i]):
84 |             args[i] = str(args[i])
85 |     try:
86 |         if len(args) == 0:
87 |             curr.execute(statement)
88 |         else:
89 |             curr.execute(statement, args)
90 |     except OperationalError as e:
91 |         logger.error("Unsupported query" + '\n' + str(type(e)) + '\n' + str(e) + '\n' + statement + '\n' + str(args))
92 |         pass
93 |     except ProgrammingError as e:
94 |         logger.error("Unsupported query" + '\n' + str(type(e)) + '\n' + str(e) + '\n' + statement + '\n' + str(args))
95 |         pass
96 |
97 |
98 | def drain_queue(sock_queue, curr, logger):
99 |     """ Ensures queue is empty before closing """
100 |     time.sleep(3)  # TODO: the socket needs a better way of closing
101 |     while not sock_queue.empty():
102 |         query = sock_queue.get()
103 |         process_query(query, curr, logger)
--------------------------------------------------------------------------------
/automation/DataAggregator/LevelDBAggregator.py:
--------------------------------------------------------------------------------
1 | from ..SocketInterface import serversocket
2 | from ..MPLogger import loggingclient
3 | import plyvel
4 | import time
5 | import os
6 |
7 |
8 | def LevelDBAggregator(manager_params, status_queue, batch_size=100):
9 |     """
10 |     Receives <content, content_hash> pairs from other processes and writes them to the
11 |     central database. Executes queries until being told to die (then it will
12 |     finish work and shut down). This process should never be terminated
13 |     un-gracefully.
14 |
15 |     <manager_params> TaskManager configuration parameters
16 |     <status_queue> is a queue connected to the TaskManager used for communication
17 |     <batch_size> is the size of the write batch
18 |     """
19 |
20 |     # sets up logging connection
21 |     logger = loggingclient(*manager_params['logger_address'])
22 |
23 |     # sets up the serversocket to start accepting connections
24 |     sock = serversocket()
25 |     status_queue.put(sock.sock.getsockname())  # let TM know location
26 |     sock.start_accepting()
27 |
28 |     # sets up DB connection
29 |     db_path = os.path.join(manager_params['data_directory'], 'javascript.ldb')
30 |     db = plyvel.DB(db_path,
31 |                    create_if_missing = True,
32 |                    lru_cache_size = 10**9,
33 |                    write_buffer_size = 128*10**4,
34 |                    compression = 'snappy')
35 |     batch = db.write_batch()
36 |
37 |     counter = 0  # number of executions made since last write
38 |     commit_time = 0  # keep track of time since last write
39 |     while True:
40 |         # received KILL command from TaskManager
41 |         if not status_queue.empty():
42 |             status_queue.get()
43 |             sock.close()
44 |             drain_queue(sock.queue, batch, db, counter, logger)
45 |             break
46 |
47 |         # no command for now -> sleep to avoid pegging CPU on blocking get
48 |         if sock.queue.empty():
49 |             time.sleep(0.1)
50 |
51 |             # commit every five seconds to avoid blocking the db for too long
52 |             if counter > 0 and time.time() - commit_time > 5:
53 |                 batch.write()
54 |                 batch = db.write_batch()
55 |             continue
56 |
57 |         # process record
58 |         content, content_hash = sock.queue.get()
59 |         counter = process_content(content, content_hash,
60 |                                    batch, db, counter, logger)
61 |
62 |         # batch commit if necessary
63 |         if counter >= batch_size:
64 |             counter = 0
65 |             commit_time = time.time()
66 |             batch.write()
67 |             batch = db.write_batch()
68 |
69 |     # finishes work and gracefully stops
70 |     batch.write()
71 |     db.close()
72 |
73 | def process_content(content, content_hash, batch, db, counter, logger):
74 |     """
75 |     adds content to the batch if its hash is not already stored
76 |     """
77 |     content = content.encode('utf-8')
78 |     content_hash = str(content_hash)
79 |     if db.get(content_hash) is not None:
80 |         return counter
81 |
82 |     batch.put(content_hash, content)
83 |     return counter + 1
84 |
85 | def drain_queue(sock_queue, batch, db, counter, logger):
86 |     """ Ensures queue is empty before closing """
87 |     time.sleep(3)  # TODO: the socket needs a better way of closing
88 |     while not sock_queue.empty():
89 |         content, content_hash = sock_queue.get()
90 |         counter = process_content(content, content_hash,
91 |                                    batch, db, counter, logger)
--------------------------------------------------------------------------------
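For context, the de-duplicated script content written by LevelDBAggregator above can be read back after a crawl with `plyvel` directly: keys are the content hashes sent over the socket, values are the UTF-8 encoded script bodies. A minimal read-back sketch (Python 2, matching the codebase; the database path is illustrative):

```python
# Read-back sketch for javascript.ldb (hypothetical path, not part of the crawler).
import plyvel

db = plyvel.DB('/path/to/data_directory/javascript.ldb')
for content_hash, content in db.iterator():
    # keys are content hashes, values are UTF-8 encoded script bodies
    print content_hash, len(content)
db.close()
```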
/automation/DataAggregator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/DataAggregator/__init__.py -------------------------------------------------------------------------------- /automation/DeployBrowsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/DeployBrowsers/__init__.py -------------------------------------------------------------------------------- /automation/DeployBrowsers/deploy_browser.py: -------------------------------------------------------------------------------- 1 | import deploy_firefox 2 | from ..Errors import BrowserConfigError 3 | 4 | def deploy_browser(status_queue, browser_params, manager_params, crash_recovery): 5 | """ receives a dictionary of browser parameters and passes it to the relevant constructor """ 6 | if browser_params['browser'].lower() == 'chrome': 7 | raise BrowserConfigError("Chrome is not supported. OpenWPM currently " 8 | "only supports measurement with Firefox.") 9 | if browser_params['browser'].lower() == 'firefox': 10 | return deploy_firefox.deploy_firefox(status_queue, browser_params, manager_params, crash_recovery) 11 | -------------------------------------------------------------------------------- /automation/DeployBrowsers/firefox_extensions/adblock_plus-2.7.xpi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/DeployBrowsers/firefox_extensions/adblock_plus-2.7.xpi -------------------------------------------------------------------------------- /automation/DeployBrowsers/firefox_extensions/ghostery/ghostery-5.4.10.xpi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/DeployBrowsers/firefox_extensions/ghostery/ghostery-5.4.10.xpi -------------------------------------------------------------------------------- /automation/DeployBrowsers/firefox_extensions/https_everywhere-5.1.0.xpi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/DeployBrowsers/firefox_extensions/https_everywhere-5.1.0.xpi -------------------------------------------------------------------------------- /automation/DeployBrowsers/screen_resolutions.txt: -------------------------------------------------------------------------------- 1 | 1920,1080 2 | 1366,768 3 | 1280,1024 4 | 1280,800 5 | 1024,768 6 | -------------------------------------------------------------------------------- /automation/DeployBrowsers/user_agent_strings.txt: -------------------------------------------------------------------------------- 1 | Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0 2 | Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0 3 | Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0 4 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0 5 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0 6 | Mozilla/5.0 (Windows 
NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0 7 | Mozilla/5.0 (Windows NT 6.1; rv:28.0) Gecko/20100101 Firefox/28.0 8 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17 9 | Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36 10 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36 11 | Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0 12 | Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) 13 | -------------------------------------------------------------------------------- /automation/Errors.py: -------------------------------------------------------------------------------- 1 | """ OpenWPM Custom Errors """ 2 | 3 | class CommandExecutionError(Exception): 4 | """ Raise for errors related to executing commands """ 5 | def __init__(self, message, command, *args): 6 | self.message = message 7 | self.command = command 8 | super(CommandExecutionError, self).__init__(message, command, *args) 9 | 10 | class ProfileLoadError(Exception): 11 | """ Raise for errors that occur while loading profile """ 12 | def __init__(self, message, *args): 13 | self.message = message 14 | super(ProfileLoadError, self).__init__(message, *args) 15 | 16 | class BrowserConfigError(Exception): 17 | """ Raise for errors that occur from a misconfiguration of the browser """ 18 | def __init__(self, message, *args): 19 | self.message = message 20 | super(BrowserConfigError, self).__init__(message, *args) 21 | 22 | class BrowserCrashError(Exception): 23 | """ Raise for non-critical crashes within the BrowserManager process """ 24 | def __init__(self, message, *args): 25 | self.message = message 26 | super(BrowserCrashError, self).__init__(message, *args) 27 | -------------------------------------------------------------------------------- /automation/Extension/firefox/data/create_content_policy_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS content_policy( 2 | id INTEGER PRIMARY KEY ASC, 3 | crawl_id INTEGER, 4 | content_type INTEGER, 5 | content_location TEXT, 6 | request_origin TEXT, 7 | mime_type_guess TEXT, 8 | page_id INTEGER, 9 | visit_id INTEGER 10 | ); 11 | -------------------------------------------------------------------------------- /automation/Extension/firefox/data/create_cookies_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS javascript_cookies( 2 | id INTEGER PRIMARY KEY ASC, 3 | crawl_id INTEGER, 4 | visit_id INTEGER, 5 | change TEXT, 6 | creationTime DATETIME, 7 | expiry DATETIME, 8 | is_http_only INTEGER, 9 | is_session INTEGER, 10 | last_accessed DATETIME, 11 | raw_host TEXT, 12 | expires INTEGER, 13 | host TEXT, 14 | is_domain INTEGER, 15 | is_secure INTEGER, 16 | name TEXT, 17 | path TEXT, 18 | policy INTEGER, 19 | status INTEGER, 20 | value TEXT 21 | ); 22 | -------------------------------------------------------------------------------- /automation/Extension/firefox/data/create_http_requests_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS http_requests( 2 | id INTEGER PRIMARY KEY AUTOINCREMENT, 3 | crawl_id INTEGER NOT NULL, 4 | visit_id 
INTEGER NOT NULL, 5 | url TEXT NOT NULL, 6 | top_level_url TEXT, 7 | method TEXT NOT NULL, 8 | referrer TEXT NOT NULL, 9 | headers TEXT NOT NULL, 10 | is_XHR BOOLEAN, 11 | is_frame_load BOOLEAN, 12 | is_full_page BOOLEAN, 13 | is_third_party_channel BOOLEAN, 14 | is_third_party_window BOOLEAN, 15 | triggering_origin TEXT, 16 | loading_origin TEXT, 17 | loading_href TEXT, 18 | req_call_stack TEXT, 19 | content_policy_type INTEGER NOT NULL, 20 | post_body TEXT, 21 | time_stamp TEXT NOT NULL 22 | ); 23 | -------------------------------------------------------------------------------- /automation/Extension/firefox/data/create_http_responses_table.sql: -------------------------------------------------------------------------------- 1 | /* TODO: link with requests */ 2 | CREATE TABLE IF NOT EXISTS http_responses( 3 | id INTEGER PRIMARY KEY AUTOINCREMENT, 4 | crawl_id INTEGER NOT NULL, 5 | visit_id INTEGER NOT NULL, 6 | url TEXT NOT NULL, 7 | method TEXT NOT NULL, 8 | referrer TEXT NOT NULL, 9 | response_status INTEGER NOT NULL, 10 | response_status_text TEXT NOT NULL, 11 | is_cached BOOLEAN NOT NULL, 12 | headers TEXT NOT NULL, 13 | location TEXT NOT NULL, 14 | time_stamp TEXT NOT NULL, 15 | content_hash TEXT 16 | ); 17 | -------------------------------------------------------------------------------- /automation/Extension/firefox/data/create_javascript_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS javascript( 2 | id INTEGER PRIMARY KEY, 3 | crawl_id INTEGER, 4 | visit_id INTEGER, 5 | script_url TEXT, 6 | script_line TEXT, 7 | script_col TEXT, 8 | func_name TEXT, 9 | script_loc_eval TEXT, 10 | call_stack TEXT, 11 | symbol TEXT, 12 | operation TEXT, 13 | value TEXT, 14 | arguments TEXT, 15 | time_stamp TEXT NOT NULL 16 | ); 17 | -------------------------------------------------------------------------------- /automation/Extension/firefox/data/create_pages_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS pages( 2 | id INTEGER PRIMARY KEY ASC, 3 | crawl_id INTEGER, 4 | visit_id INTEGER, 5 | location TEXT, 6 | parent_id INTEGER 7 | ); 8 | -------------------------------------------------------------------------------- /automation/Extension/firefox/data/remove_webdriver_attributes.js: -------------------------------------------------------------------------------- 1 | // We don't know the order the content scripts will load 2 | // so let's try to remove the attributes now (if they already exist) 3 | // or register an event handler if they don't. 
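// For reference, these appear to be the relevant spots in Selenium's firefox-driver: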
4 | // * https://github.com/SeleniumHQ/selenium/blob/b82512999938d41f6765ce8017284dcabe437d4c/javascript/firefox-driver/extension/content/server.js#L49 5 | // * https://github.com/SeleniumHQ/selenium/blob/b82512999938d41f6765ce8017284dcabe437d4c/javascript/firefox-driver/extension/content/dommessenger.js#L98 6 | function getPageScript() { 7 | // return a string 8 | return "(" + function() { 9 | if ("webdriver" in navigator) { 10 | console.log("Webdriver attributes present, remove immediately"); 11 | // Attributes can be removed immediately 12 | document.documentElement.removeAttribute("webdriver"); 13 | delete window.navigator["webdriver"]; 14 | console.log("Webdriver attributes removed!"); 15 | } else { 16 | // Listener for `document` attribute 17 | document.addEventListener("DOMAttrModified", function monitor(ev) { 18 | console.log("Removing webdriver attribute from document"); 19 | document.documentElement.removeAttribute("webdriver"); 20 | document.removeEventListener("DOMAttrModified", monitor, false); 21 | }, false); 22 | 23 | // Prevent webdriver attribute from getting set on navigator 24 | var originalDefineProperty = Object.defineProperty; 25 | Object.defineProperty(Object, 'defineProperty', { 26 | value: function(obj, prop, descriptor) { 27 | if (obj == window.navigator && prop == 'webdriver') { 28 | console.log("Preventing definition of webdriver property on navigator."); 29 | 30 | // Return Object.defineProperty to original state 31 | Object.defineProperty(Object, 'defineProperty', { 32 | value: originalDefineProperty 33 | }); 34 | return undefined; 35 | } 36 | return originalDefineProperty(obj, prop, descriptor); 37 | } 38 | }); 39 | console.log("Webdriver attribute handlers started!"); 40 | } 41 | } + "());"; 42 | } 43 | 44 | function insertScript(text) { 45 | var parent = document.documentElement, 46 | script = document.createElement('script'); 47 | script.text = text; 48 | script.async = false; 49 | 50 | parent.insertBefore(script, parent.firstChild); 51 | parent.removeChild(script); 52 | } 53 | insertScript(getPageScript()); 54 | -------------------------------------------------------------------------------- /automation/Extension/firefox/data/trigger_sensor_events.js: -------------------------------------------------------------------------------- 1 | function getPageScript() { 2 | // return a string 3 | 4 | return "(" + function() { 5 | // Trigger a batch of fake sensor events every 20 ms after page load 6 | setInterval(trigger_sensor_events, 20); 7 | 8 | function trigger_sensor_events(){ 9 | trigger_devicelight_event(); 10 | // setTimeout(trigger_lightlevel_event, 100); 11 | setTimeout(trigger_deviceproximity_event, 200); 12 | setTimeout(trigger_userproximity_event, 400); 13 | setTimeout(trigger_deviceorientation_event, 600); 14 | setTimeout(trigger_devicemotion_event, 800); 15 | } 16 | 17 | function trigger_devicelight_event(){ 18 | var devicelight_event = new DeviceLightEvent('devicelight', { 19 | 'value': 987, 20 | 'bubbles': true, 21 | 'cancelable': true 22 | }); 23 | window.dispatchEvent(devicelight_event) 24 | } 25 | 26 | function trigger_lightlevel_event(){ 27 | // This is not supported and causes a JS error on Firefox 28 | // Let's not use it 29 | var lightlevel_event = new LightLevelEvent('lightlevel', { 30 | 'value': "bright", 31 | 'bubbles': true, 32 | 'cancelable': true 33 | }); 34 | // window.dispatchEvent(lightlevel_event) 35 | } 36 | 37 | function trigger_deviceproximity_event(){ 38 | // Firefox and Chrome on Android don't seem to support this event 39 | var
deviceproximity_event = new DeviceProximityEvent('deviceproximity', { 40 | 'min': 0, 41 | 'max': 100, 42 | 'value': 3, 43 | 'bubbles': true, 44 | 'cancelable': true 45 | }); 46 | window.dispatchEvent(deviceproximity_event) 47 | } 48 | 49 | function trigger_userproximity_event(){ 50 | var userproximity_event = new UserProximityEvent('userproximity', { 51 | 'near': true, 52 | 'bubbles': true, 53 | 'cancelable': true 54 | }); 55 | window.dispatchEvent(userproximity_event) 56 | } 57 | 58 | function trigger_deviceorientation_event(){ 59 | var deviceorientation_event = new DeviceOrientationEvent('deviceorientation', { 60 | 'alpha': 43.1234 + random_fraction(), 61 | 'beta': 32.9876 + random_fraction(), 62 | 'gamma': 21.6543 + random_fraction(), 63 | 'bubbles': true, 64 | 'cancelable': true 65 | }); 66 | window.dispatchEvent(deviceorientation_event) 67 | } 68 | 69 | function trigger_devicemotion_event(){ 70 | var devicemotion_event = new DeviceMotionEvent('devicemotion', { 71 | 'acceleration':{ 72 | 'x':0.1256 + random_fraction(), 73 | 'y':-0.1234 + random_fraction(), 74 | 'z':-0.1845 + random_fraction() 75 | }, 76 | 'accelerationIncludingGravity':{ 77 | 'x':0.0256 + random_fraction(), 78 | 'y':0.1234 + random_fraction(), 79 | 'z':9.7568 + random_fraction() 80 | }, 81 | 'rotationRate':{ 82 | 'alpha':0.0005 + random_fraction(), 83 | 'beta':0.0034 + random_fraction(), 84 | 'gamma':-0.0048 + random_fraction() 85 | }, 86 | 'interval': 16.6660 + random_fraction(), 87 | 'bubbles': true, 88 | 'cancelable': true 89 | }); 90 | window.dispatchEvent(devicemotion_event) 91 | } 92 | 93 | function random_fraction(leading_zeroes){ 94 | var leading_zeroes = leading_zeroes || 5; 95 | return Math.random() / Math.pow(10, leading_zeroes); 96 | } 97 | 98 | console.log("Fake sensor events will be dispatched!"); 99 | 100 | } + "());"; 101 | } 102 | 103 | 104 | function insertScript(text) { 105 | var parent = document.documentElement, 106 | script = document.createElement('script'); 107 | script.text = text; 108 | script.async = false; 109 | 110 | parent.insertBefore(script, parent.firstChild); 111 | parent.removeChild(script); 112 | } 113 | insertScript(getPageScript()); 114 | -------------------------------------------------------------------------------- /automation/Extension/firefox/doc/main.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Extension/firefox/doc/main.md -------------------------------------------------------------------------------- /automation/Extension/firefox/index.js: -------------------------------------------------------------------------------- 1 | const fileIO = require("sdk/io/file"); 2 | const system = require("sdk/system"); 3 | const pageMod = require("sdk/page-mod"); 4 | const data = require("sdk/self").data; 5 | var loggingDB = require("./lib/loggingdb.js"); 6 | var pageManager = require("./lib/page-manager.js"); 7 | var cookieInstrument = require("./lib/cookie-instrument.js"); 8 | var jsInstrument = require("./lib/javascript-instrument.js"); 9 | var cpInstrument = require("./lib/content-policy-instrument.js"); 10 | var httpInstrument = require("./lib/http-instrument.js"); 11 | 12 | 13 | exports.main = function(options, callbacks) { 14 | 15 | // Read the browser configuration from file 16 | var path = system.pathFor("ProfD") + '/browser_params.json'; 17 | if (fileIO.exists(path)) { 18 | var config = JSON.parse(fileIO.read(path, 'r')); 19 | 
console.log("Browser Config:", config); 20 | } else { 21 | console.log("WARNING: config not found. Assuming this is a test run of", 22 | "the extension. Outputting all queries to console."); 23 | var config = { 24 | sqlite_address:null, 25 | leveldb_address:null, 26 | logger_address:null, 27 | disable_webdriver_self_id:true, 28 | cookie_instrument:true, 29 | js_instrument:true, 30 | cp_instrument:true, 31 | http_instrument:true, 32 | save_javascript:true, 33 | testing:true, 34 | crawl_id:'' 35 | }; 36 | } 37 | 38 | loggingDB.open(config['sqlite_address'], 39 | config['leveldb_address'], 40 | config['logger_address'], 41 | config['crawl_id']); 42 | 43 | // Prevent the webdriver from identifying itself in the DOM. See #91 44 | if (config['disable_webdriver_self_id']) { 45 | loggingDB.logDebug("Disabling webdriver self identification"); 46 | pageMod.PageMod({ 47 | include: "*", 48 | contentScriptWhen: "start", 49 | contentScriptFile: data.url("remove_webdriver_attributes.js") 50 | }); 51 | } 52 | // Trigger artificial sensor events 53 | if (config['trigger_sensor_events']) { 54 | console.log("Enabling fake sensor events"); 55 | pageMod.PageMod({ 56 | include: "*", 57 | contentScriptWhen: "ready", 58 | contentScriptFile: data.url("trigger_sensor_events.js") 59 | }); 60 | } 61 | if (config['cookie_instrument']) { 62 | loggingDB.logDebug("Cookie instrumentation enabled"); 63 | cookieInstrument.run(config['crawl_id']); 64 | } 65 | if (config['js_instrument']) { 66 | loggingDB.logDebug("Javascript instrumentation enabled"); 67 | jsInstrument.run(config['crawl_id'], config['testing']); 68 | } 69 | if (config['cp_instrument']) { 70 | loggingDB.logDebug("Content Policy instrumentation enabled"); 71 | cpInstrument.run(config['crawl_id']); 72 | } 73 | if (config['http_instrument']) { 74 | loggingDB.logDebug("HTTP Instrumentation enabled"); 75 | httpInstrument.run(config['crawl_id'], config['save_javascript']); 76 | } 77 | }; 78 | -------------------------------------------------------------------------------- /automation/Extension/firefox/lib/content-policy-instrument.js: -------------------------------------------------------------------------------- 1 | const {Cc, Ci, components} = require("chrome"); 2 | const data = require("sdk/self").data; 3 | var { Class } = require('sdk/core/heritage'); 4 | var { xpcom, Unknown, Service } = require('sdk/platform/xpcom'); 5 | var uuid = require('sdk/util/uuid').uuid(); 6 | var loggingDB = require("./loggingdb.js"); 7 | var pageManager = require("./page-manager.js"); 8 | 9 | exports.run = function(crawlID) { 10 | 11 | // Set up logging 12 | var createContentPolicyTable = data.load("create_content_policy_table.sql"); 13 | loggingDB.executeSQL(createContentPolicyTable, false); 14 | 15 | // Instrument content policy API 16 | // Provides additional information about what caused a request and what it's for 17 | var InstrumentContentPolicy = Class({ 18 | extends: Unknown, 19 | interfaces: [ "nsIContentPolicy" ], 20 | 21 | shouldLoad: function(contentType, contentLocation, requestOrigin, context, mimeTypeGuess, extra) { 22 | var update = { }; 23 | update["crawl_id"] = crawlID; 24 | update["content_type"] = contentType; 25 | update["content_location"] = loggingDB.escapeString(contentLocation.spec); 26 | update["request_origin"] = loggingDB.escapeString(requestOrigin ? 
requestOrigin.spec : ""); 27 | update["page_id"] = -1; 28 | if(context) { 29 | var domNode = null; 30 | var domWindow = null; 31 | try { domNode = context.QueryInterface(Ci.nsIDOMNode); } 32 | catch(error) { } 33 | try { domWindow = context.QueryInterface(Ci.nsIDOMWindow); } 34 | catch(error) { } 35 | var window = null; 36 | if(domNode && domNode.ownerDocument && domNode.ownerDocument.defaultView) 37 | window = domNode.ownerDocument.defaultView; 38 | //document = domNode.ownerDocument; 39 | if(domWindow) 40 | window = domWindow; 41 | if(window) { 42 | update["page_id"] = pageManager.pageIDFromWindow(window); 43 | } 44 | } 45 | update["mime_type_guess"] = loggingDB.escapeString(mimeTypeGuess ? mimeTypeGuess : ""); 46 | 47 | loggingDB.executeSQL(loggingDB.createInsert("content_policy", update), true); 48 | 49 | return Ci.nsIContentPolicy.ACCEPT; 50 | }, 51 | 52 | // Fires infrequently, instrumentation unused 53 | shouldProcess: function(contentType, contentLocation, requestOrigin, context, mimeType, extra) { 54 | return Ci.nsIContentPolicy.ACCEPT; 55 | } 56 | }); 57 | 58 | var contractID = "@stanford.edu/instrument-content-policy;1"; 59 | 60 | var instrumentContentPolicyService = Service({ 61 | contract: contractID, 62 | Component: InstrumentContentPolicy 63 | }); 64 | 65 | var categoryManager = Cc["@mozilla.org/categorymanager;1"].getService(Ci.nsICategoryManager); 66 | categoryManager.addCategoryEntry("content-policy", contractID, contractID, false, false); 67 | 68 | }; 69 | -------------------------------------------------------------------------------- /automation/Extension/firefox/lib/cookie-instrument.js: -------------------------------------------------------------------------------- 1 | const {Cc, Ci} = require("chrome"); 2 | var events = require("sdk/system/events"); 3 | const data = require("sdk/self").data; 4 | var loggingDB = require("./loggingdb.js"); 5 | 6 | exports.run = function(crawlID) { 7 | 8 | // Set up logging 9 | var createCookiesTable = data.load("create_cookies_table.sql"); 10 | loggingDB.executeSQL(createCookiesTable, false); 11 | 12 | // Instrument cookie changes 13 | events.on("cookie-changed", function(event) { 14 | var data = event.data; 15 | // TODO: Support other cookie operations 16 | if(data == "deleted" || data == "added" || data == "changed") { 17 | var update = {}; 18 | update["change"] = loggingDB.escapeString(data); 19 | update["crawl_id"] = crawlID; 20 | 21 | var cookie = event.subject.QueryInterface(Ci.nsICookie2); 22 | 23 | // Creation time (in microseconds) 24 | var creationTime = new Date(cookie.creationTime / 1000); // requires milliseconds 25 | update["creationTime"] = creationTime.toLocaleFormat('%Y-%m-%d %H:%M:%S'); 26 | 27 | // Expiry time (in seconds) 28 | // May return ~Max(int64). I believe this is a session 29 | // cookie which doesn't expire. Sessions cookies with 30 | // non-max expiry time expire after session or at expiry. 
31 | var expiryTime = cookie.expiry; // returns seconds 32 | if (expiryTime == 9223372036854776000) { 33 | var expiryTimeString = '9999-12-31 23:59:59'; 34 | } else { 35 | var expiryTimeDate = new Date(expiryTime * 1000); // requires milliseconds 36 | var expiryTimeString = expiryTimeDate.toLocaleFormat('%Y-%m-%d %H:%M:%S'); 37 | } 38 | update["expiry"] = expiryTimeString; 39 | update["is_http_only"] = loggingDB.boolToInt(cookie.isHttpOnly); 40 | update["is_session"] = loggingDB.boolToInt(cookie.isSession); 41 | 42 | // Accessed time (in microseconds) 43 | var lastAccessedTime = new Date(cookie.lastAccessed / 1000); // requires milliseconds 44 | update["last_accessed"] = lastAccessedTime.toLocaleFormat('%Y-%m-%d %H:%M:%S'); 45 | update["raw_host"] = loggingDB.escapeString(cookie.rawHost); 46 | 47 | cookie = cookie.QueryInterface(Ci.nsICookie); 48 | update["expires"] = cookie.expires; 49 | update["host"] = loggingDB.escapeString(cookie.host); 50 | update["is_domain"] = loggingDB.boolToInt(cookie.isDomain); 51 | update["is_secure"] = loggingDB.boolToInt(cookie.isSecure); 52 | update["name"] = loggingDB.escapeString(cookie.name); 53 | update["path"] = loggingDB.escapeString(cookie.path); 54 | update["policy"] = cookie.policy; 55 | update["status"] = cookie.status; 56 | update["value"] = loggingDB.escapeString(cookie.value); 57 | 58 | loggingDB.executeSQL(loggingDB.createInsert("javascript_cookies", update), true); 59 | } 60 | }, true); 61 | 62 | }; 63 | -------------------------------------------------------------------------------- /automation/Extension/firefox/lib/javascript-instrument.js: -------------------------------------------------------------------------------- 1 | var pageMod = require("sdk/page-mod"); 2 | const data = require("sdk/self").data; 3 | var loggingDB = require("./loggingdb.js"); 4 | var pageManager = require("./page-manager.js"); 5 | 6 | exports.run = function(crawlID, testing) { 7 | 8 | // Set up tables 9 | var createJavascriptTable = data.load("create_javascript_table.sql"); 10 | loggingDB.executeSQL(createJavascriptTable, false); 11 | 12 | // Inject content script to instrument JavaScript API 13 | pageMod.PageMod({ 14 | include: "*", 15 | contentScriptWhen: "start", 16 | contentScriptFile: data.url("./content.js"), 17 | contentScriptOptions: { 18 | 'testing': testing 19 | }, 20 | onAttach: function onAttach(worker) { 21 | var url = worker.url; 22 | function processCallsAndValues(data) { 23 | var update = {}; 24 | update["crawl_id"] = crawlID; 25 | update["script_url"] = loggingDB.escapeString(data.scriptUrl); 26 | update["script_line"] = loggingDB.escapeString(data.scriptLine); 27 | update["script_col"] = loggingDB.escapeString(data.scriptCol); 28 | update["func_name"] = loggingDB.escapeString(data.funcName); 29 | update["script_loc_eval"] = loggingDB.escapeString(data.scriptLocEval); 30 | update["call_stack"] = loggingDB.escapeString(data.callStack); 31 | update["symbol"] = loggingDB.escapeString(data.symbol); 32 | update["operation"] = loggingDB.escapeString(data.operation); 33 | update["value"] = loggingDB.escapeString(data.value); 34 | update["time_stamp"] = data.timeStamp; 35 | 36 | // Create a json object for function arguments 37 | // We create an object that maps array position to argument 38 | // e.g. someFunc('a',123,'b') --> {0: a, 1: 123, 2: 'b'} 39 | // to make it easier to query the data, using something like the 40 | // sqlite3 json1 extension.
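// A query along these lines should work (assuming the sqlite3 json1 extension is loaded):
//   SELECT symbol, json_extract(arguments, '$."0"') AS first_arg FROM javascript WHERE operation = 'call';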
41 | var args = {}; 42 | if (data.operation == 'call' && data.args.length > 0) { 43 | for(var i = 0; i < data.args.length; i++) { 44 | args[i] = data.args[i] 45 | } 46 | update["arguments"] = loggingDB.escapeString(JSON.stringify(args)); 47 | } 48 | 49 | loggingDB.executeSQL(loggingDB.createInsert("javascript", update), true); 50 | } 51 | worker.port.on("logCall", function(data){processCallsAndValues(data)}); 52 | worker.port.on("logValue", function(data){processCallsAndValues(data)}); 53 | } 54 | }); 55 | }; 56 | -------------------------------------------------------------------------------- /automation/Extension/firefox/lib/page-manager.js: -------------------------------------------------------------------------------- 1 | const {Cc, Ci} = require("chrome"); 2 | const data = require("sdk/self").data; 3 | var loggingDB = require("./loggingdb.js"); 4 | var events = require("sdk/system/events"); 5 | 6 | var crawlID = null; 7 | 8 | exports.setup = function(crawl_ID) { 9 | crawlID = crawl_ID; 10 | 11 | // Set up logging 12 | var createPagesTable = data.load("create_pages_table.sql"); 13 | loggingDB.executeSQL(createPagesTable, false); 14 | 15 | // Log new windows 16 | events.on("content-document-global-created", function(event) { 17 | var window = event.subject; 18 | var pageID = pageIDFromWindow(window); 19 | var parentID = window.parent ? pageIDFromWindow(window.parent) : -1; 20 | var location = window.document && window.document.location ? window.document.location : ""; 21 | insertPage(pageID, location, parentID); 22 | }, true); 23 | 24 | }; 25 | 26 | var insertPage = function(pageID, location, parentID) { 27 | var update = { }; 28 | update["crawl_id"] = crawlID; 29 | update["id"] = pageID; 30 | update["location"] = loggingDB.escapeString(location); 31 | update["parent_id"] = parentID; 32 | loggingDB.executeSQL(loggingDB.createInsert("pages", update), true); 33 | }; 34 | exports.insertPage = insertPage; 35 | 36 | var pageIDFromWindow = function (window) { 37 | try { 38 | return window.QueryInterface(Ci.nsIInterfaceRequestor).getInterface(Ci.nsIDOMWindowUtils).currentInnerWindowID; 39 | } 40 | catch(error) { 41 | } 42 | return -1; 43 | }; 44 | exports.pageIDFromWindow = pageIDFromWindow; 45 | 46 | exports.pageIDFromHttpChannel = function(httpChannel) { 47 | try { 48 | var notificationCallbacks = null; 49 | if(httpChannel.notificationCallbacks) 50 | notificationCallbacks = httpChannel.notificationCallbacks; 51 | else if(httpChannel.loadGroup) 52 | notificationCallbacks = httpChannel.loadGroup.notificationCallbacks; 53 | if(notificationCallbacks) { 54 | var loadContext = notificationCallbacks.getInterface(Ci.nsILoadContext) 55 | var window = loadContext.associatedWindow; 56 | return pageIDFromWindow(window); 57 | } 58 | } 59 | catch(error) { 60 | //console.log("Error getting page ID: " + httpChannel.URI.spec); 61 | } 62 | return -1; 63 | }; 64 | -------------------------------------------------------------------------------- /automation/Extension/firefox/lib/socket.js: -------------------------------------------------------------------------------- 1 | const {Cc, Ci} = require("chrome"); 2 | 3 | var bufferpack = require("bufferpack/bufferpack"); 4 | 5 | var tm = Cc["@mozilla.org/thread-manager;1"].getService(); 6 | var socketService = Cc["@mozilla.org/network/socket-transport-service;1"] 7 | .getService(Ci.nsISocketTransportService); 8 | 9 | class ListeningSocket { 10 | // Socket which feeds incoming messages to a queue 11 | constructor() { 12 | 13 | console.log("Initializing a listening
sever socket..."); 14 | this._serverSocket = Cc["@mozilla.org/network/server-socket;1"] 15 | .createInstance(Ci.nsIServerSocket); 16 | this._inputStream = null; 17 | this._serverSocket.init(-1, true, -1); // init with random port 18 | 19 | this.port = this._serverSocket.port; 20 | this.queue = []; // stores messages sent to socket 21 | console.log("...serverSocket listening on port:",this.port); 22 | 23 | } 24 | 25 | startListening() { 26 | var thisSocket = this; // self reference for closure 27 | this._serverSocket.asyncListen({ 28 | onSocketAccepted: function(sock, transport) { 29 | thisSocket._inputStream = transport.openInputStream(0, 0, 0); 30 | thisSocket._inputStream.asyncWait({ 31 | onInputStreamReady: function() { 32 | thisSocket._updateQueue(); 33 | } 34 | }, 0, 0, tm.mainThread); 35 | } 36 | }); 37 | } 38 | 39 | _updateQueue() { 40 | var bInputStream = Cc["@mozilla.org/binaryinputstream;1"] 41 | .createInstance(Ci.nsIBinaryInputStream); 42 | bInputStream.setInputStream(this._inputStream); 43 | 44 | var buff = bInputStream.readByteArray(5); 45 | var meta = bufferpack.unpack('>Lc', buff); 46 | var string = bInputStream.readBytes(meta[0]); 47 | if (meta[1] != 'n' && meta[1] == 'j') { 48 | string = JSON.parse(string); 49 | } else if (meta[1] != 'n') { 50 | console.error("Unsupported serialization type (",meta[1],")."); 51 | return; 52 | } 53 | this.queue.push(string); 54 | 55 | var thisSocket = this; // self reference for closure 56 | this._inputStream.asyncWait({ 57 | onInputStreamReady: function(){ 58 | thisSocket._updateQueue(); 59 | } 60 | }, 0, 0, tm.mainThread); 61 | } 62 | } 63 | exports.ListeningSocket = ListeningSocket; 64 | 65 | class SendingSocket { 66 | // Socket which encodes messages and sets to specified (host, port) 67 | constructor() { 68 | this._stream = null; 69 | this._bOutputStream = Cc["@mozilla.org/binaryoutputstream;1"] 70 | .createInstance(Ci.nsIBinaryOutputStream); 71 | } 72 | 73 | connect(host, port) { 74 | // Open socket connection to remote host 75 | try { 76 | var transport = socketService.createTransport(null, 0, host, port, null); 77 | this._stream = transport.openOutputStream(1, 4096, 1048575); 78 | this._bOutputStream.setOutputStream(this._stream) 79 | return true; 80 | } catch (err) { 81 | console.error(err,err.message); 82 | return false; 83 | } 84 | } 85 | 86 | send(query) { 87 | // Format: [sql_query, [arg1, arg2, arg3]] 88 | // e.g. 
["INSERT INTO table (item1, item2) VALUES (?,?)", [val1, val2]] 89 | try { 90 | var msg = JSON.stringify(query); 91 | var buff = bufferpack.pack('>Lc',[msg.length,'j']); 92 | this._bOutputStream.writeByteArray(buff, buff.length); 93 | this._stream.write(msg, msg.length); 94 | return true; 95 | } catch (err) { 96 | console.error(err,err.message); 97 | return false; 98 | } 99 | } 100 | 101 | close() { 102 | this._stream.close(); 103 | } 104 | } 105 | exports.SendingSocket = SendingSocket; 106 | -------------------------------------------------------------------------------- /automation/Extension/firefox/node_modules/bufferpack/.npmignore: -------------------------------------------------------------------------------- 1 | 2 | .gitignore 3 | *.md 4 | test/ -------------------------------------------------------------------------------- /automation/Extension/firefox/node_modules/bufferpack/CHANGELOG: -------------------------------------------------------------------------------- 1 | 2 | 3 | - 0.0.6 (1/25/12) 4 | 5 | Fixed issue with unpacking empty null term string 6 | 7 | -------------------------------------------------------------------------------- /automation/Extension/firefox/node_modules/bufferpack/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008, Fair Oaks Labs, Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are 5 | permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this list 8 | of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or other 12 | materials provided with the distribution. 13 | 14 | * Neither the name of Fair Oaks Labs, Inc. nor the names of its contributors may be 15 | used to endorse or promote products derived from this software without specific 16 | prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 19 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 20 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 21 | THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 25 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF 26 | THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | -------------------------------------------------------------------------------- /automation/Extension/firefox/node_modules/bufferpack/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bufferpack", 3 | "description": "Module to pack/unpack primitives and c strings into/out of a Node.js buffer", 4 | "version": "0.0.6", 5 | "keywords": [ 6 | "jspack", 7 | "buffer", 8 | "octet", 9 | "primitive", 10 | "string" 11 | ], 12 | "homepage": "https://github.com/ryanrolds/bufferpack", 13 | "repository": { 14 | "type": "git", 15 | "url": "git://github.com/ryanrolds/bufferpack.git" 16 | }, 17 | "main": "./bufferpack.js", 18 | "author": { 19 | "name": "Ryan Olds", 20 | "email": "ryanrolds@gmail.com" 21 | }, 22 | "maintainers": [ 23 | { 24 | "name": "Peter Griess", 25 | "url": "https://github.com/pgriess" 26 | }, 27 | { 28 | "name": "Peter Magnusson", 29 | "email": "peter@birchroad.net", 30 | "url": "http://github.com/birchroad/node-jspack" 31 | }, 32 | { 33 | "name": "Ryan Olds", 34 | "email": "ryanrolds@gmail.com", 35 | "url": "https://github.com/ryanrolds" 36 | } 37 | ], 38 | "devDependencies": { 39 | "mocha": "= 0.10.2", 40 | "should": "= 0.5.1" 41 | }, 42 | "scripts": { 43 | "test": "./node_modules/.bin/mocha test/*.test.js --reporter spec" 44 | }, 45 | "bugs": { 46 | "url": "https://github.com/ryanrolds/bufferpack/issues" 47 | }, 48 | "readme": "ERROR: No README data found!", 49 | "_id": "bufferpack@0.0.6", 50 | "dist": { 51 | "shasum": "a7bf3619848f1f74e33bd9cb4b5909dc93ba0b30" 52 | }, 53 | "_from": "bufferpack@", 54 | "_resolved": "https://registry.npmjs.org/bufferpack/-/bufferpack-0.0.6.tgz" 55 | } 56 | -------------------------------------------------------------------------------- /automation/Extension/firefox/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "openwpm", 3 | "title": "openwpm", 4 | "description": "Extension with socket interface into OpenWPM", 5 | "author": "Steven Englehardt", 6 | "license": "GPL v3", 7 | "version": "0.0.1", 8 | "dependencies": { 9 | "bufferpack": "0.0.6" 10 | }, 11 | "permissions": { 12 | "unsafe-content-script": true 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /automation/Extension/firefox/test/test-main.js: -------------------------------------------------------------------------------- 1 | var main = require("./main"); 2 | 3 | exports["test main"] = function(assert) { 4 | assert.pass("Unit test running!"); 5 | }; 6 | 7 | exports["test main async"] = function(assert, done) { 8 | assert.pass("async Unit test running!"); 9 | done(); 10 | }; 11 | 12 | require("sdk/test").run(exports); 13 | -------------------------------------------------------------------------------- /automation/MPLogger.py: -------------------------------------------------------------------------------- 1 | """ Support for logging with the multiprocessing module """ 2 | from SocketInterface import serversocket 3 | 4 | from Queue import Empty as EmptyQueue 5 | import logging.handlers 6 | import logging 7 | import struct 8 | import json 9 | import time 10 | import sys 11 | import os 12 | 13 | class ClientSocketHandler(logging.handlers.SocketHandler): 14 | """ 15 | Make SocketHandler compatible with SocketInterface.py 16 | """ 17 | def makePickle(self, record): 18 | """ 19 | Serializes the record via json and prepends a length/serialization 20 | flag. Returns it ready for transmission across the socket. 
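The frame matches what SocketInterface's serversocket expects: a 4-byte big-endian length, then a 1-byte 'j' (json) flag, then the serialized payload.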
21 | """ 22 | ei = record.exc_info 23 | if ei: 24 | # just to get traceback text into record.exc_text ... 25 | dummy = self.format(record) # noqa 26 | record.exc_info = None # to avoid Unpickleable error 27 | d = dict(record.__dict__) 28 | d['msg'] = record.getMessage() 29 | d['args'] = None 30 | s = json.dumps(d) 31 | if ei: 32 | record.exc_info = ei # for next handler 33 | return struct.pack('>Ic', len(s), 'j') + s 34 | 35 | def loggingclient(logger_address, logger_port, level=logging.DEBUG): 36 | """ Establishes a logger that sends log records to loggingserver """ 37 | logger = logging.getLogger(__name__) 38 | logger.setLevel(level) 39 | 40 | # Logger object shared, so we only want to connect handlers once 41 | if not len(logger.handlers): 42 | 43 | # Set up the SocketHandler - formatted server-side 44 | socketHandler = ClientSocketHandler(logger_address, logger_port) 45 | socketHandler.setLevel(level) 46 | logger.addHandler(socketHandler) 47 | 48 | # Set up logging to console 49 | consoleHandler = logging.StreamHandler(sys.stdout) 50 | consoleHandler.setLevel(logging.INFO) 51 | formatter = logging.Formatter('%(module)-20s - %(levelname)-8s - %(message)s') 52 | consoleHandler.setFormatter(formatter) 53 | logger.addHandler(consoleHandler) 54 | 55 | return logger 56 | 57 | def loggingserver(log_file, status_queue): 58 | """ 59 | A logging server to serialize writes to the log file from multiple 60 | processes. 61 | 62 | location of the log file on disk 63 | is a queue connect to the TaskManager used for communication 64 | """ 65 | # Configure the log file 66 | logging.basicConfig(filename=os.path.expanduser(log_file), 67 | format= '%(asctime)s - %(processName)-11s[%(threadName)-10s]' + 68 | ' - %(module)-20s - %(levelname)-8s: %(message)s', 69 | level=logging.INFO) 70 | 71 | # Sets up the serversocket to start accepting connections 72 | sock = serversocket() 73 | status_queue.put(sock.sock.getsockname()) # let TM know location 74 | sock.start_accepting() 75 | 76 | while True: 77 | # Check for KILL command from TaskManager 78 | if not status_queue.empty(): 79 | status_queue.get() 80 | sock.close() 81 | _drain_queue(sock.queue) 82 | break 83 | 84 | # Process logs 85 | try: 86 | obj = sock.queue.get(True, 10) 87 | _handleLogRecord(obj) 88 | except EmptyQueue: 89 | pass 90 | 91 | def _handleLogRecord(obj): 92 | """ Handle log, logs everything sent. 
Filtering should happen client-side. """ 93 | 94 | # Log message came from browser extension: requires special handling 95 | if len(obj) == 2 and obj[0] == 'EXT': 96 | obj = json.loads(obj[1]) 97 | record = logging.LogRecord(name=__name__, 98 | level=obj['level'], 99 | pathname=obj['pathname'], 100 | lineno=obj['lineno'], 101 | msg=obj['msg'], 102 | args=obj['args'], 103 | exc_info=obj['exc_info'], 104 | func=obj['func']) 105 | else: 106 | record = logging.makeLogRecord(obj) 107 | logger = logging.getLogger(record.name) 108 | logger.handle(record) 109 | 110 | def _drain_queue(sock_queue): 111 | """ Ensures queue is empty before closing """ 112 | time.sleep(3) # TODO: the socket needs a better way of closing 113 | while not sock_queue.empty(): 114 | obj = sock_queue.get() 115 | _handleLogRecord(obj) 116 | 117 | if __name__ == '__main__': 118 | # Some tests 119 | import logging, logging.handlers 120 | import multiprocess as mp 121 | 122 | # Set up loggingserver 123 | log_file = '~/mplogger.log' 124 | status_queue = mp.Queue() 125 | loggingserver = mp.Process(target=loggingserver, args=(log_file, status_queue)) 126 | loggingserver.daemon = True 127 | loggingserver.start() 128 | server_address = status_queue.get() 129 | 130 | # Connect main process to logging server 131 | rootLogger = logging.getLogger('') 132 | rootLogger.setLevel(logging.DEBUG) 133 | socketHandler = ClientSocketHandler(*server_address) 134 | rootLogger.addHandler(socketHandler) 135 | 136 | # Send some sample logs 137 | logging.info('Test1') 138 | logging.error('Test2') 139 | logging.critical('Test3') 140 | logging.debug('Test4') 141 | logging.warning('Test5') 142 | 143 | logger1 = logging.getLogger('test1') 144 | logger2 = logging.getLogger('test2') 145 | logger1.info('asdfasdfsa') 146 | logger2.info('1234567890') 147 | 148 | # Close the logging server 149 | status_queue.put('DIE') 150 | loggingserver.join() 151 | print "Server closed, exiting..." 152 | -------------------------------------------------------------------------------- /automation/Proxy/MITMProxy.py: -------------------------------------------------------------------------------- 1 | from ..SocketInterface import clientsocket 2 | from ..MPLogger import loggingclient 3 | import mitm_commands 4 | 5 | from libmproxy import controller 6 | import Queue 7 | import sys 8 | import traceback 9 | 10 | 11 | class InterceptingMaster (controller.Master): 12 | """ 13 | Customized MITMProxy 14 | Extends the proxy controller to add some additional 15 | functionality for handling/logging requests and responses 16 | 17 | Inspired by the following example. Note the gist has a lot of bugs.
18 | https://gist.github.com/dannvix/5285924 19 | """ 20 | 21 | def __init__(self, server, visit_id_queue, browser_params, manager_params, status_queue): 22 | self.browser_params = browser_params 23 | self.manager_params = manager_params 24 | 25 | # Attributes used to flag the first-party domain 26 | self.visit_id_queue = visit_id_queue # first-party domain provided by BrowserManager 27 | self.prev_visit_id, self.curr_visit_id = None, None # previous and current top level domains 28 | self.prev_requests, self.curr_requests = set(), set() # set of requests for previous and current site 29 | 30 | # Open a socket to communicate with DataAggregator 31 | self.db_socket = clientsocket(serialization='dill') 32 | self.db_socket.connect(*manager_params['aggregator_address']) 33 | 34 | # Open a socket to communicate with LevelDBAggregator 35 | self.ldb_socket = None 36 | if browser_params['save_javascript_proxy']: 37 | self.ldb_socket = clientsocket(serialization='dill') 38 | self.ldb_socket.connect(*manager_params['ldb_address']) 39 | 40 | # Open a socket to communicate with MPLogger 41 | self.logger = loggingclient(*manager_params['logger_address']) 42 | 43 | # Store status_queue for communication back to TaskManager 44 | self.status_queue = status_queue 45 | 46 | controller.Master.__init__(self, server) 47 | 48 | def load_process_message(self, q, timeout): 49 | """ Tries to read and process a message from the proxy queue, returns True iff this succeeds """ 50 | try: 51 | msg = q.get(timeout=timeout) 52 | controller.Master.handle(self, *msg) 53 | return True 54 | except Queue.Empty: 55 | return False 56 | 57 | def tick(self, q, timeout=0.01): 58 | """ new tick function used to label first-party domains and avoid race conditions when doing so """ 59 | if self.curr_visit_id is None: # proxy is fresh, need to get first-party domain right away 60 | self.curr_visit_id = self.visit_id_queue.get() 61 | elif not self.visit_id_queue.empty(): # new FP has been visited 62 | # drains the queue to get rid of stale messages from previous site 63 | while self.load_process_message(q, timeout): 64 | pass 65 | 66 | self.prev_requests, self.curr_requests = self.curr_requests, set() 67 | self.prev_visit_id, self.curr_visit_id = self.curr_visit_id, self.visit_id_queue.get() 68 | 69 | self.load_process_message(q, timeout) 70 | 71 | def run(self): 72 | """ Light wrapper around run with error printing """ 73 | try: 74 | controller.Master.run(self) 75 | except KeyboardInterrupt: 76 | print 'KeyboardInterrupt received. Shutting down' 77 | self.shutdown() 78 | sys.exit(0) 79 | except Exception: 80 | excp = traceback.format_exception(*sys.exc_info()) 81 | self.logger.critical('BROWSER %i: Exception. 
Shutting down proxy!\n%s' % (self.browser_params['crawl_id'], excp)) 82 | self.status_queue.put(('FAILED', None)) 83 | self.shutdown() 84 | raise 85 | 86 | def handle_request(self, msg): 87 | """ Receives HTTP request, and sends it to logging function """ 88 | msg.reply() 89 | self.curr_requests.add(msg.request) 90 | mitm_commands.process_general_mitm_request(self.db_socket, 91 | self.browser_params, 92 | self.curr_visit_id, 93 | msg) 94 | 95 | # Record data from HTTP responses 96 | def handle_response(self, msg): 97 | """ Receives HTTP response, and sends it to logging function """ 98 | msg.reply() 99 | 100 | # attempts to get the top url visit id, based on the request object 101 | if msg.request in self.prev_requests: 102 | visit_id = self.prev_visit_id 103 | self.prev_requests.remove(msg.request) 104 | elif msg.request in self.curr_requests: 105 | visit_id = self.curr_visit_id 106 | self.curr_requests.remove(msg.request) 107 | else: # ignore responses for which we cannot match the request 108 | return 109 | mitm_commands.process_general_mitm_response(self.db_socket, 110 | self.ldb_socket, 111 | self.logger, 112 | self.browser_params, 113 | visit_id, msg) 114 | -------------------------------------------------------------------------------- /automation/Proxy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Proxy/__init__.py -------------------------------------------------------------------------------- /automation/Proxy/cert/mitmproxy-ca-cert.cer: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDoTCCAomgAwIBAgIGDUd8Ol4xMA0GCSqGSIb3DQEBCwUAMCgxEjAQBgNVBAMM 3 | CW1pdG1wcm94eTESMBAGA1UECgwJbWl0bXByb3h5MB4XDTE2MDQwNTIyMjMyM1oX 4 | DTIxMDQwNjIyMjMyM1owKDESMBAGA1UEAwwJbWl0bXByb3h5MRIwEAYDVQQKDAlt 5 | aXRtcHJveHkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDpWVI/DZBn 6 | Zt4BHGBpsGAkLwVfJPf4p1mSVvSwtZp25ghCQi+GMqaAcjk/rYG9k/BqhKFN43NC 7 | ggr9/hhm73oy2JxVOMbMtEKOc+z0plajxtROQnItoSTIYilbx9VPXYmYXTb5o0v5 8 | npc0huIuKGRq5KVI1hFnj6GdXO/EWJioUpUC1c3zyz2vsbK3nE0Ze0lmWQ9Aur+6 9 | cvQrkvidLeF9irfV/PatIUYQ6KmVDPGS865xH8nh3aDbK7vGS4UnM8YwCXpPz4R+ 10 | 0QCNCrzCdvT+TkuvFJNFS8o5+zT1mEi+2LBUc+AApXaoCFNc4tZ20kgvsXeIJiRo 11 | hOjYR90tzc89AgMBAAGjgdAwgc0wDwYDVR0TAQH/BAUwAwEB/zARBglghkgBhvhC 12 | AQEEBAMCAgQweAYDVR0lBHEwbwYIKwYBBQUHAwEGCCsGAQUFBwMCBggrBgEFBQcD 13 | BAYIKwYBBQUHAwgGCisGAQQBgjcCARUGCisGAQQBgjcCARYGCisGAQQBgjcKAwEG 14 | CisGAQQBgjcKAwMGCisGAQQBgjcKAwQGCWCGSAGG+EIEATAOBgNVHQ8BAf8EBAMC 15 | AQYwHQYDVR0OBBYEFKpUAZXAaEWlCENC0uzof2rZsfQfMA0GCSqGSIb3DQEBCwUA 16 | A4IBAQBSceM4F6o0mDlxdxyq0Kn8QAQSaSPR0Mc0cgbIlisZ/TArBdM4hP/io0pG 17 | 9O2/xSVfggVELsWFsA447V/0dRN/544wXjLv0D6O/hLvDrLdxeV/EGzwh98TSt9p 18 | jT/lw7TD+9r/RQg95RKorsX+IdnEd201/DNc/lc3SMV6RQaZMXFqwvc8RKgie7r9 19 | L0lLDfpPVQufOXGpUakgiQyju/qnnMQeZgw8qCubmdcwFVSQ9HkeSiRyvzQwYNT1 20 | FvxFP9p0pG9pdZLvzV1EzLtVFqH0X6la5dNYQUX9YSm1HyfSxgwPOprAstnB8xaI 21 | e1WOBDqrvIfVypJFB0IFMlmfs2Pk 22 | -----END CERTIFICATE----- 23 | -------------------------------------------------------------------------------- /automation/Proxy/cert/mitmproxy-ca-cert.p12: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Proxy/cert/mitmproxy-ca-cert.p12 
-------------------------------------------------------------------------------- /automation/Proxy/cert/mitmproxy-ca-cert.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDoTCCAomgAwIBAgIGDUd8Ol4xMA0GCSqGSIb3DQEBCwUAMCgxEjAQBgNVBAMM 3 | CW1pdG1wcm94eTESMBAGA1UECgwJbWl0bXByb3h5MB4XDTE2MDQwNTIyMjMyM1oX 4 | DTIxMDQwNjIyMjMyM1owKDESMBAGA1UEAwwJbWl0bXByb3h5MRIwEAYDVQQKDAlt 5 | aXRtcHJveHkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDpWVI/DZBn 6 | Zt4BHGBpsGAkLwVfJPf4p1mSVvSwtZp25ghCQi+GMqaAcjk/rYG9k/BqhKFN43NC 7 | ggr9/hhm73oy2JxVOMbMtEKOc+z0plajxtROQnItoSTIYilbx9VPXYmYXTb5o0v5 8 | npc0huIuKGRq5KVI1hFnj6GdXO/EWJioUpUC1c3zyz2vsbK3nE0Ze0lmWQ9Aur+6 9 | cvQrkvidLeF9irfV/PatIUYQ6KmVDPGS865xH8nh3aDbK7vGS4UnM8YwCXpPz4R+ 10 | 0QCNCrzCdvT+TkuvFJNFS8o5+zT1mEi+2LBUc+AApXaoCFNc4tZ20kgvsXeIJiRo 11 | hOjYR90tzc89AgMBAAGjgdAwgc0wDwYDVR0TAQH/BAUwAwEB/zARBglghkgBhvhC 12 | AQEEBAMCAgQweAYDVR0lBHEwbwYIKwYBBQUHAwEGCCsGAQUFBwMCBggrBgEFBQcD 13 | BAYIKwYBBQUHAwgGCisGAQQBgjcCARUGCisGAQQBgjcCARYGCisGAQQBgjcKAwEG 14 | CisGAQQBgjcKAwMGCisGAQQBgjcKAwQGCWCGSAGG+EIEATAOBgNVHQ8BAf8EBAMC 15 | AQYwHQYDVR0OBBYEFKpUAZXAaEWlCENC0uzof2rZsfQfMA0GCSqGSIb3DQEBCwUA 16 | A4IBAQBSceM4F6o0mDlxdxyq0Kn8QAQSaSPR0Mc0cgbIlisZ/TArBdM4hP/io0pG 17 | 9O2/xSVfggVELsWFsA447V/0dRN/544wXjLv0D6O/hLvDrLdxeV/EGzwh98TSt9p 18 | jT/lw7TD+9r/RQg95RKorsX+IdnEd201/DNc/lc3SMV6RQaZMXFqwvc8RKgie7r9 19 | L0lLDfpPVQufOXGpUakgiQyju/qnnMQeZgw8qCubmdcwFVSQ9HkeSiRyvzQwYNT1 20 | FvxFP9p0pG9pdZLvzV1EzLtVFqH0X6la5dNYQUX9YSm1HyfSxgwPOprAstnB8xaI 21 | e1WOBDqrvIfVypJFB0IFMlmfs2Pk 22 | -----END CERTIFICATE----- 23 | -------------------------------------------------------------------------------- /automation/Proxy/cert/mitmproxy-ca.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDpWVI/DZBnZt4B 3 | HGBpsGAkLwVfJPf4p1mSVvSwtZp25ghCQi+GMqaAcjk/rYG9k/BqhKFN43NCggr9 4 | /hhm73oy2JxVOMbMtEKOc+z0plajxtROQnItoSTIYilbx9VPXYmYXTb5o0v5npc0 5 | huIuKGRq5KVI1hFnj6GdXO/EWJioUpUC1c3zyz2vsbK3nE0Ze0lmWQ9Aur+6cvQr 6 | kvidLeF9irfV/PatIUYQ6KmVDPGS865xH8nh3aDbK7vGS4UnM8YwCXpPz4R+0QCN 7 | CrzCdvT+TkuvFJNFS8o5+zT1mEi+2LBUc+AApXaoCFNc4tZ20kgvsXeIJiRohOjY 8 | R90tzc89AgMBAAECggEBANHxFek6t+gACc7wC+Iq2GTVlTyj0yH41sKtz1pJGV7N 9 | 2TJbv8w6N8kqe01KOoco2c3L5CBieemlfGRGPwglf0jNz+FwwpObvqHRtKfsG9kf 10 | 4xVB/N4GtERA5Lr5kHXmTwwxGWwCOjGsOJf+nvIuTyJI8lgUmPYWqK4e1m1N1JLi 11 | eaxr3zRPK0s+wGXCUOAsd4cjNARpxEBJEkexTr7lWpGG5byIDrfVrQxDctP4h8Ji 12 | CbLCT8kpqjtBecJSj9uBDoDqthvJ+Py/PmKWK80Zeco+d+0qH40wTBUAvhLMP3VK 13 | 8assIuaOVZX0Pww4f2Frz9OrInwLdbZNOZJSuuml5RUCgYEA/Y0NgMPghRB4E4tg 14 | ZlrIQlRus3dHqWOdgbXFGVLQdvtsr/bIbG7lu+RcVPWTD1a9dTv/FW3GzSovTG1b 15 | FF2H/ic4bAnKrtPSR9xphDyM5O8jk/oVkMHUEGEZy7OdiOP4C9sIZz2WK0YUUwIt 16 | W2RCNqkZEeZtJviKdGWvHmEBK+cCgYEA65pQxYnZbymwySjB7VMnXofxCi1y283Y 17 | 4zuP3uqNPOhpA+Sdq0Mh9XvfkcCaDXdIFtgD9vvUpQOWmsG1ILRo+bvrdAYHzQ4j 18 | CO8ha8aYiN9tvn6kYMDVgXFacc787qSpL6AeR7ybYQU7fe1uHixi6wEKTin92/Ns 19 | e00CWzVTpzsCgYEAz/rp/puuCbhupqmHU65X4oDbpX7MW5gI1SNDH/icY1zt1JE7 20 | 6iY1cCBr1Iz0KnreQdIK9YrsrdJSpgB124i1SrblQ0ns5ed+789PBlecwxWeO33C 21 | PtGfoCfmPv+A048cIq1ygS01hx2fAlAg4HynC6s9kz9Ofc8V01Ctit/LVDUCgYAc 22 | 0h7JW2iW2aG/qdW3Q3HQdY698PtY+iBrA7FA0q5+YevexwumlKrFzeZ2fPobZZkS 23 | +k/Z2cqUeRDmU4Xlv0wMKLnP0qEHq5ALmr0a4wtryvEw2WsgTtaPZB9tRqXYR5pO 24 | siaiHedgAfTaHb5XwJRFLTZmg2qDio6dsrj0EVzvWwKBgGPKGjNN4orOvgTwRTNB 25 | +9drtwB3hxYmGXa7Tbzq62SNN6exJUvuW9sVeSssv058Nk1hTEQ7DJAXQOfkFWQf 26 | 
XX5VMyumF9xqIZ0c793m54VY7hd+SkwziwvVrtTuMiy9wjPHOxUd81gdW+OKARTO 27 | A2Z4tVG+hinAI7cZM4yR5Van 28 | -----END PRIVATE KEY----- 29 | -----BEGIN CERTIFICATE----- 30 | MIIDoTCCAomgAwIBAgIGDUd8Ol4xMA0GCSqGSIb3DQEBCwUAMCgxEjAQBgNVBAMM 31 | CW1pdG1wcm94eTESMBAGA1UECgwJbWl0bXByb3h5MB4XDTE2MDQwNTIyMjMyM1oX 32 | DTIxMDQwNjIyMjMyM1owKDESMBAGA1UEAwwJbWl0bXByb3h5MRIwEAYDVQQKDAlt 33 | aXRtcHJveHkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDpWVI/DZBn 34 | Zt4BHGBpsGAkLwVfJPf4p1mSVvSwtZp25ghCQi+GMqaAcjk/rYG9k/BqhKFN43NC 35 | ggr9/hhm73oy2JxVOMbMtEKOc+z0plajxtROQnItoSTIYilbx9VPXYmYXTb5o0v5 36 | npc0huIuKGRq5KVI1hFnj6GdXO/EWJioUpUC1c3zyz2vsbK3nE0Ze0lmWQ9Aur+6 37 | cvQrkvidLeF9irfV/PatIUYQ6KmVDPGS865xH8nh3aDbK7vGS4UnM8YwCXpPz4R+ 38 | 0QCNCrzCdvT+TkuvFJNFS8o5+zT1mEi+2LBUc+AApXaoCFNc4tZ20kgvsXeIJiRo 39 | hOjYR90tzc89AgMBAAGjgdAwgc0wDwYDVR0TAQH/BAUwAwEB/zARBglghkgBhvhC 40 | AQEEBAMCAgQweAYDVR0lBHEwbwYIKwYBBQUHAwEGCCsGAQUFBwMCBggrBgEFBQcD 41 | BAYIKwYBBQUHAwgGCisGAQQBgjcCARUGCisGAQQBgjcCARYGCisGAQQBgjcKAwEG 42 | CisGAQQBgjcKAwMGCisGAQQBgjcKAwQGCWCGSAGG+EIEATAOBgNVHQ8BAf8EBAMC 43 | AQYwHQYDVR0OBBYEFKpUAZXAaEWlCENC0uzof2rZsfQfMA0GCSqGSIb3DQEBCwUA 44 | A4IBAQBSceM4F6o0mDlxdxyq0Kn8QAQSaSPR0Mc0cgbIlisZ/TArBdM4hP/io0pG 45 | 9O2/xSVfggVELsWFsA447V/0dRN/544wXjLv0D6O/hLvDrLdxeV/EGzwh98TSt9p 46 | jT/lw7TD+9r/RQg95RKorsX+IdnEd201/DNc/lc3SMV6RQaZMXFqwvc8RKgie7r9 47 | L0lLDfpPVQufOXGpUakgiQyju/qnnMQeZgw8qCubmdcwFVSQ9HkeSiRyvzQwYNT1 48 | FvxFP9p0pG9pdZLvzV1EzLtVFqH0X6la5dNYQUX9YSm1HyfSxgwPOprAstnB8xaI 49 | e1WOBDqrvIfVypJFB0IFMlmfs2Pk 50 | -----END CERTIFICATE----- 51 | -------------------------------------------------------------------------------- /automation/Proxy/cert/mitmproxy-dhparam.pem: -------------------------------------------------------------------------------- 1 | 2 | -----BEGIN DH PARAMETERS----- 3 | MIICCAKCAgEAyT6LzpwVFS3gryIo29J5icvgxCnCebcdSe/NHMkD8dKJf8suFCg3 4 | O2+dguLakSVif/t6dhImxInJk230HmfC8q93hdcg/j8rLGJYDKu3ik6H//BAHKIv 5 | j5O9yjU3rXCfmVJQic2Nne39sg3CreAepEts2TvYHhVv3TEAzEqCtOuTjgDv0ntJ 6 | Gwpj+BJBRQGG9NvprX1YGJ7WOFBP/hWU7d6tgvE6Xa7T/u9QIKpYHMIkcN/l3ZFB 7 | chZEqVlyrcngtSXCROTPcDOQ6Q8QzhaBJS+Z6rcsd7X+haiQqvoFcmaJ08Ks6LQC 8 | ZIL2EtYJw8V8z7C0igVEBIADZBI6OTbuuhDwRw//zU1uq52Oc48CIZlGxTYG/Evq 9 | o9EWAXUYVzWkDSTeBH1r4z/qLPE2cnhtMxbFxuvK53jGB0emy2y1Ei6IhKshJ5qX 10 | IB/aE7SSHyQ3MDHHkCmQJCsOd4Mo26YX61NZ+n501XjqpCBQ2+DfZCBh8Va2wDyv 11 | A2Ryg9SUz8j0AXViRNMJgJrr446yro/FuJZwnQcO3WQnXeqSBnURqKjmqkeFP+d8 12 | 6mk2tqJaY507lRNqtGlLnj7f5RNoBFJDCLBNurVgfvq9TCVWKDIFD4vZRjCrnl6I 13 | rD693XKIHUCWOjMh1if6omGXKHH40QuME2gNa50+YPn1iYDl88uDbbMCAQI= 14 | -----END DH PARAMETERS----- 15 | -------------------------------------------------------------------------------- /automation/Proxy/cert8.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Proxy/cert8.db -------------------------------------------------------------------------------- /automation/Proxy/deploy_mitm_proxy.py: -------------------------------------------------------------------------------- 1 | from ..MPLogger import loggingclient 2 | import MITMProxy 3 | 4 | from libmproxy import proxy 5 | from libmproxy.proxy.server import ProxyServer 6 | import threading 7 | import socket 8 | import Queue 9 | import os 10 | 11 | 12 | def init_proxy(browser_params, manager_params, status_queue): 13 | """ 14 | Uses mitmproxy to log HTTP Requests and Responses. 15 | browser_params: configuration parameters of the host browser 16 | manager_params: configuration
parameters of the TaskManager 17 | status_queue: a Queue to report proxy status back to the TaskManager 18 | """ 19 | logger = loggingclient(*manager_params['logger_address']) 20 | proxy_site_queue = Queue.Queue() # queue for crawler to communicate with proxy 21 | 22 | # gets local port from one of the free ports 23 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 24 | sock.bind(('', 0)) 25 | proxy_port = sock.getsockname()[1] 26 | sock.close() 27 | 28 | config = proxy.ProxyConfig(cadir=os.path.join(os.path.dirname(__file__), 'cert'),port=proxy_port) 29 | server = ProxyServer(config) 30 | logger.info('BROWSER %i: Intercepting Proxy listening on %i' % (browser_params['crawl_id'], proxy_port)) 31 | m = MITMProxy.InterceptingMaster(server, proxy_site_queue, browser_params, manager_params, status_queue) 32 | thread = threading.Thread(target=m.run, args=()) 33 | thread.daemon = True 34 | thread.start() 35 | return proxy_port, proxy_site_queue 36 | -------------------------------------------------------------------------------- /automation/Proxy/key3.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Proxy/key3.db -------------------------------------------------------------------------------- /automation/Proxy/mitm_commands.py: -------------------------------------------------------------------------------- 1 | # This module parses MITM Proxy requests/responses into (command, data) pairs 2 | # This should mean that the MITMProxy code should simply pass the messages + its own data to this module 3 | 4 | from urlparse import urlparse 5 | import datetime 6 | import mmh3 7 | import json 8 | import zlib 9 | 10 | def encode_to_unicode(msg): 11 | """ 12 | Tries different encodings before settling on utf8, ignoring any errors 13 | We can likely inspect the headers for an encoding as well, though it 14 | won't always be correct.
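Decode order: utf8 strict, then ISO-8859-1, then utf8 with undecodable bytes ignored.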
15 | """ 16 | try: 17 | msg = unicode(msg, 'utf8') 18 | except UnicodeDecodeError: 19 | try: 20 | msg = unicode(msg, 'ISO-8859-1') 21 | except UnicodeDecodeError: 22 | msg = unicode(msg, 'utf8', 'ignore') 23 | return msg 24 | 25 | 26 | def process_general_mitm_request(db_socket, browser_params, visit_id, msg): 27 | """ Logs a HTTP request object """ 28 | referrer = msg.request.headers['referer'][0] if len(msg.request.headers['referer']) > 0 else '' 29 | 30 | data = (browser_params['crawl_id'], 31 | encode_to_unicode(msg.request.url), 32 | msg.request.method, 33 | encode_to_unicode(referrer), 34 | json.dumps(msg.request.headers.get_state()), 35 | visit_id, 36 | str(datetime.datetime.now())) 37 | 38 | db_socket.send(("INSERT INTO http_requests_proxy (crawl_id, url, method, " 39 | "referrer, headers, visit_id, time_stamp) VALUES (?,?,?,?,?,?,?)", data)) 40 | 41 | 42 | def process_general_mitm_response(db_socket, ldb_socket, logger, browser_params, visit_id, msg): 43 | """ Logs a HTTP response object and, if necessary, """ 44 | referrer = msg.request.headers['referer'][0] if len(msg.request.headers['referer']) > 0 else '' 45 | location = msg.response.headers['location'][0] if len(msg.response.headers['location']) > 0 else '' 46 | 47 | content_hash = save_javascript_content(ldb_socket, logger, browser_params, msg) 48 | 49 | data = (browser_params['crawl_id'], 50 | encode_to_unicode(msg.request.url), 51 | encode_to_unicode(msg.request.method), 52 | encode_to_unicode(referrer), 53 | msg.response.code, 54 | msg.response.msg, 55 | json.dumps(msg.response.headers.get_state()), 56 | encode_to_unicode(location), 57 | visit_id, 58 | str(datetime.datetime.now()), 59 | content_hash) 60 | 61 | db_socket.send(("INSERT INTO http_responses_proxy (crawl_id, url, method, " 62 | "referrer, response_status, response_status_text, headers, " 63 | "location, visit_id, time_stamp, content_hash) " 64 | "VALUES (?,?,?,?,?,?,?,?,?,?,?)", data)) 65 | 66 | 67 | def save_javascript_content(ldb_socket, logger, browser_params, msg): 68 | """ Save javascript files de-duplicated and compressed on disk """ 69 | if not browser_params['save_javascript_proxy']: 70 | return 71 | 72 | # Check if this response is javascript content 73 | is_js = False 74 | if (len(msg.response.headers['Content-Type']) > 0 and 75 | 'javascript' in msg.response.headers['Content-Type'][0]): 76 | is_js = True 77 | if not is_js and urlparse(msg.request.url).path.split('.')[-1] == 'js': 78 | is_js = True 79 | if not is_js: 80 | return 81 | 82 | # Decompress any content with compression 83 | # We want files to hash to the same value 84 | # Firefox currently only accepts gzip/deflate 85 | script = '' 86 | content_encoding = msg.response.headers['Content-Encoding'] 87 | if (len(content_encoding) == 0 or 88 | content_encoding[0].lower() == 'utf-8' or 89 | content_encoding[0].lower() == 'identity' or 90 | content_encoding[0].lower() == 'none' or 91 | content_encoding[0].lower() == 'ansi_x3.4-1968' or 92 | content_encoding[0].lower() == 'utf8' or 93 | content_encoding[0] == ''): 94 | script = msg.response.content 95 | elif 'gzip' in content_encoding[0].lower(): 96 | try: 97 | script = zlib.decompress(msg.response.content, zlib.MAX_WBITS|16) 98 | except zlib.error as e: 99 | logger.error('BROWSER %i: Received zlib error when trying to decompress gzipped javascript: %s' % (browser_params['crawl_id'],str(e))) 100 | return 101 | elif 'deflate' in content_encoding[0].lower(): 102 | try: 103 | script = zlib.decompress(msg.response.content, -zlib.MAX_WBITS) 104 | 
except zlib.error as e: 105 | logger.error('BROWSER %i: Received zlib error when trying to decompress deflated javascript: %s' % (browser_params['crawl_id'],str(e))) 106 | return 107 | else: 108 | logger.error('BROWSER %i: Received Content-Encoding %s. Not supported by Firefox, skipping archive.' % (browser_params['crawl_id'], str(content_encoding))) 109 | return 110 | script = encode_to_unicode(script) 111 | 112 | # Hash script for deduplication on disk 113 | hasher = mmh3.hash128 114 | script_hash = str(hasher(script.encode('utf-8')) >> 64) 115 | 116 | ldb_socket.send((script, script_hash)) 117 | 118 | return script_hash 119 | -------------------------------------------------------------------------------- /automation/SocketInterface.py: -------------------------------------------------------------------------------- 1 | import Queue 2 | import threading 3 | import traceback 4 | import socket 5 | import struct 6 | import json 7 | import dill 8 | 9 | #TODO - Implement a cleaner shutdown for server socket 10 | # see: https://stackoverflow.com/questions/1148062/python-socket-accept-blocks-prevents-app-from-quitting 11 | 12 | class serversocket: 13 | """ 14 | A server socket to receive and process string messages 15 | from client sockets to a central queue 16 | """ 17 | def __init__(self, verbose=False): 18 | self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 19 | self.sock.bind(('localhost', 0)) 20 | self.sock.listen(10) # queue a max of n connect requests 21 | self.verbose = verbose 22 | self.queue = Queue.Queue() 23 | if self.verbose: 24 | print "Server bound to: " + str(self.sock.getsockname()) 25 | 26 | def start_accepting(self): 27 | """ Start the listener thread """ 28 | thread = threading.Thread(target=self._accept, args=()) 29 | thread.daemon = True # stops from blocking shutdown 30 | thread.start() 31 | 32 | def _accept(self): 33 | """ Listen for connections and pass handling to a new thread """ 34 | while True: 35 | (client, address) = self.sock.accept() 36 | thread = threading.Thread(target=self._handle_conn, args=(client, address)) 37 | thread.daemon = True 38 | thread.start() 39 | 40 | def _handle_conn(self, client, address): 41 | """ 42 | Receive messages and pass to queue. Messages are prefixed with 43 | a 4-byte integer to specify the message length and a 1-byte character 44 | to indicate the type of serialization applied to the message.
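For example, a 10-byte json payload arrives with the header struct.pack('>Lc', 10, 'j'); the header is decoded below via struct.unpack('>Lc', msg).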
51 | if self.verbose:
52 | print "Thread: " + str(threading.current_thread()) + " connected to: " + str(address)
53 | try:
54 | while True:
55 | msg = self.receive_msg(client, 5)
56 | msglen, serialization = struct.unpack('>Lc', msg)
57 | if self.verbose:
58 | print "Msglen: " + str(msglen) + " is_serialized: " + str(serialization != 'n')
59 | msg = self.receive_msg(client, msglen)
60 | if serialization != 'n':
61 | try:
62 | if serialization == 'd': # dill serialization
63 | msg = dill.loads(msg)
64 | elif serialization == 'j': # json serialization
65 | msg = json.loads(msg)
66 | else:
67 | print "Unrecognized serialization type: %s" % serialization
68 | continue
69 | except (UnicodeDecodeError, ValueError) as e:
70 | print "Error de-serializing message: %s \n %s" % (
71 | msg, traceback.format_exc(e))
72 | continue
73 | self.queue.put(msg)
74 | except RuntimeError:
75 | if self.verbose:
76 | print "Client socket: " + str(address) + " closed"
77 |
78 | def receive_msg(self, client, msglen):
79 | msg = ''
80 | while len(msg) < msglen:
81 | chunk = client.recv(msglen-len(msg))
82 | if chunk == '':
83 | raise RuntimeError("socket connection broken")
84 | msg = msg + chunk
85 | return msg
86 |
87 | def close(self):
88 | self.sock.close()
89 |
90 | class clientsocket:
91 | """A client socket for sending messages"""
92 | def __init__(self, serialization='json', verbose=False):
93 | """ `serialization` specifies the type of serialization to use for
94 | non-str messages. Supported formats:
95 | * 'json' uses the json module. Cross-language support. (default)
96 | * 'dill' uses the dill pickle module. Python only.
97 | """
98 | self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
99 | if serialization != 'json' and serialization != 'dill':
100 | raise ValueError("Unsupported serialization type: %s" % serialization)
101 | self.serialization = serialization
102 | self.verbose = verbose
103 |
104 | def connect(self, host, port):
105 | if self.verbose: print "Connecting to: %s:%i" % (host, port)
106 | self.sock.connect((host, port))
107 |
108 | def send(self, msg):
109 | """
110 | Sends an arbitrary python object to the connected socket. Serializes
111 | non-str messages using the configured format ('json' or 'dill') and
112 | prepends the msg len (4-bytes) and serialization type (1-byte).
113 | """
114 | # if input is not a string, serialize it to a string
115 | if type(msg) is not str:
116 | if self.serialization == 'dill':
117 | msg = dill.dumps(msg)
118 | serialization = 'd'
119 | elif self.serialization == 'json':
120 | msg = json.dumps(msg)
121 | serialization = 'j'
122 | else:
123 | raise ValueError("Unsupported serialization type set: %s" % self.serialization)
124 | else:
125 | serialization = 'n'
126 |
127 | if self.verbose: print "Sending message with serialization %s" % serialization
128 |
129 | # prepend with message length and serialization type
130 | msg = struct.pack('>Lc', len(msg), serialization) + msg
131 | totalsent = 0
132 | while totalsent < len(msg):
133 | sent = self.sock.send(msg[totalsent:])
134 | if sent == 0:
135 | raise RuntimeError("socket connection broken")
136 | totalsent = totalsent + sent
137 |
138 | def close(self):
139 | self.sock.close()
140 |
141 | if __name__ == '__main__':
142 | import sys
143 |
144 | # Just for testing
145 | if sys.argv[1] == 's':
146 | sock = serversocket(verbose=True)
147 | sock.start_accepting()
148 | raw_input("Press enter to exit...")
149 | sock.close()
150 | elif sys.argv[1] == 'c':
151 | host = raw_input("Enter the host name:\n")
152 | port = raw_input("Enter the port:\n")
153 | serialization = raw_input("Enter the serialization type (default: 'json'):\n")
154 | if serialization == '':
155 | serialization = 'json'
156 | sock = clientsocket(serialization=serialization)
157 | sock.connect(host, int(port))
158 | msg = None
159 |
160 | # some predefined messages
161 | tuple_msg = ('hello','world')
162 | list_msg = ['hello','world']
163 | dict_msg = {'hello':'world'}
164 | def function_msg(x): return x
165 |
166 | # read user input
167 | while msg != "quit":
168 | msg = raw_input("Enter a message to send:\n")
169 | if msg == 'tuple':
170 | sock.send(tuple_msg)
171 | elif msg == 'list':
172 | sock.send(list_msg)
173 | elif msg == 'dict':
174 | sock.send(dict_msg)
175 | elif msg == 'function':
176 | sock.send(function_msg)
177 | else:
178 | sock.send(msg)
179 | sock.close()
180 |
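A usage sketch pairing the two classes in a single process, assuming the repository root is on the import path; in a real crawl the client typically lives in another process:

from automation.SocketInterface import serversocket, clientsocket

server = serversocket()
server.start_accepting()
host, port = server.sock.getsockname()

client = clientsocket(serialization='json')
client.connect(host, port)
client.send({'visit_id': 1, 'site_url': 'http://example.com'})

# messages arrive de-serialized on the server's central queue
print server.queue.get(timeout=5)
client.close()
server.close()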
113 | """ 114 | #if input not string, serialize to string 115 | if type(msg) is not str: 116 | if self.serialization == 'dill': 117 | msg = dill.dumps(msg) 118 | serialization = 'd' 119 | elif self.serialization == 'json': 120 | msg = json.dumps(msg) 121 | serialization = 'j' 122 | else: 123 | raise ValueError("Unsupported serialization type set: %s" % serialization) 124 | else: 125 | serialization = 'n' 126 | 127 | if self.verbose: print "Sending message with serialization %s" % serialization 128 | 129 | #prepend with message length 130 | msg = struct.pack('>Lc', len(msg), serialization) + msg 131 | totalsent = 0 132 | while totalsent < len(msg): 133 | sent = self.sock.send(msg[totalsent:]) 134 | if sent == 0: 135 | raise RuntimeError("socket connection broken") 136 | totalsent = totalsent + sent 137 | 138 | def close(self): 139 | self.sock.close() 140 | 141 | if __name__ == '__main__': 142 | import sys 143 | 144 | #Just for testing 145 | if sys.argv[1] == 's': 146 | sock = serversocket(verbose=True) 147 | sock.start_accepting() 148 | raw_input("Press enter to exit...") 149 | sock.close() 150 | elif sys.argv[1] == 'c': 151 | host = raw_input("Enter the host name:\n") 152 | port = raw_input("Enter the port:\n") 153 | serialization = raw_input("Enter the serialization type (default: 'json'):\n") 154 | if serialization == '': 155 | serialization = 'json' 156 | sock = clientsocket(serialization=serialization) 157 | sock.connect(host, int(port)) 158 | msg = None 159 | 160 | # some predefined messages 161 | tuple_msg = ('hello','world') 162 | list_msg = ['hello','world'] 163 | dict_msg = {'hello':'world'} 164 | def function_msg(x): return x 165 | 166 | # read user input 167 | while msg != "quit": 168 | msg = raw_input("Enter a message to send:\n") 169 | if msg == 'tuple': 170 | sock.send(tuple_msg) 171 | elif msg == 'list': 172 | sock.send(list_msg) 173 | elif msg == 'dict': 174 | sock.send(dict_msg) 175 | elif msg == 'function': 176 | sock.send(function_msg) 177 | else: 178 | sock.send(msg) 179 | sock.close() 180 | -------------------------------------------------------------------------------- /automation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/__init__.py -------------------------------------------------------------------------------- /automation/default_browser_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "extension_enabled": true, 3 | "disable_webdriver_self_id": true, 4 | "cookie_instrument": false, 5 | "js_instrument": true, 6 | "cp_instrument": false, 7 | "http_instrument": false, 8 | "save_javascript": true, 9 | 10 | "random_attributes": false, 11 | "bot_mitigation": false, 12 | "disable_flash": true, 13 | "profile_tar": null, 14 | "profile_archive_dir": null, 15 | "headless": true, 16 | "browser": "firefox", 17 | "tp_cookies": "always", 18 | "donottrack": false, 19 | "ghostery": false, 20 | "https-everywhere": false, 21 | "adblock-plus": false, 22 | "tracking-protection": false, 23 | "proxy": false, 24 | "save_javascript_proxy": false, 25 | "mobile_platform": "android" 26 | } 27 | -------------------------------------------------------------------------------- /automation/default_manager_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_directory": "~/openwpm/", 3 | "log_directory": "~/openwpm/", 4 | 
"database_name": "crawl-data.sqlite", 5 | "log_file": "openwpm.log", 6 | "failure_limit": null, 7 | "testing": false 8 | } 9 | -------------------------------------------------------------------------------- /automation/schema.sql: -------------------------------------------------------------------------------- 1 | /* This file is sourced during the initialization 2 | * of the crawler. Make sure everything is CREATE 3 | * IF NOT EXISTS, otherwise there will be errors 4 | */ 5 | 6 | /* Crawler Tables */ 7 | 8 | CREATE TABLE IF NOT EXISTS task ( 9 | task_id INTEGER PRIMARY KEY AUTOINCREMENT, 10 | start_time DATETIME DEFAULT CURRENT_TIMESTAMP, 11 | manager_params TEXT NOT NULL, 12 | openwpm_version TEXT NOT NULL, 13 | browser_version TEXT NOT NULL); 14 | 15 | CREATE TABLE IF NOT EXISTS crawl ( 16 | crawl_id INTEGER PRIMARY KEY AUTOINCREMENT, 17 | task_id INTEGER NOT NULL, 18 | browser_params TEXT NOT NULL, 19 | screen_res TEXT, 20 | ua_string TEXT, 21 | finished BOOLEAN NOT NULL DEFAULT 0, 22 | start_time DATETIME DEFAULT CURRENT_TIMESTAMP, 23 | FOREIGN KEY(task_id) REFERENCES task(task_id)); 24 | 25 | CREATE TABLE IF NOT EXISTS xpath ( 26 | id INTEGER PRIMARY KEY AUTOINCREMENT, 27 | name VARCHAR(100) NOT NULL, 28 | url VARCHAR(500) NOT NULL, 29 | xpath VARCHAR(500) NOT NULL, 30 | absolute_xpath VARCHAR(500), 31 | ctime DATETIME DEFAULT CURRENT_TIMESTAMP, 32 | UNIQUE(name, url)); 33 | 34 | CREATE TABLE IF NOT EXISTS site_visits ( 35 | visit_id INTEGER PRIMARY KEY, 36 | crawl_id INTEGER NOT NULL, 37 | site_url VARCHAR(500) NOT NULL, 38 | FOREIGN KEY(crawl_id) REFERENCES crawl(id)); 39 | 40 | /* Proxy Tables */ 41 | 42 | /* TODO: add publix_suffix to db structure */ 43 | /* TODO: link with headers */ 44 | CREATE TABLE IF NOT EXISTS http_requests_proxy ( 45 | id INTEGER PRIMARY KEY AUTOINCREMENT, 46 | crawl_id INTEGER NOT NULL, 47 | url VARCHAR(500) NOT NULL, 48 | method VARCHAR(500) NOT NULL, 49 | referrer VARCHAR(500) NOT NULL, 50 | headers VARCHAR(500) NOT NULL, 51 | visit_id INTEGER NOT NULL, 52 | time_stamp VARCHAR(500) NOT NULL); 53 | 54 | /* TODO: add publix_suffix to db structure */ 55 | /* TODO: link with headers */ 56 | /* TODO: link with requests */ 57 | CREATE TABLE IF NOT EXISTS http_responses_proxy ( 58 | id INTEGER PRIMARY KEY AUTOINCREMENT, 59 | crawl_id INTEGER NOT NULL, 60 | url VARCHAR(500) NOT NULL, 61 | method VARCHAR(500) NOT NULL, 62 | referrer VARCHAR(500) NOT NULL, 63 | response_status INTEGER NOT NULL, 64 | response_status_text VARCHAR(500) NOT NULL, 65 | headers VARCHAR(500) NOT NULL, 66 | location VARCHAR(500) NOT NULL, 67 | visit_id INTEGER NOT NULL, 68 | time_stamp VARCHAR(500) NOT NULL, 69 | content_hash VARCHAR(50)); 70 | 71 | /* Firefox Storage Vector Dumps */ 72 | 73 | CREATE TABLE IF NOT EXISTS flash_cookies ( 74 | id INTEGER PRIMARY KEY AUTOINCREMENT, 75 | crawl_id INTEGER NOT NULL, 76 | visit_id INTEGER NOT NULL, 77 | domain VARCHAR(500), 78 | filename VARCHAR(500), 79 | local_path VARCHAR(1000), 80 | key TEXT, 81 | content TEXT, 82 | FOREIGN KEY(crawl_id) REFERENCES crawl(id), 83 | FOREIGN KEY(visit_id) REFERENCES site_visits(id)); 84 | 85 | CREATE TABLE IF NOT EXISTS profile_cookies ( 86 | id INTEGER PRIMARY KEY AUTOINCREMENT, 87 | crawl_id INTEGER NOT NULL, 88 | visit_id INTEGER NOT NULL, 89 | baseDomain TEXT, 90 | name TEXT, 91 | value TEXT, 92 | host TEXT, 93 | path TEXT, 94 | expiry INTEGER, 95 | accessed INTEGER, 96 | creationTime INTEGER, 97 | isSecure INTEGER, 98 | isHttpOnly INTEGER, 99 | FOREIGN KEY(crawl_id) REFERENCES crawl(id), 100 | FOREIGN 
-------------------------------------------------------------------------------- /automation/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/utilities/__init__.py -------------------------------------------------------------------------------- /automation/utilities/db_utils.py: --------------------------------------------------------------------------------
1 | import sqlite3
2 | import os
3 | import plyvel
4 |
5 |
6 | def query_db(db, query, params=None):
7 | """Run a query against the given db.
8 |
9 | If params is not None, securely construct a query from the given
10 | query string and params.
11 | """
12 | with sqlite3.connect(db) as con:
13 | if params is None:
14 | rows = con.execute(query).fetchall()
15 | else:
16 | rows = con.execute(query, params).fetchall()
17 | return rows
18 |
19 |
20 | def get_javascript_content(data_directory):
21 | """Yield key, value pairs from the deduplicated leveldb content database
22 |
23 | Parameters
24 | ----------
25 | data_directory : str
26 | root directory of the crawl files containing `javascript.ldb`
27 | """
28 | db_path = os.path.join(data_directory, 'javascript.ldb')
29 | db = plyvel.DB(db_path,
30 | create_if_missing=False,
31 | compression='snappy')
32 | for content_hash, content in db.iterator():
33 | yield content_hash, content
34 | db.close()
35 |
36 |
37 | def get_javascript_entries(db, all_columns=False):
38 | if all_columns:
39 | select_columns = "*"
40 | else:
41 | select_columns = "script_url, symbol, operation, value, arguments"
42 |
43 | return query_db(db, "SELECT %s FROM javascript" % select_columns)
44 |
45 |
46 | def any_command_failed(db):
47 | """Returns True if any command in a given database failed"""
48 | rows = query_db(db, "SELECT * FROM CrawlHistory;")
49 | for row in rows:
50 | if row[3] != 1:
51 | return True
52 | return False
53 |
-------------------------------------------------------------------------------- /automation/utilities/domain_utils.py: --------------------------------------------------------------------------------
1 | from publicsuffix import PublicSuffixList, fetch
2 | from ipaddress import ip_address
3 | from urlparse import urlparse
4 | from functools import wraps
5 | import tempfile
6 | import codecs
7 | import os
8 |
9 | # We cache the Public Suffix List in temp directory
10 | PSL_CACHE_LOC = os.path.join(tempfile.gettempdir(),'public_suffix_list.dat')
11 |
12 | def get_psl():
13 | """
14 | Grabs an updated public suffix list.
15 | """
16 | if not os.path.isfile(PSL_CACHE_LOC):
17 | print "%s does not exist, downloading a copy."
% PSL_CACHE_LOC 18 | psl_file = fetch() 19 | with codecs.open(PSL_CACHE_LOC, 'w', encoding='utf8') as f: 20 | f.write(psl_file.read()) 21 | print "Using psl from cache: %s" % PSL_CACHE_LOC 22 | psl_cache = codecs.open(PSL_CACHE_LOC, encoding='utf8') 23 | return PublicSuffixList(psl_cache) 24 | 25 | def load_psl(function): 26 | @wraps(function) 27 | def wrapper(*args, **kwargs): 28 | if not kwargs.has_key('psl'): 29 | if wrapper.psl is None: 30 | wrapper.psl = get_psl() 31 | return function(*args, psl=wrapper.psl, **kwargs) 32 | else: 33 | return function(*args, **kwargs) 34 | wrapper.psl = None 35 | return wrapper 36 | 37 | def is_ip_address(hostname): 38 | """ 39 | Check if the given string is a valid IP address 40 | """ 41 | try: 42 | ip_address(unicode(hostname)) 43 | return True 44 | except ValueError: 45 | return False 46 | 47 | @load_psl 48 | def get_ps_plus_1(url, **kwargs): 49 | """ 50 | Returns the PS+1 of the url. This will also return 51 | an IP address if the hostname of the url is a valid 52 | IP address. 53 | 54 | An (optional) PublicSuffixList object can be passed with keyword arg 'psl', 55 | otherwise a version cached in the system temp directory is used. 56 | """ 57 | if not kwargs.has_key('psl'): 58 | raise ValueError("A PublicSuffixList must be passed as a keyword argument.") 59 | hostname = urlparse(url).hostname 60 | if is_ip_address(hostname): 61 | return hostname 62 | elif hostname is None: 63 | # Possible reasons hostname is None, `url` is: 64 | # * malformed 65 | # * a relative url 66 | # * a `javascript:` or `data:` url 67 | # * many others 68 | return 69 | else: 70 | return kwargs['psl'].get_public_suffix(hostname) 71 | 72 | @load_psl 73 | def hostname_subparts(url, include_ps=False, **kwargs): 74 | """ 75 | Returns a list of slices of a url's hostname down to the PS+1 (or PS if include_ps) 76 | 77 | For example: http://a.b.c.d.com/path?query#frag would yield: 78 | [a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False 79 | [a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True 80 | 81 | An (optional) PublicSuffixList object can be passed with keyword arg 'psl'. 82 | otherwise a version cached in the system temp directory is used. 83 | """ 84 | if not kwargs.has_key('psl'): 85 | raise ValueError("A PublicSuffixList must be passed as a keyword argument.") 86 | hostname = urlparse(url).hostname 87 | 88 | # If an IP address, just return a single item list with the IP 89 | if is_ip_address(hostname): 90 | return [hostname] 91 | 92 | subparts = list() 93 | ps_plus_1 = kwargs['psl'].get_public_suffix(hostname) 94 | 95 | # We expect all ps_plus_1s to have at least one '.' 96 | # If they don't, the url was likely malformed, so we'll just return an 97 | # empty list 98 | if '.' 
not in ps_plus_1:
99 | return []
100 | subdomains = hostname[:-(len(ps_plus_1)+1)].split('.')
101 | if subdomains == ['']:
102 | subdomains = []
103 | for i in range(len(subdomains)):
104 | subparts.append('.'.join(subdomains[i:])+'.'+ps_plus_1)
105 | subparts.append(ps_plus_1)
106 | if include_ps:
107 | try:
108 | subparts.append(ps_plus_1[ps_plus_1.index('.')+1:])
109 | except ValueError:
110 | pass
111 | return subparts
112 |
113 | def get_stripped_url(url, scheme=False):
114 | """Returns a url stripped to (scheme)?+hostname+path"""
115 | purl = urlparse(url)
116 | surl = ''
117 | if scheme:
118 | surl += purl.scheme + '://'
119 | try:
120 | surl += purl.hostname + purl.path
121 | except TypeError:
122 | surl += purl.hostname
123 | return surl
124 |
125 | def get_stripped_urls(urls, scheme=False):
126 | """ Returns a set (or list) of urls stripped to (scheme)?+hostname+path """
127 | new_urls = list()
128 | for url in urls:
129 | new_urls.append(get_stripped_url(url, scheme))
130 | if type(urls) == set:
131 | return set(new_urls)
132 | return new_urls
133 |
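A usage sketch for the helpers above, assuming the repository root is on the import path; both lazily fetch and cache the Public Suffix List via @load_psl when no psl keyword is given:

from automation.utilities.domain_utils import get_ps_plus_1, hostname_subparts

print get_ps_plus_1("http://a.b.news.bbc.co.uk/story")
# bbc.co.uk ('co.uk' is the public suffix)
print hostname_subparts("http://a.b.news.bbc.co.uk/story")
# ['a.b.news.bbc.co.uk', 'b.news.bbc.co.uk', 'news.bbc.co.uk', 'bbc.co.uk']
print get_ps_plus_1("http://192.168.1.1/index.html")
# 192.168.1.1 (IP addresses are returned as-is)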
-------------------------------------------------------------------------------- /demo.py: --------------------------------------------------------------------------------
1 | from automation import TaskManager, CommandSequence
2 |
3 | # The list of sites that we wish to crawl
4 | NUM_BROWSERS = 15
5 | # sites = ["https://securehomes.esat.kuleuven.be/~gacar/dev/test/sensor/"]
6 | sites = []
7 | csv_name = "top-1m.csv"
8 | no_of_sites = 100000
9 | for l in open(csv_name).readlines()[1:no_of_sites]:
10 | url = l.split(",")[-1].rstrip()
11 | sites.append("http://%s" % url)
12 | #sites = ['http://www.example.com',
13 | #'http://www.princeton.edu',
14 | #'http://citp.princeton.edu/']
15 |
16 | # Loads the manager preferences and NUM_BROWSERS copies of the default browser dictionaries
17 | manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
18 |
19 | # Update browser configuration (use this for per-browser settings)
20 | for i in xrange(NUM_BROWSERS):
21 | browser_params[i]['http_instrument'] = True # Record HTTP Requests and Responses
22 | browser_params[i]['disable_flash'] = False # Enable flash for all browsers
23 | browser_params[0]['headless'] = True # Launch only browser 0 headless
24 |
25 | # Update TaskManager configuration (use this for crawl-wide settings)
26 | manager_params['data_directory'] = '~/openwpm/'
27 | manager_params['log_directory'] = '~/openwpm/'
28 |
29 | # Instantiates the measurement platform
30 | # Commands time out by default after 60 seconds
31 | manager = TaskManager.TaskManager(manager_params, browser_params)
32 |
33 | # Visits the sites with all browsers simultaneously
34 | for site in sites:
35 | command_sequence = CommandSequence.CommandSequence(site)
36 |
37 | # Start by visiting the page
38 | command_sequence.get(sleep=0, timeout=60)
39 |
40 | # dump_profile_cookies/dump_flash_cookies closes the current tab.
41 | command_sequence.dump_profile_cookies(120)
42 |
43 | manager.execute_command_sequence(command_sequence, index='**') # ** = synchronized browsers
44 |
45 | # Shuts down the browsers and waits for the data to finish logging
46 | manager.close()
47 |
-------------------------------------------------------------------------------- /feature_extraction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/feature_extraction/__init__.py -------------------------------------------------------------------------------- /feature_extraction/utils.py: --------------------------------------------------------------------------------
1 | from tld import get_fld
2 | from urlparse import urlparse
3 | import ipaddress
4 | import json
5 |
6 | DISCONNECT_JSON = "adblock/disconnect.json"
7 |
8 |
9 | def get_ps1_or_host(url):
10 | if not url.startswith("http"):
11 | url = 'http://' + url
12 |
13 | try:
14 | return get_fld(url, fail_silently=False)
15 | except Exception:
16 | hostname = urlparse(url).hostname
17 | try:
18 | ipaddress.ip_address(hostname)
19 | return hostname
20 | except Exception:
21 | return None
22 |
23 |
24 | def is_third_party(url, site_url):
25 | # !!!: We return False when we have missing information
26 | if not site_url:
27 | return False
28 |
29 | site_ps1 = get_ps1_or_host(site_url)
30 | if site_ps1 is None:
31 | return False
32 |
33 | req_ps1 = get_ps1_or_host(url)
34 | if req_ps1 is None:
35 | # print url
36 | return False
37 | if (req_ps1 == site_ps1):
38 | return False
39 |
40 | return True
41 |
42 |
43 | def get_disconnect_blocked_hosts(disconnect_json=DISCONNECT_JSON):
44 | blocked_hosts = set()
45 | disconnect = json.loads(open(disconnect_json).read())
46 | categories = disconnect["categories"]
47 | for _, entries in categories.iteritems():
48 | for entry in entries:
49 | addresses = entry.values()
50 | for address in addresses:
51 | address.pop("dnt", None) # there's one such entry
52 | # and it's not a domain/host
53 | hosts_list = address.values()
54 | blocked_hosts.update(hosts_list[0])
55 |
56 | print len(blocked_hosts), "blocked hosts"
57 | # note that disconnect keeps a list of blocked hosts, not PS+1s
58 | assert "adwords.google.com" in blocked_hosts
59 | assert "facebook.com" in blocked_hosts
60 | return list(blocked_hosts)
61 |
62 |
63 | def is_blocked_by_disconnect_old(url, disconnect_blocked_hosts):
64 | return urlparse(url).hostname in disconnect_blocked_hosts
65 |
66 |
67 | def is_blocked_by_disconnect(url, disconnect_blocked_hosts):
68 | host = urlparse(url).hostname
69 | if host in disconnect_blocked_hosts:
70 | return True
71 | while True:
72 | # strip one subdomain at a time
73 | host = host.split(".", 1)[-1] # take foo.com from bar.foo.com
74 | if "." not in host:
75 | return False
76 | if host in disconnect_blocked_hosts:
77 | return True
78 | return False # this shouldn't happen unless we are provided a corrupt hostname
79 |
80 |
81 | if __name__ == '__main__':
82 | # Test for the is_blocked_by_disconnect
83 | # TODO: move to a separate file
84 | assert is_blocked_by_disconnect("http://adwords.google.com", ["facebook.com", "adwords.google.com"])
85 | assert not is_blocked_by_disconnect("http://example.com", ["facebook.com", "google.com"])
86 | assert not is_blocked_by_disconnect("http://8.8.8.8", ["facebook.com", "google.com"])
87 | disconnect_blocked_hosts = get_disconnect_blocked_hosts()
88 | assert is_blocked_by_disconnect("https://tps40.doubleverify.com/visit.js",
89 | disconnect_blocked_hosts)
90 | assert is_blocked_by_disconnect("https://pagead2.googlesyndication.com/bg/CI_hqThbQjBwoUSK10cIsovHByRI4InaU0wolTzGCLU.js",
91 | disconnect_blocked_hosts)
92 | assert not is_blocked_by_disconnect("http://bar-foo.com", ["foo.com"])
93 | assert not is_blocked_by_disconnect("http://oo.com", ["foo.com"])
94 | assert is_blocked_by_disconnect("http://bar.foo.com", ["foo.com"])
95 | assert is_blocked_by_disconnect("http://sub.bar.foo.com", ["foo.com"])
96 |
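get_disconnect_blocked_hosts above walks the nesting used by Disconnect's services.json; a hypothetical minimal entry to make that structure concrete (the org name and hosts are invented):

import json

# categories -> list of entries -> {org name: {homepage url: [hosts]}}
sample = json.loads("""{
  "categories": {
    "Advertising": [
      {"ExampleOrg": {"http://tracker.example/": ["tracker.example",
                                                  "cdn.tracker.example"]}}
    ]
  }
}""")
blocked = set()
for _, entries in sample["categories"].iteritems():
    for entry in entries:
        for address in entry.values():
            address.pop("dnt", None)  # drop the lone non-host key, as above
            blocked.update(address.values()[0])
assert blocked == set(["tracker.example", "cdn.tracker.example"])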
-------------------------------------------------------------------------------- /install-analysis.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # Download adblock lists
5 | mkdir -p feature_extraction/adblock
6 | wget https://easylist.to/easylist/easylist.txt -P feature_extraction/adblock
7 | wget https://easylist.to/easylist/easyprivacy.txt -P feature_extraction/adblock
8 | wget https://raw.githubusercontent.com/disconnectme/disconnect-tracking-protection/master/services.json -O feature_extraction/adblock/disconnect.json
9 | sudo pip install adblockparser
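The EasyList rules fetched above are meant for the adblockparser package installed on the last line; a hedged usage sketch, assuming the list was downloaded to the path created by this script:

from adblockparser import AdblockRules

with open("feature_extraction/adblock/easylist.txt") as f:
    rules = AdblockRules(f.read().splitlines())

# options such as 'script' and 'third-party' mirror Adblock Plus filter options
print rules.should_block("http://ads.example.com/banner.js",
                         {'script': True, 'third-party': True})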
-------------------------------------------------------------------------------- /install.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | echo "Would you like to install Adobe Flash Player? (Only required for crawls with Flash) [y,N]"
5 | read -s -n 1 response
6 | if [[ $response = "" ]] || [ $response == 'n' ] || [ $response == 'N' ]; then
7 | flash=false
8 | echo Not installing Adobe Flash Plugin
9 | elif [ $response == 'y' ] || [ $response == 'Y' ]; then
10 | flash=true
11 | echo Installing Adobe Flash Plugin
12 | sudo sh -c 'echo "deb http://archive.canonical.com/ubuntu/ trusty partner" >> /etc/apt/sources.list.d/canonical_partner.list'
13 | else
14 | echo Unrecognized response, exiting
15 | exit 1
16 | fi
17 |
18 | sudo apt-get update
19 |
20 | # npm is required for compiling the Firefox extension
21 | sudo apt-get install -y firefox htop git python-dev libxml2-dev libxslt-dev libffi-dev libssl-dev build-essential xvfb libboost-python-dev libleveldb-dev libjpeg-dev libgtk2.0-0 npm
22 |
23 | # For some versions of ubuntu, the package libleveldb1v5 isn't available. Use libleveldb1 instead.
24 | sudo apt-get install -y libleveldb1v5 || sudo apt-get install -y libleveldb1
25 |
26 | if [ "$flash" = true ]; then
27 | sudo apt-get install -y adobe-flashplugin
28 | fi
29 |
30 | # Check if we're running on continuous integration
31 | # Python requirements are already installed by .travis.yml on Travis
32 | if [ "$TRAVIS" != "true" ]; then
33 | wget https://bootstrap.pypa.io/get-pip.py
34 | sudo -H python get-pip.py
35 | rm get-pip.py
36 | sudo pip install -U -r requirements.txt
37 | fi
38 |
39 | # Install a specific version of Firefox known to work well with the selenium version pinned in requirements.txt
40 | if [ $(uname -m) == 'x86_64' ]; then
41 | echo Downloading 64-bit Firefox
42 | wget https://ftp.mozilla.org/pub/firefox/releases/45.9.0esr/linux-x86_64/en-US/firefox-45.9.0esr.tar.bz2
43 | else
44 | echo Downloading 32-bit Firefox
45 | wget https://ftp.mozilla.org/pub/firefox/releases/45.9.0esr/linux-i686/en-US/firefox-45.9.0esr.tar.bz2
46 | fi
47 | tar jxf firefox*.tar.bz2
48 | rm -rf firefox-bin
49 | mv firefox firefox-bin
50 | rm firefox*.tar.bz2
51 |
52 |
53 | # Fix naming issue (exists in 14.04 and 16.04)
54 | if [ ! -f /usr/bin/node ]; then
55 | sudo ln -s /usr/bin/nodejs /usr/bin/node
56 | fi
57 |
58 | # install jpm
59 | sudo npm install jpm -g
60 |
-------------------------------------------------------------------------------- /mobile_sensor_crawl.py: --------------------------------------------------------------------------------
1 | from automation import TaskManager, CommandSequence
2 |
3 | # number of browsers to run in parallel
4 | NUM_BROWSERS = 10
5 |
6 |
7 | sites = []
8 | csv_name = "top-1m.csv"
9 | no_of_sites = 100000 # crawl 100K sites
10 | for l in open(csv_name).readlines()[0:no_of_sites]:
11 | site = l.split(",")[-1].rstrip()
12 | sites.append(site)
13 |
14 | # Loads the manager preferences and NUM_BROWSERS copies of the default browser dictionaries
15 | manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
16 |
17 | # Update browser configuration (use this for per-browser settings)
18 | for i in xrange(NUM_BROWSERS):
19 | browser_params[i]['http_instrument'] = True # Record HTTP Requests and Responses
20 | browser_params[i]['disable_flash'] = True # Disable flash for all browsers
21 | browser_params[i]['js_instrument'] = True # Enable JS instrumentation
22 | browser_params[i]['save_javascript'] = True # save JS files
23 | browser_params[i]['headless'] = True # headless
24 | browser_params[i]['trigger_sensor_events'] = True # fake sensor events
25 | browser_params[i]['mobile_platform'] = "android" # or "iphone"
26 |
27 | # Update TaskManager configuration (use this for crawl-wide settings)
28 | manager_params['data_directory'] = '~/openwpm_mobile_100k/'
29 | manager_params['log_directory'] = '~/openwpm_mobile_100k/'
30 |
31 | # Instantiates the measurement platform
32 | # Commands time out by default after 60 seconds
33 | manager = TaskManager.TaskManager(manager_params, browser_params)
34 |
35 | # Visits the sites with all browsers simultaneously
36 | for rank, site in enumerate(sites, 1):
37 | url = "http://%s" % site
38 | command_sequence = CommandSequence.CommandSequence(url, reset=True)
39 |
40 | # Start by visiting the page
41 | command_sequence.get(sleep=10, timeout=60)
42 | # command_sequence.save_screenshot('%d_%s_screenshot' % (rank, site))
43 | # dump_profile_cookies/dump_flash_cookies closes the current tab.
44 | command_sequence.dump_profile_cookies(120) 45 | 46 | manager.execute_command_sequence(command_sequence) 47 | 48 | # Shuts down the browsers and waits for the data to finish logging 49 | manager.close() 50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | pyvirtualdisplay 3 | beautifulsoup4 4 | python-dateutil 5 | tld 6 | multiprocess 7 | dill 8 | pyamf 9 | psutil 10 | plyvel 11 | tblib 12 | tabulate 13 | pytest 14 | publicsuffix 15 | # Install specific version of selenium known to work well with the Firefox install we use 16 | selenium==2.53.0 17 | mmh3 18 | # IPython 6.0+ does not support python 2.7 19 | IPython>=5.0,<6.0 20 | # See https://github.com/ActiveState/appdirs/issues/89 21 | appdirs>=1.4.3 22 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/__init__.py -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import utilities 3 | from ..automation.utilities.platform_utils import create_xpi 4 | 5 | 6 | @pytest.fixture(scope="session", autouse=True) 7 | def prepare_test_setup(request): 8 | """Run an HTTP server during the tests.""" 9 | print "\nCalling create_xpi", create_xpi() 10 | print "\nStarting local_http_server" 11 | server, server_thread = utilities.start_server() 12 | 13 | def local_http_server_stop(): 14 | print "\nClosing server thread..." 
15 | server.shutdown()
16 | server_thread.join()
17 |
18 | request.addfinalizer(local_http_server_stop)
19 |
-------------------------------------------------------------------------------- /test/manual_test.py: --------------------------------------------------------------------------------
1 | from utilities import BASE_TEST_URL, start_server
2 | from conftest import create_xpi
3 | from os.path import dirname, join, realpath
4 | from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
5 | from selenium import webdriver
6 | import subprocess
7 | import atexit
8 |
9 | OPENWPM_LOG_PREFIX = "console.log: openwpm: "
10 | INSERT_PREFIX = "Array"
11 | BASE_DIR = dirname(dirname(realpath(__file__)))
12 | EXT_PATH = join(BASE_DIR, 'automation', 'Extension', 'firefox')
13 | FF_BIN_PATH = join(BASE_DIR, 'firefox-bin', 'firefox')
14 |
15 |
16 | class bcolors:
17 | HEADER = '\033[95m'
18 | OKBLUE = '\033[94m'
19 | OKGREEN = '\033[92m'
20 | WARNING = '\033[93m'
21 | FAIL = '\033[91m'
22 | ENDC = '\033[0m'
23 | BOLD = '\033[1m'
24 | UNDERLINE = '\033[4m'
25 |
26 |
27 | def get_command_output(command, cwd=None):
28 | popen = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
29 | stderr=subprocess.STDOUT, cwd=cwd)
30 | return iter(popen.stdout.readline, b"")
31 |
32 |
33 | def colorize(line):
34 | if INSERT_PREFIX in line: # print long DB insert lines in blue
35 | line = line.replace(INSERT_PREFIX, bcolors.OKBLUE + INSERT_PREFIX)
36 | if OPENWPM_LOG_PREFIX in line:
37 | line = line.replace(OPENWPM_LOG_PREFIX,
38 | OPENWPM_LOG_PREFIX + bcolors.OKGREEN)
39 | return line
40 |
41 |
42 | def start_webdriver(with_extension=False):
43 | """ Open a webdriver instance and a server for the test pages
44 |
45 | This is meant to be imported and run manually from a python or
46 | ipython shell. A webdriver instance is returned and both the webdriver
47 | and server will automatically clean up when the shell is exited.
48 |
49 | Parameters
50 | ----------
51 | with_extension : boolean
52 | Set to True to also load OpenWPM extension instrumentation
53 |
54 | Returns
55 | -------
56 | webdriver
57 | A selenium webdriver instance.
58 | """
59 | fb = FirefoxBinary(FF_BIN_PATH)
60 | server, thread = start_server()
61 |
62 | def register_cleanup(driver):
63 | driver.get(BASE_TEST_URL)
64 |
65 | def cleanup_server():
66 | print "Cleanup before shutdown..."
67 | server.shutdown()
68 | thread.join()
69 | print "...server shutdown"
70 | driver.quit()
71 | print "...webdriver closed"
72 |
73 | atexit.register(cleanup_server)
74 | return driver
75 |
76 | if not with_extension:
77 | return register_cleanup(webdriver.Firefox(firefox_binary=fb))
78 |
79 | # add openwpm extension to profile
80 | create_xpi()
81 | fp = webdriver.FirefoxProfile()
82 | ext_xpi = join(EXT_PATH, 'openwpm.xpi')
83 | fp.add_extension(extension=ext_xpi)
84 | fp.set_preference("extensions.@openwpm.sdk.console.logLevel", "all")
85 |
86 | return register_cleanup(
87 | webdriver.Firefox(firefox_binary=fb, firefox_profile=fp))
88 |
89 |
90 | def start_jpm():
91 | cmd_jpm_run = "jpm run --binary-args 'url %s' -b %s" % (BASE_TEST_URL,
92 | FF_BIN_PATH)
93 | server, thread = start_server()
94 | try:
95 | # http://stackoverflow.com/a/4417735/3104416
96 | for line in get_command_output(cmd_jpm_run, cwd=EXT_PATH):
97 | print colorize(line), bcolors.ENDC,
98 | except KeyboardInterrupt:
99 | print "Keyboard Interrupt detected, shutting down..."
100 | print "\nClosing server thread..."
101 | server.shutdown() 102 | thread.join() 103 | 104 | 105 | if __name__ == '__main__': 106 | import IPython 107 | import sys 108 | 109 | # TODO use some real parameter handling library 110 | if len(sys.argv) == 1: 111 | start_jpm() 112 | elif len(sys.argv) >= 2 and sys.argv[1] == '--selenium': 113 | if len(sys.argv) == 3 and sys.argv[2] == '--no-extension': 114 | driver = start_webdriver(False) 115 | else: 116 | driver = start_webdriver(True) 117 | print "\nDropping into ipython shell...." 118 | print " * Interact with the webdriver instance using `driver`" 119 | print " * The webdriver and test page server will close automatically" 120 | print " * Use `exit` to quit the ipython shell\n" 121 | IPython.embed() 122 | else: 123 | print ("Unrecognized arguments. Usage:\n" 124 | "python manual_test.py ('--selenium')? ('--no-extension')?") 125 | -------------------------------------------------------------------------------- /test/openwpmtest.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | import utilities 3 | import pytest 4 | import commands 5 | from ..automation import TaskManager 6 | 7 | 8 | class OpenWPMTest(object): 9 | NUM_BROWSERS = 1 10 | 11 | @pytest.fixture(autouse=True) 12 | def set_tmpdir(self, tmpdir): 13 | """Create a tmpdir fixture to be used in `get_test_config`. 14 | 15 | Based on: 16 | https://mail.python.org/pipermail/pytest-dev/2014-April/002484.html 17 | """ 18 | self.tmpdir = str(tmpdir) 19 | 20 | def visit(self, page_url, data_dir="", sleep_after=0): 21 | """Visit a test page with the given parameters.""" 22 | manager_params, browser_params = self.get_config(data_dir) 23 | manager = TaskManager.TaskManager(manager_params, browser_params) 24 | if not page_url.startswith("http"): 25 | page_url = utilities.BASE_TEST_URL + page_url 26 | manager.get(url=page_url, sleep=sleep_after) 27 | manager.close() 28 | return manager_params['db'] 29 | 30 | def get_test_config(self, data_dir="", 31 | num_browsers=NUM_BROWSERS): 32 | """Load and return the default test parameters.""" 33 | if not data_dir: 34 | data_dir = self.tmpdir 35 | manager_params, browser_params = TaskManager.\ 36 | load_default_params(num_browsers) 37 | manager_params['data_directory'] = data_dir 38 | manager_params['log_directory'] = data_dir 39 | browser_params[0]['headless'] = True 40 | manager_params['db'] = join(manager_params['data_directory'], 41 | manager_params['database_name']) 42 | return manager_params, browser_params 43 | 44 | def is_installed(self, pkg_name): 45 | """Check if a Linux package is installed.""" 46 | cmd = 'which %s' % pkg_name 47 | status, _ = commands.getstatusoutput(cmd) 48 | return False if status else True 49 | 50 | def assert_is_installed(self, pkg): 51 | assert self.is_installed(pkg), 'Cannot find %s in your system' % pkg 52 | 53 | def assert_py_pkg_installed(self, pkg): 54 | # some modules are imported using a different name than the ones used 55 | # at the installation. 56 | pkg_name_mapping = {"pyopenssl": "OpenSSL", 57 | "mitmproxy": "libmproxy", 58 | "beautifulsoup4": "bs4", 59 | "python-dateutil": "dateutil" 60 | } 61 | # get the mapped name if it exists. 
62 | pkg_importable = pkg_name_mapping.get(pkg.lower(), pkg) 63 | try: 64 | __import__(pkg_importable) 65 | except ImportError: 66 | pytest.fail("Cannot find python package %s in your system" % pkg) 67 | -------------------------------------------------------------------------------- /test/test_adblock_plus.py: -------------------------------------------------------------------------------- 1 | from urlparse import urlparse 2 | import pytest 3 | import os 4 | 5 | from ..automation import TaskManager 6 | from ..automation.Errors import BrowserConfigError 7 | from ..automation.utilities.platform_utils import fetch_adblockplus_list 8 | from ..automation.utilities import domain_utils, db_utils 9 | 10 | import utilities 11 | import expected 12 | from openwpmtest import OpenWPMTest 13 | 14 | psl = domain_utils.get_psl() 15 | 16 | 17 | class TestABP(OpenWPMTest): 18 | 19 | def get_config(self, data_dir=""): 20 | manager_params, browser_params = self.get_test_config(data_dir) 21 | browser_params[0]['http_instrument'] = True 22 | browser_params[0]['adblock-plus'] = True 23 | return manager_params, browser_params 24 | 25 | def test_list_fetch(self, tmpdir): 26 | data_dir = str(tmpdir) 27 | fetch_adblockplus_list(data_dir) 28 | assert os.path.isfile(os.path.join(data_dir, 'patterns.ini')) 29 | assert os.path.isfile(os.path.join(data_dir, 'elemhide.css')) 30 | 31 | def test_blocks_includes(self, tmpdir): 32 | data_dir = str(tmpdir) 33 | list_loc = os.path.join(data_dir, 'adblock_plus') 34 | manager_params, browser_params = self.get_config(data_dir) 35 | fetch_adblockplus_list(list_loc) 36 | browser_params[0]['adblock-plus_list_location'] = list_loc 37 | manager = TaskManager.TaskManager(manager_params, browser_params) 38 | manager.get(utilities.BASE_TEST_URL + '/abp/adblock_plus_test.html') 39 | manager.close() 40 | 41 | db = os.path.join(data_dir, manager_params['database_name']) 42 | rows = db_utils.query_db(db, "SELECT url FROM http_requests") 43 | urls = set() 44 | for url, in rows: 45 | ps1 = psl.get_public_suffix(urlparse(url).hostname) 46 | # exclude requests to safebrowsing and tracking protection backends 47 | if ps1 not in ("mozilla.com", "mozilla.net"): 48 | urls.add(url) 49 | assert urls == expected.adblockplus 50 | 51 | def test_error_with_missing_option(self): 52 | manager_params, browser_params = self.get_config() 53 | with pytest.raises(BrowserConfigError): 54 | manager = TaskManager.TaskManager(manager_params, browser_params) 55 | manager.close() 56 | 57 | def test_error_with_missing_list(self, tmpdir): 58 | data_dir = str(tmpdir) 59 | list_loc = os.path.join(data_dir, 'adblock_plus') 60 | manager_params, browser_params = self.get_config(data_dir) 61 | browser_params[0]['adblock-plus_list_location'] = list_loc 62 | with pytest.raises(BrowserConfigError): 63 | manager = TaskManager.TaskManager(manager_params, browser_params) 64 | manager.close() 65 | -------------------------------------------------------------------------------- /test/test_crawl.py: -------------------------------------------------------------------------------- 1 | from urlparse import urlparse 2 | import tarfile 3 | import pytest 4 | import os 5 | 6 | from ..automation import TaskManager 7 | from ..automation.utilities import domain_utils, db_utils 8 | from openwpmtest import OpenWPMTest 9 | 10 | 11 | TEST_SITES = [ 12 | 'http://google.com', 13 | 'http://facebook.com', 14 | 'http://youtube.com', 15 | 'http://yahoo.com', 16 | 'http://baidu.com', 17 | 'http://wikipedia.org', 18 | 'http://qq.com', 19 | 
'http://linkedin.com', 20 | 'http://taobao.com', 21 | 'http://twitter.com', 22 | 'http://live.com', 23 | 'http://amazon.com', 24 | 'http://sina.com.cn', 25 | 'http://google.co.in', 26 | 'http://hao123.com', 27 | 'http://blogspot.com', 28 | 'http://weibo.com', 29 | 'http://wordpress.com', 30 | 'http://yandex.ru', 31 | 'http://yahoo.co.jp' 32 | ] 33 | 34 | psl = domain_utils.get_psl() 35 | 36 | 37 | class TestCrawl(OpenWPMTest): 38 | """ Runs a short test crawl. 39 | 40 | This should be used to test any features that require real 41 | crawl data. This should be avoided if possible, as controlled 42 | tests will be easier to debug 43 | """ 44 | 45 | def get_config(self, data_dir=""): 46 | manager_params, browser_params = self.get_test_config(data_dir) 47 | browser_params[0]['profile_archive_dir'] =\ 48 | os.path.join(manager_params['data_directory'], 'browser_profile') 49 | browser_params[0]['http_instrument'] = True 50 | return manager_params, browser_params 51 | 52 | @pytest.mark.slow 53 | def test_browser_profile_coverage(self, tmpdir): 54 | """ Test the coverage of the browser's profile 55 | 56 | This verifies that Firefox's places.sqlite database contains 57 | all visited sites (with a few exceptions). If it does not, 58 | it is likely the profile is lost at some point during the crawl 59 | """ 60 | # Run the test crawl 61 | data_dir = os.path.join(str(tmpdir), 'data_dir') 62 | manager_params, browser_params = self.get_config(data_dir) 63 | manager = TaskManager.TaskManager(manager_params, browser_params) 64 | for site in TEST_SITES: 65 | manager.get(site) 66 | ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'], 67 | 'profile.tar.gz') 68 | manager.close() 69 | 70 | # Extract crawl profile 71 | with tarfile.open(ff_db_tar) as tar: 72 | tar.extractall(browser_params[0]['profile_archive_dir']) 73 | 74 | # Output databases 75 | ff_db = os.path.join(browser_params[0]['profile_archive_dir'], 76 | 'places.sqlite') 77 | crawl_db = manager_params['db'] 78 | 79 | # Grab urls from crawl database 80 | rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests") 81 | req_ps = set() # visited domains from http_requests table 82 | for url, in rows: 83 | req_ps.add(psl.get_public_suffix(urlparse(url).hostname)) 84 | 85 | hist_ps = set() # visited domains from CrawlHistory Table 86 | successes = dict() 87 | rows = db_utils.query_db(crawl_db, "SELECT arguments, bool_success " 88 | "FROM CrawlHistory WHERE command='GET'") 89 | for url, success in rows: 90 | ps = psl.get_public_suffix(urlparse(url).hostname) 91 | hist_ps.add(ps) 92 | successes[ps] = success 93 | 94 | # Grab urls from Firefox database 95 | profile_ps = set() # visited domains from firefox profile 96 | rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places") 97 | for host, in rows: 98 | try: 99 | profile_ps.add(psl.get_public_suffix(urlparse(host).hostname)) 100 | except AttributeError: 101 | pass 102 | 103 | # We expect urls to be in the Firefox profile if: 104 | # 1. We've made requests to it 105 | # 2. The url is a top_url we entered into the address bar 106 | # 3. The url successfully loaded (see: Issue #40) 107 | # 4. 
The site does not respond to the initial request with a 204 (won't show in FF DB)
108 | missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
109 | unexpected_missing_urls = set()
110 | for url in missing_urls:
111 | if successes[url] == 0 or successes[url] == -1:
112 | continue
113 |
114 | # Get the visit id for the url
115 | rows = db_utils.query_db(crawl_db,
116 | "SELECT visit_id FROM site_visits "
117 | "WHERE site_url = ?",
118 | ('http://' + url,))
119 |
120 | try:
121 | visit_id = rows[0][0]
122 | except Exception:
123 | visit_id = rows[0]
124 |
125 | rows = db_utils.query_db(crawl_db,
126 | "SELECT COUNT(*) FROM http_responses "
127 | "WHERE visit_id = ?",
128 | (visit_id,))
129 | if rows[0][0] > 1:
130 | continue
131 |
132 | rows = db_utils.query_db(crawl_db,
133 | "SELECT response_status, location FROM "
134 | "http_responses WHERE visit_id = ?",
135 | (visit_id,))
136 | response_status, location = rows[0]
137 | if response_status == 204:
138 | continue
139 | if location == 'http://': # site returned a blank redirect
140 | continue
141 | unexpected_missing_urls.add(url)
142 |
143 | assert len(unexpected_missing_urls) == 0
144 |
-------------------------------------------------------------------------------- /test/test_custom_function_command.py: --------------------------------------------------------------------------------
1 | import expected
2 | import utilities
3 | from ..automation import CommandSequence
4 | from ..automation import TaskManager
5 | from ..automation.utilities import db_utils
6 | from openwpmtest import OpenWPMTest
7 |
8 | url_a = utilities.BASE_TEST_URL + '/simple_a.html'
9 |
10 |
11 | class TestCustomFunctionCommand(OpenWPMTest):
12 | """Test `custom_function` command's ability to handle various inline functions"""
13 |
14 | def get_config(self, data_dir=""):
15 | return self.get_test_config(data_dir)
16 |
17 | def test_custom_function(self):
18 | """ Test `custom_function` with an inline function that collects links """
19 |
20 | from ..automation.SocketInterface import clientsocket
21 | def collect_links(table_name, scheme, **kwargs):
22 | """ Collect links with matching `scheme` and save in table `table_name` """
23 | driver = kwargs['driver']
24 | manager_params = kwargs['manager_params']
25 | link_elements = driver.find_elements_by_tag_name('a')
26 | link_urls = [element.get_attribute("href") for element in link_elements]
27 | link_urls = filter(lambda x: x.startswith(scheme+'://'), link_urls)
28 | current_url = driver.current_url
29 |
30 | sock = clientsocket()
31 | sock.connect(*manager_params['aggregator_address'])
32 |
33 | query = ("CREATE TABLE IF NOT EXISTS %s ("
34 | "top_url TEXT, link TEXT);" % table_name)
35 | sock.send((query, ()))
36 |
37 | for link in link_urls:
38 | query = ("INSERT INTO %s (top_url, link) "
39 | "VALUES (?, ?)" % table_name)
40 | sock.send((query, (current_url, link)))
41 | sock.close()
42 |
43 | manager_params, browser_params = self.get_config()
44 | manager = TaskManager.TaskManager(manager_params, browser_params)
45 | cs = CommandSequence.CommandSequence(url_a)
46 | cs.get(sleep=0, timeout=60)
47 | cs.run_custom_function(collect_links, ('page_links', 'http'))
48 | manager.execute_command_sequence(cs)
49 | manager.close()
50 | query_result = db_utils.query_db(manager_params['db'],
51 | "SELECT top_url, link FROM page_links;")
52 | assert expected.page_links == set(query_result)
53 |
-------------------------------------------------------------------------------- /test/test_env.py:
-------------------------------------------------------------------------------- 1 | import re 2 | from os.path import realpath, dirname, join, isfile, isdir 3 | from openwpmtest import OpenWPMTest 4 | 5 | 6 | class TestDependencies(OpenWPMTest): 7 | 8 | BASE_DIR = dirname(dirname(realpath(__file__))) 9 | 10 | def test_dependencies(self): 11 | self.assert_is_installed("npm") 12 | self.assert_is_installed("jpm") 13 | self.assert_is_installed('mitmdump') 14 | self.assert_is_installed('firefox') 15 | ff_bin_dir = join(self.BASE_DIR, "firefox-bin") 16 | assert isdir(ff_bin_dir) 17 | ff_binary = join(ff_bin_dir, "firefox") 18 | assert isfile(ff_binary) 19 | 20 | def test_py_pkgs(self): 21 | PY_REQUIREMENTS_TXT = join(self.BASE_DIR, "requirements.txt") 22 | assert isfile(PY_REQUIREMENTS_TXT) 23 | for line in open(PY_REQUIREMENTS_TXT): 24 | if line.startswith("#"): 25 | continue 26 | pkg = re.split(r'[>=<]', line.strip())[0] 27 | print "Checking Python package", pkg 28 | self.assert_py_pkg_installed(pkg) 29 | -------------------------------------------------------------------------------- /test/test_js_instrument.py: -------------------------------------------------------------------------------- 1 | from openwpmtest import OpenWPMTest 2 | from ..automation.utilities import db_utils 3 | 4 | GETS_AND_SETS = { 5 | ("window.test.prop1", "get", "prop1"), 6 | ("window.test.prop1", "set", "blah1"), 7 | ("window.test.prop1", "get", "blah1"), 8 | ("window.test.prop2", "get", "prop2"), 9 | ("window.test.prop2", "set", "blah2"), 10 | ("window.test.prop2", "get", "blah2"), 11 | ("window.test.objProp", "get", "{\"hello\":\"world\"}"), 12 | ("window.test.objProp", "set", "{\"key\":\"value\"}"), 13 | ("window.test.objProp", "get", "{\"key\":\"value\"}"), 14 | ("window.test.prop3", "get", "default-value"), 15 | ("window.test.prop3", "set", "blah3"), 16 | ("window.test.prop3", "get", "blah3"), 17 | ('window.test.method1', 'set', 'FUNCTION'), 18 | ('window.test.method1', 'set', 'now static'), 19 | ('window.test.method1', 'get', 'now static'), 20 | ('window.test.prop1', 'set', 'FUNCTION'), 21 | ('window.test.nestedObj', 'get', 22 | '{"prop1":"default1","prop2":"default2","method1":"FUNCTION"}') 23 | } 24 | 25 | METHOD_CALLS = { 26 | ('window.test.prop1', 'call', '{"0":"now accepting arugments"}'), 27 | ('window.test.method1', 'call', '{"0":"hello","1":"{\\"world\\":true}"}'), 28 | ('window.test.method1', 'call', '{"0":"new argument"}') 29 | } 30 | 31 | RECURSIVE_GETS_AND_SETS = { 32 | ("window.test2.nestedObj.prop1", "get", "default1"), 33 | ("window.test2.nestedObj.prop1", "set", "updatedprop1"), 34 | ("window.test2.nestedObj.prop1", "get", "updatedprop1"), 35 | ("window.test2.nestedObj.prop2", "get", "default2"), 36 | ("window.test2.nestedObj.method1", "set", "FUNCTION"), 37 | ("window.test2.nestedObj.doubleNested.prop1", "get", "double default"), 38 | ("window.test2.nestedObj.doubleNested.prop1", "set", "doubleprop1"), 39 | ("window.test2.nestedObj.doubleNested.prop1", "get", "doubleprop1"), 40 | ("window.test2.nestedObj.doubleNested.method1", "set", "FUNCTION") 41 | } 42 | 43 | RECURSIVE_METHOD_CALLS = { 44 | ('window.test2.nestedObj.method1', 'call', '{"0":"arg-before"}'), 45 | ('window.test2.nestedObj.method1', 'call', '{"0":"arg-after"}'), 46 | ('window.test2.nestedObj.doubleNested.method1', 'call', '{"0":"blah"}') 47 | } 48 | 49 | RECURSIVE_PROP_SET = { 50 | ('window.test2.l1.l2.l3.l4.l5.prop', 'get', 'level5prop'), 51 | ('window.test2.l1.l2.l3.l4.l5.l6', 'get', '{"prop":"level6prop"}') 52 | } 53 | 54 | 
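The assertions below reduce rows from the javascript table to (symbol, operation, value-or-arguments) triples; the grouping step on its own looks like this, assuming the repository root is on the import path (the database path is hypothetical):

from automation.utilities import db_utils

rows = db_utils.get_javascript_entries("/tmp/openwpm/crawl-data.sqlite")
gets_and_sets, calls = set(), set()
for script_url, symbol, operation, value, arguments in rows:
    if operation in ('get', 'set'):
        gets_and_sets.add((symbol, operation, value))
    elif operation == 'call':
        calls.add((symbol, operation, arguments))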
SET_PREVENT_CALLS = { 55 | (u'window.test3.method1', u'call', None), 56 | ('window.test3.obj1.method2', 'call', None) 57 | } 58 | 59 | SET_PREVENT_GETS_AND_SETS = { 60 | (u'window.test3.prop1', u'set', u'newprop1'), 61 | ('window.test3.method1', 'set(prevented)', 'FUNCTION'), 62 | ('window.test3.obj1', 'set(prevented)', '{"new":"object"}'), 63 | (u'window.test3.obj1.prop2', u'set', u'newprop2'), 64 | ('window.test3.obj1.method2', 'set(prevented)', 'FUNCTION'), 65 | ('window.test3.obj1.obj2', 'set(prevented)', '{"new":"object2"}'), 66 | (u'window.test3.prop1', u'get', u'newprop1'), 67 | ('window.test3.obj1.obj2', 'get', '{"testobj":"nested"}'), 68 | ('window.test3.obj1.prop2', 'get', 'newprop2'), 69 | } 70 | 71 | 72 | class TestJSInstrument(OpenWPMTest): 73 | 74 | def get_config(self, data_dir=""): 75 | manager_params, browser_params = self.get_test_config(data_dir) 76 | browser_params[0]['js_instrument'] = True 77 | manager_params['testing'] = True 78 | return manager_params, browser_params 79 | 80 | def test_instrument_object(self): 81 | """ Ensure instrumentObject logs all property gets, sets, and calls """ 82 | db = self.visit('/instrument_object.html') 83 | rows = db_utils.get_javascript_entries(db) 84 | 85 | # Check calls of non-recursive instrumentation 86 | observed_gets_and_sets = set() 87 | observed_calls = set() 88 | for script_url, symbol, operation, value, arguments in rows: 89 | if not symbol.startswith('window.test.'): 90 | continue 91 | if operation == 'get' or operation == 'set': 92 | observed_gets_and_sets.add((symbol, operation, value)) 93 | else: 94 | observed_calls.add((symbol, operation, arguments)) 95 | assert observed_calls == METHOD_CALLS 96 | assert observed_gets_and_sets == GETS_AND_SETS 97 | 98 | # Check calls of recursive instrumentation 99 | observed_gets_and_sets = set() 100 | observed_calls = set() 101 | for script_url, symbol, operation, value, arguments in rows: 102 | if not symbol.startswith('window.test2.nestedObj'): 103 | continue 104 | if operation == 'get' or operation == 'set': 105 | observed_gets_and_sets.add((symbol, operation, value)) 106 | else: 107 | observed_calls.add((symbol, operation, arguments)) 108 | assert observed_calls == RECURSIVE_METHOD_CALLS 109 | assert observed_gets_and_sets == RECURSIVE_GETS_AND_SETS 110 | 111 | # Check that calls not present after default recursion limit (5) 112 | # We should only see the window.test2.l1.l2.l3.l4.l5.prop access 113 | # and not window.test2.l1.l2.l3.l4.l5.l6.prop access. 114 | prop_access = set() 115 | for script_url, symbol, operation, value, arguments in rows: 116 | if not symbol.startswith('window.test2.l1'): 117 | continue 118 | prop_access.add((symbol, operation, value)) 119 | assert prop_access == RECURSIVE_PROP_SET 120 | 121 | # Check calls of object with sets prevented 122 | observed_gets_and_sets = set() 123 | observed_calls = set() 124 | for script_url, symbol, operation, value, arguments in rows: 125 | if not symbol.startswith('window.test3'): 126 | continue 127 | if operation == 'call': 128 | observed_calls.add((symbol, operation, arguments)) 129 | else: 130 | observed_gets_and_sets.add((symbol, operation, value)) 131 | assert observed_calls == SET_PREVENT_CALLS 132 | assert observed_gets_and_sets == SET_PREVENT_GETS_AND_SETS 133 | -------------------------------------------------------------------------------- /test/test_pages/abp/adblock_plus_test.html: -------------------------------------------------------------------------------- 1 | 2 | AdBlock Plus Test Page 3 |

If functioning correctly, we expect AdBlock Plus to prevent the included requests from succeeding.
Script 1:
Script 2:
Script 3:
-------------------------------------------------------------------------------- /test/test_pages/abp/adspot/1.js: -------------------------------------------------------------------------------- 1 | var include_1 = true; 2 | -------------------------------------------------------------------------------- /test/test_pages/abp/adsystem/3.js: -------------------------------------------------------------------------------- 1 | var include_3 = true; 2 | -------------------------------------------------------------------------------- /test/test_pages/abp/bannerads/2.js: -------------------------------------------------------------------------------- 1 | var include_2 = true; 2 | -------------------------------------------------------------------------------- /test/test_pages/audio_fingerprinting.html: --------------------------------------------------------------------------------

AudioContext Fingerprint Test Page
--------------------------------------------------------------------------------
/test/test_pages/battery_fingerprinting.html:
--------------------------------------------------------------------------------
Battery Fingerprinting Test Page
Charging?
Charging Level:
Charging Time:
Discharging Time:
-------------------------------------------------------------------------------- /test/test_pages/canvas_fingerprinting.html: --------------------------------------------------------------------------------

Canvas Fingerprinting Test Page
--------------------------------------------------------------------------------
/test/test_pages/expected_source.html:
--------------------------------------------------------------------------------
Simple Page A
Click me!
Click me also!
Click me for a JS alert!
Go to google.com
Go to example.com
--------------------------------------------------------------------------------
/test/test_pages/http_stacktrace.html:
--------------------------------------------------------------------------------
async load by script
The scripts on this page inject an image, an invisible pixel and a script.
30 | 31 | -------------------------------------------------------------------------------- /test/test_pages/http_test_page.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Simple HTTP Test page 5 | 6 | 7 | 8 | 9 |

This test page sources an image, script, and css and favicon resource. 10 | It also includes an iframe which contains the same image and a different 11 | script.

12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /test/test_pages/http_test_page_2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Simple HTTP Test page 2 5 | 6 | 7 |

This test page sources two different images and a script.


--------------------------------------------------------------------------------
/test/test_pages/js_call_stack.html:
--------------------------------------------------------------------------------
[Markup and the inline script (original lines 6-11) were lost in extraction; the recoverable page text:]

Simple Page A


--------------------------------------------------------------------------------
/test/test_pages/js_cookie.html:
--------------------------------------------------------------------------------
[Markup and the cookie-setting script (original lines 6-10) were lost in extraction; the recoverable page text:]

localStorage example


--------------------------------------------------------------------------------
/test/test_pages/lso/FlashCookie.swf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/test_pages/lso/FlashCookie.swf


--------------------------------------------------------------------------------
/test/test_pages/lso/flash-cookie.js:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/test_pages/lso/flash-cookie.js


--------------------------------------------------------------------------------
/test/test_pages/lso/setlso.html:
--------------------------------------------------------------------------------
[Markup and the LSO-setting script (original lines 10-20) were lost in extraction; the recoverable page text:]

Flash cookie example


--------------------------------------------------------------------------------
/test/test_pages/post_file_upload.html:
--------------------------------------------------------------------------------
[Markup and the inline script (original lines 7-11) were lost in extraction; the recoverable page text:]

File upload form test page

Submit a form with the given encoding type in the URL params.
[The upload form's markup (original lines 15-24) was lost in extraction.]


--------------------------------------------------------------------------------
/test/test_pages/post_request.html:
--------------------------------------------------------------------------------
[Markup and the inline script (original lines 7-18) were lost in extraction; the recoverable page text:]

POST request test page

Submit a form with the given encoding type in the URL params.
[The form markup (original lines 22-31) was lost in extraction.]
Automated tests use the following URLs to submit data in different encodings.
[The URL list (original lines 33-36) was lost in extraction.]


--------------------------------------------------------------------------------
/test/test_pages/post_request_ajax.html:
--------------------------------------------------------------------------------
[Markup and the AJAX script (original lines 10-50) were lost in extraction; the recoverable page text:]

Test page - POST request using AJAX

Automated tests use the following URLs to submit data in different formats using AJAX POST.
[The URL list (original lines 55-60) was lost in extraction.]


--------------------------------------------------------------------------------
/test/test_pages/property_enumeration.html:
--------------------------------------------------------------------------------
[Markup and the enumeration script (original lines 10-33) were lost in extraction; the recoverable page text:]

Property Enumeration Test Page



--------------------------------------------------------------------------------
/test/test_pages/sensor_value_test.html:
--------------------------------------------------------------------------------
[Markup and the sensor-display script (original lines 44-137) were lost in extraction; the recoverable page text:]

Sensor Value Ranges
Ambient light
Proximity
Battery
- Is battery in charge? unavailable
- Battery will be charged in unavailable seconds
- Battery will be discharged in unavailable seconds
- Current battery level: unavailable
Orientation
Accelerometer
Gyro


--------------------------------------------------------------------------------
/test/test_pages/sensors.html:
--------------------------------------------------------------------------------
[Markup and the event-listener script (original lines 6-18) were lost in extraction; the recoverable page text:]

localStorage example


--------------------------------------------------------------------------------
/test/test_pages/shared/test_favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/test_pages/shared/test_favicon.ico


--------------------------------------------------------------------------------
/test/test_pages/shared/test_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/test_pages/shared/test_image.png


--------------------------------------------------------------------------------
/test/test_pages/shared/test_image_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/test_pages/shared/test_image_2.png


--------------------------------------------------------------------------------
/test/test_pages/shared/test_script.js:
--------------------------------------------------------------------------------
1 | //A simple script
2 | window.test_script_loaded = true;
3 | console.log("test script loaded");
4 | 


--------------------------------------------------------------------------------
/test/test_pages/shared/test_script_2.js:
--------------------------------------------------------------------------------
 1 | // A second simple script
 2 | window.test_script_2_loaded = true;
 3 | console.log("test script 2 loaded");
 4 | 
 5 | var test = 1;
 6 | 
 7 | function test_function() {
 8 |     test = test + 1;
 9 |     console.log(test);
10 | }
11 | 
12 | test_function();
13 | test_function();
14 | test_function();
15 | 


--------------------------------------------------------------------------------
/test/test_pages/shared/test_style.css:
--------------------------------------------------------------------------------
1 | p {
2 |     color: red;
3 | }
4 | 


--------------------------------------------------------------------------------
/test/test_pages/shared/utils.js:
--------------------------------------------------------------------------------
1 | function getParameterByName(name) {
2 |     // http://stackoverflow.com/a/901144
3 |     name = name.replace(/[\[]/, "\\[").replace(/[\]]/, "\\]");
4 |     var regex = new RegExp("[\\?&]" + name + "=([^&#]*)"),
5 |         results = regex.exec(location.search);
6 |     return results === null ? "" : decodeURIComponent(results[1].replace(/\+/g, " "));
7 | }
8 | 


--------------------------------------------------------------------------------
/test/test_pages/simple_a.html:
--------------------------------------------------------------------------------
[Markup and the inline script (original lines 6-10) were lost in extraction; the recoverable page text:]

Simple Page A
Click me!
Click me also!
Click me for a JS alert!
Go to google.com
Go to example.com


--------------------------------------------------------------------------------
/test/test_pages/simple_b.html:
--------------------------------------------------------------------------------
[Markup and the inline script (original lines 6-10) were lost in extraction; the recoverable page text:]

Simple Page B


--------------------------------------------------------------------------------
/test/test_pages/simple_c.html:
--------------------------------------------------------------------------------
[Markup and the inline script (original lines 6-9) were lost in extraction; the recoverable page text:]

Simple Page C


--------------------------------------------------------------------------------
/test/test_pages/simple_d.html:
--------------------------------------------------------------------------------
[Markup and the inline script (original lines 6-9) were lost in extraction; the recoverable page text:]

Simple Page D


--------------------------------------------------------------------------------
/test/test_pages/stack.js:
--------------------------------------------------------------------------------
 1 | // A function in an external script
 2 | function js_check_navigator() {
 3 |     console.log(window.navigator.userAgent);
 4 |     var foo = eval("window.navigator.platform");
 5 | }
 6 | 
 7 | // call the above function
 8 | js_check_navigator();
 9 | 
10 | // use eval
11 | var bar = eval("window.navigator.buildID");
12 | 
13 | //use Function
14 | new Function("window.navigator.appVersion")();
15 | 


--------------------------------------------------------------------------------
/test/test_pages/webrtc_localip.html:
--------------------------------------------------------------------------------
[Markup and the WebRTC probing script (original lines 7-32) were lost in extraction; the recoverable page text:]

WebRTC Local IP Test Page
    
    
    --------------------------------------------------------------------------------
    /test/test_profile.py:
    --------------------------------------------------------------------------------
     1 | import pytest
     2 | from os.path import join, isfile
     3 | from ..automation import TaskManager
     4 | from ..automation.Errors import CommandExecutionError, ProfileLoadError
     5 | from openwpmtest import OpenWPMTest
     6 | 
     7 | 
     8 | # TODO update these tests to make use of blocking commands
     9 | class TestProfile(OpenWPMTest):
    10 | 
    11 |     def get_config(self, data_dir=""):
    12 |         manager_params, browser_params = self.get_test_config(data_dir)
    13 |         browser_params[0]['profile_archive_dir'] =\
    14 |             join(manager_params['data_directory'], 'browser_profile')
    15 |         return manager_params, browser_params
    16 | 
    17 |     def test_saving(self):
    18 |         manager_params, browser_params = self.get_config()
    19 |         manager = TaskManager.TaskManager(manager_params, browser_params)
    20 |         manager.get('http://example.com')
    21 |         manager.close()
22 |         assert isfile(join(browser_params[0]['profile_archive_dir'], 'profile.tar.gz'))
    23 | 
    24 |     def test_crash(self):
    25 |         manager_params, browser_params = self.get_config()
    26 |         manager_params['failure_limit'] = 0
    27 |         manager = TaskManager.TaskManager(manager_params, browser_params)
    28 |         with pytest.raises(CommandExecutionError):
    29 |             manager.get('http://example.com') # So we have a profile
    30 |             manager.get('example.com') # Selenium requires scheme prefix
    31 |             manager.get('example.com') # Requires two commands to shut down
    32 | 
    33 |     def test_crash_profile(self):
    34 |         manager_params, browser_params = self.get_config()
    35 |         manager_params['failure_limit'] = 2
    36 |         manager = TaskManager.TaskManager(manager_params, browser_params)
    37 |         try:
    38 |             manager.get('http://example.com') # So we have a profile
    39 |             manager.get('example.com') # Selenium requires scheme prefix
    40 |             manager.get('example.com') # Selenium requires scheme prefix
    41 |             manager.get('example.com') # Selenium requires scheme prefix
    42 |             manager.get('example.com') # Requires two commands to shut down
    43 |         except CommandExecutionError:
    44 |             pass
45 |         assert isfile(join(browser_params[0]['profile_archive_dir'], 'profile.tar.gz'))
    46 | 
    47 |     def test_profile_error(self):
    48 |         manager_params, browser_params = self.get_config()
    49 |         browser_params[0]['profile_tar'] = '/tmp/NOTREAL'
    50 |         with pytest.raises(ProfileLoadError):
    51 |             TaskManager.TaskManager(manager_params, browser_params)  # noqa
    52 | 
    53 |     def test_profile_saved_when_launch_crashes(self):
    54 |         manager_params, browser_params = self.get_config()
    55 |         browser_params[0]['proxy'] = True
    56 |         browser_params[0]['save_javascript'] = True
    57 |         manager = TaskManager.TaskManager(manager_params, browser_params)
    58 |         manager.get('http://example.com')
    59 | 
    60 |         # Kill the LevelDBAggregator
    61 |         # This will cause the proxy launch to crash
    62 |         manager.ldb_status_queue.put("DIE")
63 |         manager.browsers[0]._SPAWN_TIMEOUT = 2  # Have the timeout occur quickly
64 |         manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2  # Give up after fewer spawn attempts
    65 |         manager.get('example.com') # Cause a selenium crash to force browser to restart
    66 | 
    67 |         # The browser will fail to launch due to the proxy crashes
    68 |         try:
    69 |             manager.get('http://example.com')
    70 |         except CommandExecutionError:
    71 |             pass
    72 |         manager.close()
73 |         assert isfile(join(browser_params[0]['profile_archive_dir'], 'profile.tar.gz'))
    74 | 
75 |     # TODO: Check for Flash
76 |     # TODO: Check contents of profile (tests should fail anyway if profile doesn't contain everything)
    77 | 
    
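
Since several of the tests above only assert that profile.tar.gz exists, a quick way to inspect what was actually archived is Python's tarfile module. A sketch, not part of the test suite; the argument matches the `profile_archive_dir` used above:

import tarfile
from os.path import join

def list_profile_archive(profile_archive_dir):
    """Print the members of the profile.tar.gz produced by a crawl."""
    archive = join(profile_archive_dir, 'profile.tar.gz')
    with tarfile.open(archive, 'r:gz') as tar:
        for member in tar.getmembers():
            print(member.name)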
    
    --------------------------------------------------------------------------------
    /test/test_sensors.py:
    --------------------------------------------------------------------------------
     1 | import utilities
     2 | from openwpmtest import OpenWPMTest
     3 | from ..automation import TaskManager
     4 | from ..automation.utilities import db_utils
     5 | import json
     6 | # TODO: add test for setter instrumentation
     7 | 
     8 | 
     9 | class TestExtension(OpenWPMTest):
    10 |     NUM_BROWSERS = 1
    11 | 
    12 |     def get_config(self, data_dir=""):
    13 |         manager_params, browser_params = self.get_test_config(data_dir)
    14 |         browser_params[0]['js_instrument'] = True
    15 |         return manager_params, browser_params
    16 | 
    17 |     def test_sensor_probing(self, tmpdir):
    18 |         test_url = utilities.BASE_TEST_URL + '/sensors.html'
    19 |         db = self.visit(test_url, str(tmpdir))
    20 |         rows = db_utils.get_javascript_entries(db, all_columns=True)
    21 |         observed_sensor_apis = set()
    22 |         expected_apis = set(['deviceorientation', 'devicemotion',
    23 |                              'deviceproximity', 'devicelight'])
    24 |         for row in rows:
25 |             if row[9] == "window.addEventListener":  # col 9: symbol
26 |                 observed_sensor_apis.add(json.loads(row[12])["0"])  # col 12: args JSON; "0" = event name
27 |                 assert row[3] == test_url  # col 3: page URL
    28 |         assert observed_sensor_apis == expected_apis
    29 | 
    
    
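
Judging from the comparisons above, index 9 of each row is the symbol, index 12 the JSON-encoded call arguments (key "0" being the first argument), and index 3 the page URL. The same extraction done by column name, as a sketch against the `javascript` table (hypothetical database path):

import json
import sqlite3

def observed_event_listeners(db_path):
    """Collect the first argument of each window.addEventListener call."""
    conn = sqlite3.connect(db_path)
    rows = conn.execute(
        "SELECT arguments FROM javascript "
        "WHERE symbol = 'window.addEventListener'").fetchall()
    return {json.loads(arguments)["0"] for (arguments,) in rows}

# e.g.: observed_event_listeners('crawl-data.sqlite')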
    --------------------------------------------------------------------------------
    /test/test_storage_vectors.py:
    --------------------------------------------------------------------------------
      1 | import pytest
      2 | import utilities
      3 | from ..automation import CommandSequence
      4 | from ..automation import TaskManager
      5 | from ..automation.utilities import db_utils
      6 | from openwpmtest import OpenWPMTest
      7 | 
      8 | expected_lso_content_a = [
      9 |                1, # visit id
     10 |                u'localtest.me',
     11 |                u'FlashCookie.sol',
     12 |                u'localtest.me/FlashCookie.sol',
     13 |                u'test_key',
     14 |                u'REPLACEME']
     15 | 
     16 | expected_lso_content_b = [
     17 |                2, # visit id
     18 |                u'localtest.me',
     19 |                u'FlashCookie.sol',
     20 |                u'localtest.me/FlashCookie.sol',
     21 |                u'test_key',
     22 |                u'REPLACEME']
     23 | 
     24 | expected_js_cookie = (
     25 |              1, # visit id
     26 |              u'%s' % utilities.BASE_TEST_URL_DOMAIN,
     27 |              u'test_cookie',
     28 |              u'Test-0123456789',
     29 |              u'%s' % utilities.BASE_TEST_URL_DOMAIN,
     30 |              u'/')
     31 | 
     32 | 
     33 | class TestStorageVectors(OpenWPMTest):
     34 |     """ Runs some basic tests to check that the saving of
     35 |     storage vectors (i.e. Flash LSOs, profile cookies) works.
     36 | 
     37 |     NOTE: These tests are very basic and should be expanded
     38 |     on to check for completeness and correctness.
     39 |     """
     40 | 
     41 |     def get_config(self, data_dir=""):
     42 |         return self.get_test_config(data_dir)
     43 | 
     44 |     @pytest.mark.skip("Flash is not used for mobile crawls")
     45 |     def test_flash_cookies(self):
     46 |         """ Check that some Flash LSOs are saved and
     47 |         are properly keyed in db."""
     48 |         # Run the test crawl
     49 |         manager_params, browser_params = self.get_config()
     50 |         browser_params[0]['disable_flash'] = False
     51 |         manager = TaskManager.TaskManager(manager_params, browser_params)
     52 | 
     53 |         # Get a site we know sets Flash cookies and visit it twice
     54 |         lso_value_a = utilities.rand_str(8)
     55 |         expected_lso_content_a[5] = lso_value_a  # we'll expect this to be present
     56 |         qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
     57 |                                                           lso_value_a)
     58 |         test_url_a = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
     59 |         cs = CommandSequence.CommandSequence(test_url_a)
     60 |         cs.get(sleep=3, timeout=120)
     61 |         cs.dump_flash_cookies()
     62 |         manager.execute_command_sequence(cs)
     63 | 
     64 |         lso_value_b = utilities.rand_str(8)
     65 |         expected_lso_content_b[5] = lso_value_b  # we'll expect this to be present
     66 |         qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
     67 |                                                           lso_value_b)
     68 |         test_url_b = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
     69 |         cs = CommandSequence.CommandSequence(test_url_b)
     70 |         cs.get(sleep=3, timeout=120)
     71 |         cs.dump_flash_cookies()
     72 |         manager.execute_command_sequence(cs)
     73 | 
     74 |         manager.close()
     75 | 
76 |         # Check that some flash cookies are recorded
     77 |         qry_res = db_utils.query_db(manager_params['db'],
     78 |                                      "SELECT * FROM flash_cookies")
     79 |         lso_count = len(qry_res)
     80 |         assert lso_count == 2
     81 |         lso_content_a = list(qry_res[0][2:])  # Remove first two items
     82 |         lso_content_b = list(qry_res[1][2:])  # Remove first two items
     83 |         # remove randomly generated LSO directory name
     84 |         # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
     85 |         lso_content_a[3] = lso_content_a[3].split("/", 1)[-1]  # remove LSO dirname
     86 |         lso_content_b[3] = lso_content_b[3].split("/", 1)[-1]  # remove LSO dirname
     87 |         assert lso_content_a == expected_lso_content_a
     88 |         assert lso_content_b == expected_lso_content_b
     89 | 
     90 |     def test_profile_cookies(self):
     91 |         """ Check that some profile cookies are saved """
     92 |         # Run the test crawl
     93 |         manager_params, browser_params = self.get_config()
     94 |         manager = TaskManager.TaskManager(manager_params, browser_params)
     95 |         # TODO update this to local test site
     96 |         url = 'http://www.yahoo.com'
     97 |         cs = CommandSequence.CommandSequence(url)
     98 |         cs.get(sleep=3, timeout=120)
     99 |         cs.dump_profile_cookies()
    100 |         manager.execute_command_sequence(cs)
    101 |         manager.close()
    102 | 
103 |         # Check that some profile cookies are recorded
104 |         qry_res = db_utils.query_db(manager_params['db'],
105 |                                     "SELECT COUNT(*) FROM profile_cookies")
106 |         prof_cookie_count = qry_res[0][0]  # first row, first column of the COUNT query
107 |         assert prof_cookie_count > 0
    108 | 
    109 |     def test_js_profile_cookies(self):
    110 |         """ Check that profile cookies set by JS are saved """
    111 |         # Run the test crawl
    112 |         manager_params, browser_params = self.get_config()
    113 |         manager = TaskManager.TaskManager(manager_params, browser_params)
    114 |         url = utilities.BASE_TEST_URL + "/js_cookie.html"
    115 |         cs = CommandSequence.CommandSequence(url)
    116 |         cs.get(sleep=3, timeout=120)
    117 |         cs.dump_profile_cookies()
    118 |         manager.execute_command_sequence(cs)
    119 |         manager.close()
    120 |         # Check that the JS cookie we stored is recorded
    121 |         qry_res = db_utils.query_db(manager_params['db'], "SELECT * FROM profile_cookies")
    122 |         assert len(qry_res) == 1  # we store only one cookie
    123 |         cookies = qry_res[0]  # take the first cookie
    124 |         # compare URL, domain, name, value, origin, path
    125 |         assert cookies[2:8] == expected_js_cookie
    126 | 
    
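
The positional slice `cookies[2:8]` above can be made less brittle by selecting columns by name. A sketch, assuming the profile_cookies column names follow schema.sql (baseDomain, name, value, host, path; an assumption, verify against the schema):

import sqlite3

def get_profile_cookie(db_path, name):
    """Return (baseDomain, name, value, host, path) rows for `name`."""
    conn = sqlite3.connect(db_path)
    return conn.execute(
        "SELECT baseDomain, name, value, host, path FROM profile_cookies "
        "WHERE name = ?", (name,)).fetchall()

# e.g.: get_profile_cookie(manager_params['db'], 'test_cookie')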
    
    --------------------------------------------------------------------------------
    /test/test_trigger_sensor_events.py:
    --------------------------------------------------------------------------------
     1 | import pytest
     2 | import utilities
     3 | from openwpmtest import OpenWPMTest
     4 | from ..automation import TaskManager
     5 | from ..automation import CommandSequence
     6 | from ..automation.utilities import db_utils
     7 | 
     8 | 
     9 | class TestTriggerSensorEvents(OpenWPMTest):
    10 |     """Make sure that we trigger fake sensor events."""
    11 | 
    12 |     def get_config(self, data_dir=""):
    13 |         return self.get_test_config(data_dir)
    14 | 
    15 |     def test_trigger_sensor_events(self):
    16 |         manager_params, browser_params = self.get_config()
    17 |         browser_params[0]['trigger_sensor_events'] = True
    18 |         manager = TaskManager.TaskManager(manager_params, browser_params)
    19 |         test_url = utilities.BASE_TEST_URL + '/sensor_value_test.html'
    20 | 
    21 |         def get_text_from_el(driver, element_id):
    22 |             js_str = 'return document.getElementById("%s").innerHTML' %\
    23 |                 element_id
    24 |             return driver.execute_script(js_str)
    25 | 
    26 |         def check_trigger_sensor_events(**kwargs):
    27 |             """Check if we find the sensor values on the page"""
    28 |             driver = kwargs['driver']
    29 | 
    30 |             device_light_str = get_text_from_el(driver, "DeviceLight")
    31 |             assert "AmbientLight current Value: " in device_light_str
    32 |             assert "Max:" in device_light_str
    33 |             assert "Min:" in device_light_str
    34 | 
    35 |             device_proximity_str = get_text_from_el(driver, "DeviceProximity")
    36 |             assert "DeviceProximity current Value: " in device_proximity_str
    37 |             assert "Max:" in device_proximity_str
    38 |             assert "Min:" in device_proximity_str
    39 | 
    40 |             user_proximity_str = get_text_from_el(driver, "UserProximity")
    41 |             assert user_proximity_str == "UserProximity: true"
    42 | 
    43 |             batt_in_charge_str = get_text_from_el(driver, "in-charge")
    44 |             assert batt_in_charge_str != "unavailable"
    45 | 
    46 |             batt_charging_time_str = get_text_from_el(driver, "charging-time")
    47 |             assert batt_charging_time_str != "unavailable"
    48 | 
    49 |             batt_discharging_time_str = get_text_from_el(driver,
    50 |                                                          "discharging-time")
    51 |             assert batt_discharging_time_str != "unavailable"
    52 | 
    53 |             batt_level_str = get_text_from_el(driver, "battery-level")
    54 |             assert batt_level_str != "unavailable"
    55 | 
    56 |             assert "Z-axis: " in get_text_from_el(driver, "Orientation_a")
    57 |             assert "X-axis: " in get_text_from_el(driver, "Orientation_b")
    58 |             assert "Y-axis: " in get_text_from_el(driver, "Orientation_g")
    59 | 
    60 |             assert "AccelerometerIncludingGravity X-axis:" in\
    61 |                 get_text_from_el(driver, "Accelerometer_gx")
    62 |             assert "AccelerometerIncludingGravity Y-axis:" in\
    63 |                 get_text_from_el(driver, "Accelerometer_gy")
    64 |             assert "AccelerometerIncludingGravity Z-axis:" in\
    65 |                 get_text_from_el(driver, "Accelerometer_gz")
    66 | 
    67 |             assert "Accelerometer X-axis: " in\
    68 |                 get_text_from_el(driver, "Accelerometer_x")
    69 |             assert "Accelerometer Y-axis: " in\
    70 |                 get_text_from_el(driver, "Accelerometer_y")
    71 |             assert "Accelerometer Z-axis: " in\
    72 |                 get_text_from_el(driver, "Accelerometer_z")
    73 |             assert "Data Interval: " in\
    74 |                 get_text_from_el(driver, "Accelerometer_i")
    75 | 
    76 |             assert "Gyro X-axis: " in\
    77 |                 get_text_from_el(driver, "Gyro_x")
    78 |             assert "Gyro Y-axis: " in\
    79 |                 get_text_from_el(driver, "Gyro_y")
    80 |             assert "Gyro Z-axis: " in\
    81 |                 get_text_from_el(driver, "Gyro_z")
    82 | 
    83 |         cs = CommandSequence.CommandSequence(test_url, blocking=True)
    84 |         cs.get(sleep=5, timeout=60)
    85 |         cs.run_custom_function(check_trigger_sensor_events)
    86 |         manager.execute_command_sequence(cs)
    87 |         manager.close()
    88 |         assert not db_utils.any_command_failed(manager_params['db'])
    89 | 
    
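
The pattern above, a closure that receives the live WebDriver through kwargs['driver'], is the general mechanism for scripting in-page checks. A minimal sketch of the same pattern with a different assertion (the URL is a placeholder and `manager` is assumed to be built as in get_config()/TaskManager above):

def check_title(**kwargs):
    """Custom command: assert the loaded page has a non-empty title."""
    driver = kwargs['driver']
    assert driver.title != ""

cs = CommandSequence.CommandSequence('http://example.com', blocking=True)
cs.get(sleep=1, timeout=60)
cs.run_custom_function(check_title)
manager.execute_command_sequence(cs)  # `manager` built as in the tests above
manager.close()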
    
    --------------------------------------------------------------------------------
    /test/utilities.py:
    --------------------------------------------------------------------------------
     1 | import SimpleHTTPServer
     2 | import SocketServer
     3 | import threading
     4 | import os
     5 | from random import choice
     6 | from os.path import realpath, dirname
     7 | LOCAL_WEBSERVER_PORT = 8000
     8 | BASE_TEST_URL_DOMAIN = "localtest.me"
     9 | BASE_TEST_URL_NOPATH = "http://%s:%s" % (BASE_TEST_URL_DOMAIN,
    10 |                                          LOCAL_WEBSERVER_PORT)
    11 | BASE_TEST_URL = "%s/test_pages" % BASE_TEST_URL_NOPATH
    12 | 
    13 | 
    14 | class MyTCPServer(SocketServer.TCPServer):
    15 |     """Subclass TCPServer to be able to reuse the same port (Errno 98)."""
    16 |     allow_reuse_address = True
    17 | 
    18 | 
    19 | def start_server():
    20 |     """ Start a simple HTTP server to run local tests.
    21 | 
    22 |     We need this since page-mod events in the extension
    23 |     don't fire on `file://*`. Instead, point test code to
    24 |     `http://localtest.me:8000/test_pages/...`
    25 |     """
    26 |     print "Starting HTTP Server in a separate thread"
    27 |     # switch to test dir, this is where the test files are
    28 |     os.chdir(dirname(realpath(__file__)))
    29 |     Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
    30 |     server = MyTCPServer(("localhost", LOCAL_WEBSERVER_PORT), Handler)
    31 |     thread = threading.Thread(target=server.serve_forever)
    32 |     thread.daemon = True
    33 |     thread.start()
    34 |     print "...serving at port", LOCAL_WEBSERVER_PORT
    35 |     return server, thread
    36 | 
    37 | 
    38 | def rand_str(size=8):
    39 |     """Return random string with the given size."""
    40 |     RAND_CHARS = "abcdefghijklmnopqrstuvwxyz0123456789"
    41 |     return ''.join(choice(RAND_CHARS) for _ in range(size))
    42 | 
    
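
A hypothetical pytest fixture built on this helper (not part of the repo; shown only to illustrate how start_server() is meant to be used for a whole test session):

import pytest
import utilities

@pytest.fixture(scope="session", autouse=True)
def local_server():
    """Serve test_pages/ over http://localtest.me:8000 for the session."""
    server, thread = utilities.start_server()
    yield server
    server.shutdown()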
    
    --------------------------------------------------------------------------------