├── .gitignore
├── .travis.yml
├── CHANGELOG
├── EmulatingAndroidFonts.md
├── LICENSE
├── README.md
├── VERSION
├── __init__.py
├── automation
├── BrowserManager.py
├── CommandSequence.py
├── Commands
│ ├── __init__.py
│ ├── browser_commands.py
│ ├── command_executor.py
│ ├── profile_commands.py
│ └── utils
│ │ ├── XPathUtil.py
│ │ ├── __init__.py
│ │ ├── file_utils.py
│ │ ├── firefox_profile.py
│ │ ├── gen_utils.py
│ │ ├── lso.py
│ │ └── webdriver_extensions.py
├── DataAggregator
│ ├── DataAggregator.py
│ ├── LevelDBAggregator.py
│ └── __init__.py
├── DeployBrowsers
│ ├── __init__.py
│ ├── configure_firefox.py
│ ├── deploy_browser.py
│ ├── deploy_firefox.py
│ ├── firefox_extensions
│ │ ├── adblock_plus-2.7.xpi
│ │ ├── ghostery
│ │ │ ├── ghostery-5.4.10.xpi
│ │ │ └── store.json
│ │ └── https_everywhere-5.1.0.xpi
│ ├── screen_resolutions.txt
│ └── user_agent_strings.txt
├── Errors.py
├── Extension
│ └── firefox
│ │ ├── data
│ │ ├── content.js
│ │ ├── create_content_policy_table.sql
│ │ ├── create_cookies_table.sql
│ │ ├── create_http_requests_table.sql
│ │ ├── create_http_responses_table.sql
│ │ ├── create_javascript_table.sql
│ │ ├── create_pages_table.sql
│ │ ├── remove_webdriver_attributes.js
│ │ └── trigger_sensor_events.js
│ │ ├── doc
│ │ └── main.md
│ │ ├── index.js
│ │ ├── lib
│ │ ├── content-policy-instrument.js
│ │ ├── cookie-instrument.js
│ │ ├── http-instrument.js
│ │ ├── http-post-parser.js
│ │ ├── javascript-instrument.js
│ │ ├── loggingdb.js
│ │ ├── page-manager.js
│ │ └── socket.js
│ │ ├── node_modules
│ │ └── bufferpack
│ │ │ ├── .npmignore
│ │ │ ├── CHANGELOG
│ │ │ ├── LICENSE
│ │ │ ├── bufferpack.js
│ │ │ └── package.json
│ │ ├── package.json
│ │ └── test
│ │ └── test-main.js
├── MPLogger.py
├── Proxy
│ ├── MITMProxy.py
│ ├── __init__.py
│ ├── cert
│ │ ├── mitmproxy-ca-cert.cer
│ │ ├── mitmproxy-ca-cert.p12
│ │ ├── mitmproxy-ca-cert.pem
│ │ ├── mitmproxy-ca.pem
│ │ └── mitmproxy-dhparam.pem
│ ├── cert8.db
│ ├── deploy_mitm_proxy.py
│ ├── key3.db
│ └── mitm_commands.py
├── SocketInterface.py
├── TaskManager.py
├── __init__.py
├── default_browser_params.json
├── default_manager_params.json
├── schema.sql
└── utilities
│ ├── Cookie.py
│ ├── __init__.py
│ ├── build_cookie_table.py
│ ├── db_utils.py
│ ├── domain_utils.py
│ └── platform_utils.py
├── clustering
└── Clustering_JS_scripts.ipynb
├── demo.py
├── feature_extraction
├── SensorAccesByRankPlot.ipynb
├── __init__.py
├── extract_features.py
└── utils.py
├── install-analysis.sh
├── install.sh
├── mobile_sensor_crawl.py
├── requirements.txt
└── test
├── __init__.py
├── conftest.py
├── expected.py
├── manual_test.py
├── openwpmtest.py
├── test_adblock_plus.py
├── test_crawl.py
├── test_custom_function_command.py
├── test_disable_webdriver_self_id.py
├── test_env.py
├── test_extension.py
├── test_http_instrumentation.py
├── test_js_instrument.py
├── test_pages
├── abp
│ ├── adblock_plus_test.html
│ ├── adspot
│ │ └── 1.js
│ ├── adsystem
│ │ └── 3.js
│ └── bannerads
│ │ └── 2.js
├── audio_fingerprinting.html
├── battery_fingerprinting.html
├── canvas_fingerprinting.html
├── expected_source.html
├── http_stacktrace.html
├── http_test_page.html
├── http_test_page_2.html
├── instrument_object.html
├── js_call_stack.html
├── js_cookie.html
├── lso
│ ├── FlashCookie.swf
│ ├── flash-cookie.js
│ └── setlso.html
├── post_file_upload.html
├── post_request.html
├── post_request_ajax.html
├── property_enumeration.html
├── sensor_value_test.html
├── sensors.html
├── shared
│ ├── test_favicon.ico
│ ├── test_image.png
│ ├── test_image_2.png
│ ├── test_script.js
│ ├── test_script_2.js
│ ├── test_style.css
│ └── utils.js
├── simple_a.html
├── simple_b.html
├── simple_c.html
├── simple_d.html
├── stack.js
└── webrtc_localip.html
├── test_profile.py
├── test_sensors.py
├── test_simple_commands.py
├── test_storage_vectors.py
├── test_trigger_sensor_events.py
└── utilities.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # firefox directories
2 | firefox-bin/
3 |
4 | # VIM tmp files
5 | *~
6 | .*.sw*
7 |
8 | # A bug in selenium creates this on unix systems
9 | C:\\nppdf32Log\\debuglog.txt
10 |
11 | # PyCharm
12 | .idea/*
13 | *.idea
14 | */idea
15 |
16 | # Byte-compiled / optimized / DLL files
17 | __pycache__/
18 | *.py[cod]
19 |
20 | # C extensions
21 | *.so
22 |
23 | # Distribution / packaging
24 | .Python
25 | env/
26 | bin/
27 | build/
28 | develop-eggs/
29 | dist/
30 | eggs/
31 | #lib/
32 | #lib64/
33 | parts/
34 | sdist/
35 | var/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 |
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 |
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .coverage
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 |
52 | # Translations
53 | *.mo
54 |
55 | # Mr Developer
56 | .mr.developer.cfg
57 | .project
58 | .pydevproject
59 |
60 | # Rope
61 | .ropeproject
62 |
63 | # Django stuff:
64 | *.log
65 | *.pot
66 |
67 | # Sphinx documentation
68 | docs/_build/
69 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: required
2 | language: python
3 | os: linux
4 | dist: trusty
5 | group: deprecated-2017Q4
6 | env:
7 | # See, https://docs.travis-ci.com/user/speeding-up-the-build/
8 | # We need a balanced distribution of the tests
9 | # Once we add and remove tests, this distribution may become unbalanced.
10 | # Feel free to move tests around to make the running time of the jobs
11 | # as close as possible.
12 | - TESTS=test_[a-b,d-e]*
13 | # test_crawl.py is the longest running test.
14 | - TESTS=test_c*
15 | - TESTS=test_[f-h]*
16 | - TESTS=test_[i-z]*
17 | git:
18 | depth: 3
19 | before_install:
20 | - "export DISPLAY=:99.0"
21 | # https://github.com/npm/npm/issues/20203
22 | # !!! comment the following when the cert issue is fixed
23 | - "npm config set strict-ssl false"
24 | install:
25 | - echo "y" | ./install.sh
26 | - pip install -r requirements.txt
27 | before_script:
28 | - cd test
29 | script:
30 | - py.test -s -v --durations=10 $TESTS
31 |
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | v0.7.0 - 2016-11-15
2 | ======
3 |
4 | Changes:
5 | * Bugfixes to extension instrumentation where records would be dropped when
6 | the extension was under heavy load and fail to re-enable until the browser
7 | was restarted.
8 | * Bugfix to extension / socket interface
9 | * Add `run_custom_function` command
10 | * Using alternative serialization/parallelization with `dill` and
11 | `multiprocess`
12 | * Better documentation
13 | * Bugfixes to install script
14 | * Add `save_screenshot` and `dump_page_source` commands
15 | * Add Audio API instrumentation
16 | * Bugfix to `browse` command
17 | * Bugfix to extension instrumentation injection to avoid Security Errors
18 |
19 | v0.6.2 - 2016-04-08
20 | ======
21 |
22 | Changes:
23 | * Bugfix to browse command. Now supports sleeping after get.
24 |
25 | v0.6.1 - 2016-04-08
26 | ======
27 |
28 | Critical:
29 | * Bugfix in LevelDBAggregator preventing data loss
30 |
31 | Changes:
32 | * Bump to Firefox 45 & Selenium 2.53.0
33 | * Update certificate stored
34 | * Added sleep argument to `get` command
35 | * Added install script for development dependencies
36 | * Improved error handling in TaskManager and Proxy
37 | * Version bumps and bugfixes in HTTPS Everywhere, Ghostery, and ABP
38 | * Tests added!
39 | * Numerous bugfixes and improvements in Javascript Instrumentation
40 |
41 | v0.6.0 - 2015-12-22
42 | ======
43 |
44 | Changes:
45 | * Cleanup of Firefox prefs to make browsers faster and reduce phoning home
46 | * Use LevelDB for javascript file storage
47 | * Improved HTTP Cookie Parsing
48 | * Several bugfixes to extension instrumentation
49 | * Improved profile handling during shutdown and crashes
50 | * Improved handling of child Exceptions
51 | * Inital platform tests
52 | * Improvements to javascript instrumentation
53 |
54 | v0.5.1 - 2015-10-15
55 | ======
56 |
57 | Changes:
58 | * Save json serialized headers and fix cookie parsing
59 |
60 | v0.5.0 - 2015-10-14
61 | ======
62 |
63 | Changes:
64 | * Added support for saving all javascript files de-duplicated and compressed
65 | * Created two configuration dictionaries. One for individual browsers and
66 | another for the entire infrastructure
67 | * Support for using OpenWPM as a submodule
68 | * Firefox (v39) and Selenium (v2.47.1)
69 | * Added support for launching Ghostery, HTTPS Everywhere, and AdBlock Plus
70 | * Removed Random Extension Support
71 | * Bugfix for broken profile saving.
72 | * Bugfix for profile clearing when memory limits are exceeded
73 | * Numerous stability fixes
74 | * Full Logging support in all commands
75 |
76 | v0.4.0
77 | ======
78 |
79 | Changes:
80 | * Significant stability improvements for long crawls
81 | * Support for logging with logging module
82 | * A large number of bugfixes related to process handling
83 | * Prevention of a large number of stray tmp files/folders during long crawls
84 | * Process/memory watchdog to handle orphaned processes and keep memory usage
85 | reasonable
86 | * Numerous bugfixes for extension
87 | * Failure thresholds to prevent infinite loops of browser respawns or
88 | command execution attempts (instead, Errors and raised)
89 | * Script to install dependencies
90 | * API changes to command timeouts
91 | * Move SocketInterface from pickle to json serialization
92 |
93 | Known Issues:
94 | * Encoding issues cause a very small percentage of data to be dropped by the
95 | extension
96 | * Malformed queries are occassionally sent to the DataAggregator and are
97 | dropped. The cause is unknown.
98 | * Forking can be done in a more memory efficient way
99 |
100 | 0.3.1 - Fixes #5
101 | 0.3.0 - Experimental merge of Fourthparty + framework to allow additional
102 | javascript instrumentation.
103 | 0.2.3 - Timeout logging
104 | 0.2.2 - Browse command + better scrolling + bugfixes
105 | 0.2.1 - Support for MITMProxy v0.11 + minor bugfixes
106 | 0.2.0 - Complete re-write of HTTP Cookie parsing
107 | 0.1.1 - Simplfied load of default settings, including wiki demo
108 | 0.1.0 - Initial Public Release
109 |
--------------------------------------------------------------------------------
/EmulatingAndroidFonts.md:
--------------------------------------------------------------------------------
1 | To mitigate detection of the OpenWPM-Mobile by font-based fingerprinting,
2 | you may uninstall all fonts present on your crawler machine and install fonts
3 | extracted from a real Android device.
4 |
5 | ## 1 Extracting Android fonts:
6 |
7 | Connect the Android device you want to emulate in USB debugging mode.
8 | Copy the Android fonts from the phone using `adb`:
9 |
10 | ```
11 | mkdir android_fonts # create a directory for the font files
12 | cd android_fonts
13 | adb pull /system/fonts # copy the font files from the device
14 | ```
15 |
16 | ## 2 Adding Android fonts to the crawler machine:
17 |
18 | ```
19 | mv ~/.fonts ~/.fonts_BKP # back-up existing user-specific fonts - may or may not exist
20 | mkdir -p ~/.fonts # create the user-specific font directory
21 | cp android_fonts/* ~/.fonts # copy font files extracted from the Android device
22 | fc-cache -f -v # update the font cache
23 |
24 | ```
25 |
26 |
27 | ## 3 Comment out the aliases for `MS Gothic` and `MS PGothic` fonts in `/etc/fonts/conf.avail/30-cjk-aliases.conf`
28 |
29 | ```
30 |
31 |
44 | ...
45 |
58 | ```
59 |
60 | ## 4 Remove existing system-wide fonts:
61 | We need to empty `/usr/share/fonts` and `/usr/local/share/fonts`
62 |
63 | ```
64 | mkdir ~/usr_share_bkp
65 | mkdir ~/usr_local_share_bkp
66 | mv /usr/share/fonts/* ~/usr_share_bkp
67 | mv /usr/local/share/fonts* ~/usr_local_share_bkp
68 | fc-cache -f -v
69 | ```
70 |
71 | If you are using a non-Debian based distro, check `/etc/fonts/fonts.conf`
72 | for `` and move the font files in those dirs to a backup dir.
73 |
74 | ### Restoring old fonts after the crawl:
75 | mv ~/usr_share_bkp/* /usr/share/fonts/
76 | mv ~/usr_local_share_bkp/* /usr/local/share/fonts/
77 | mv ~/.fonts_BKP ~/.fonts
78 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | OpenWPM-Mobile [](https://travis-ci.org/sensor-js/OpenWPM-mobile)
2 | =======
3 |
4 | OpenWPM-Mobile is a mobile web privacy measurement framework that is based on
5 | [OpenWPM](https://github.com/citp/OpenWPM). OpenWPM-Mobile is developed for the paper titled "[`The Web's Sixth Sense: A Study of Scripts Accessing Smartphone Sensors`](https://sensor-js.xyz)" to measure the ecosystem of scripts accessing mobile sensors.
6 |
7 | ## Installation
8 |
9 | Run the following to install OpenWPM-Mobile.
10 |
11 | ```./install.sh```
12 |
13 | To install the analysis related packages and files:
14 |
15 | ```install-analysis.sh```
16 |
17 |
18 | ## Basic usage
19 |
20 | Edit [`mobile_sensor_crawl.py`](https://github.com/sensor-js/OpenWPM_mobile/blob/mobile_sensors/mobile_sensor_crawl.py) to change the crawl parameters, such as number of sites to crawl and the number of browsers to run in parallel.
21 |
22 | Then start a crawl by running:
23 |
24 | ```python mobile_sensor_crawl.py```
25 |
26 |
27 | ## Imitating Mobile Browser
28 | OpenWPM-Mobile takes several steps to realistically imitate Firefox for Android.
29 |
30 | This involves overriding navigator object’s user agent, platform,
31 | appVersion and appCodeName strings; matching the screen resolution,
32 | screen dimensions, pixel depth, color depth; enabling touch
33 | status; removing plugins and supported MIME types that may indicate a desktop browser.
34 |
35 | OpenWPM-Mobile also uses the preferences used to configure Firefox
36 | for Android such as hiding the scroll bars and disabling popup windows.
37 | We relied on the values provided in the [`mobile.js`](https://dxr.mozilla.org/mozilla-esr45/source/mobile/android/app/mobile.js) script found in the Firefox for Android source code repository.
38 |
39 | When running crawls with OpenWPM-Mobile we installed
40 | Android fonts on our crawler machines to mitigate font-based
41 | fingerprinting. You may follow the instructions provided in
42 | [EmulatingAndroidFonts.md](https://github.com/sensor-js/OpenWPM_mobile/blob/mobile_sensors/EmulatingAndroidFonts.md)
43 | to install Android fonts on your crawler machines.
44 |
45 | ## Running tests
46 |
47 | The following will run all the tests:
48 |
49 | ```pytest test```
50 |
51 | If you don't want to run the (slow) crawling test `test_crawl.py` execute the following:
52 |
53 | ```pytest test -m "not slow"```
54 |
55 | ## Data Analysis
56 |
57 | Consult to the [OpenWPM repository](https://github.com/citp/OpenWPM#instrumentation-and-data-access) for details of the data format.
58 |
59 | ### Feature extraction and clustering
60 |
61 | Follow the steps below to extract binary script features and cluster scripts similar using the methodology described in the [paper](https://sensor-js.xyz/ccs-18-a-study-of-scripts-accessing-smartphone-sensors.pdf).
62 |
63 | 1. Run the following command to extract features for scripts discovered in the crawl:
64 |
65 | ```python extract_features.py```
66 |
67 | Make sure to point to the correct database containing the crawl results inside [`extract_features.py`](https://github.com/sensor-js/OpenWPM_mobile/blob/mobile_sensors/feature_extraction/extract_features.py#L813).
68 |
69 | 2. Once features are extracted you can generate clusters from the extracted features by using the [`Clustering_JS_scripts.ipynb`](https://github.com/sensor-js/OpenWPM_mobile/blob/mobile_sensors/cluster_scripts/Clustering_JS_scripts.ipynb) Jupyter notebook.
70 |
71 | Make sure to point to the newly generated feature file (```features.csv```) from the step 1.
72 |
73 | ## Citation
74 | If you use OpenWPM-Mobile in your research, please cite our CCS 2018 paper titled [`The Web's Sixth Sense: A Study of Scripts Accessing Smartphone Sensors`](https://sensor-js.xyz/ccs-18-a-study-of-scripts-accessing-smartphone-sensors.pdf). You can use the following BibTeX.
75 |
76 | ```
77 | @inproceedings{sensor-js-2018,
78 | author = "Anupam Das and Gunes Acar and Nikita Borisov and Amogh Pradeep",
79 | title = "{The Web's Sixth Sense: A Study of Scripts Accessing Smartphone Sensors}",
80 | booktitle = {Proceedings of ACM CCS 2018},
81 | year = "2018",
82 | }
83 | ```
84 |
85 | ## License
86 |
87 | OpenWPM-Mobile is licensed under GNU GPLv3. Additional code has been included from
88 | [OpenWPM](https://github.com/citp/OpenWPM) (which OpenWPM-Mobile is based on),
89 | [FourthParty](https://github.com/fourthparty/fourthparty) and
90 | [Privacy Badger](https://github.com/EFForg/privacybadgerfirefox), all of which
91 | are licensed GPLv3+.
92 |
93 |
--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.7.0
2 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/__init__.py
--------------------------------------------------------------------------------
/automation/CommandSequence.py:
--------------------------------------------------------------------------------
1 | from Errors import CommandExecutionError
2 |
3 | class CommandSequence:
4 | """A CommandSequence wraps a series of commands to be performed
5 | on a visit to one top-level site into one logical
6 | "site visit," keyed by a visit id. An example of a CommandSequence
7 | that visits a page and dumps cookies modified on that visit would be:
8 |
9 | sequence = CommandSequence(url)
10 | sequence.get()
11 | sequence.dump_profile_cookies()
12 | task_manager.execute_command_sequence(sequence)
13 |
14 | CommandSequence guarantees that a series of commands will be performed
15 | by a single browser instance.
16 |
17 | NOTE: Commands dump_profile_cookies and dump_flash_cookies will close
18 | the current tab - any command that relies on the page still being open,
19 | like save_screenshot, extract_links, or dump_page_source, should be
20 | called prior to one of those two commands.
21 | """
22 |
23 | def __init__(self, url, reset=False, blocking=False):
24 | """Initialize command sequence.
25 |
26 | Parameters
27 | ----------
28 | url : str
29 | url of page visit the command sequence should execute on
30 | reset : bool
31 | True if browser should clear state and restart after sequence
32 | blocking : bool
33 | True if sequence should block parent process during execution
34 | """
35 | self.url = url
36 | self.reset = reset
37 | self.blocking = blocking
38 | self.commands_with_timeout = []
39 | self.total_timeout = 0
40 | self.contains_get_or_browse = False
41 |
42 | def get(self, sleep=0, timeout=60):
43 | """ goes to a url """
44 | self.total_timeout += timeout
45 | command = ('GET', self.url, sleep)
46 | self.commands_with_timeout.append((command, timeout))
47 | self.contains_get_or_browse = True
48 |
49 | def browse(self, num_links = 2, sleep=0, timeout=60):
50 | """ browse a website and visit links on the page """
51 | self.total_timeout += timeout
52 | command = ('BROWSE', self.url, num_links, sleep)
53 | self.commands_with_timeout.append((command, timeout))
54 | self.contains_get_or_browse = True
55 |
56 | def dump_flash_cookies(self, timeout=60):
57 | """ dumps the local storage vectors (flash, localStorage, cookies) to db
58 | Side effect: closes the current tab."""
59 | self.total_timeout += timeout
60 | if not self.contains_get_or_browse:
61 | raise CommandExecutionError("No get or browse request preceding "
62 | "the dump storage vectors command", self)
63 | command = ('DUMP_FLASH_COOKIES',)
64 | self.commands_with_timeout.append((command, timeout))
65 |
66 | def dump_profile_cookies(self, timeout=60):
67 | """ dumps from the profile path to a given file (absolute path)
68 | Side effect: closes the current tab."""
69 | self.total_timeout += timeout
70 | if not self.contains_get_or_browse:
71 | raise CommandExecutionError("No get or browse request preceding "
72 | "the dump storage vectors command", self)
73 | command = ('DUMP_PROFILE_COOKIES',)
74 | self.commands_with_timeout.append((command, timeout))
75 |
76 | def dump_profile(self, dump_folder, close_webdriver=False, compress=True, timeout=120):
77 | """ dumps from the profile path to a given file (absolute path) """
78 | self.total_timeout += timeout
79 | command = ('DUMP_PROF', dump_folder, close_webdriver, compress)
80 | self.commands_with_timeout.append((command, timeout))
81 |
82 | def extract_links(self, timeout=30):
83 | """Extracts links found on web page and dumps them externally"""
84 | self.total_timeout += timeout
85 | if not self.contains_get_or_browse:
86 | raise CommandExecutionError("No get or browse request preceding "
87 | "the dump storage vectors command", self)
88 | command = ('EXTRACT_LINKS',)
89 | self.commands_with_timeout.append((command, timeout))
90 |
91 | def save_screenshot(self, screenshot_name, timeout=30):
92 | """Saves screenshot of page to 'screenshots' directory in data directory."""
93 | self.total_timeout += timeout
94 | if not self.contains_get_or_browse:
95 | raise CommandExecutionError("No get or browse request preceding "
96 | "the save screenshot command", self)
97 | command = ('SAVE_SCREENSHOT', screenshot_name,)
98 | self.commands_with_timeout.append((command, timeout))
99 |
100 | def dump_page_source(self, dump_name, timeout=30):
101 | """Dumps rendered source of current page visit to 'sources' directory."""
102 | self.total_timeout += timeout
103 | if not self.contains_get_or_browse:
104 | raise CommandExecutionError("No get or browse request preceding "
105 | "the dump page source command", self)
106 | command = ('DUMP_PAGE_SOURCE', dump_name,)
107 | self.commands_with_timeout.append((command, timeout))
108 |
109 | def run_custom_function(self, function_handle, func_args=(), timeout=30):
110 | """Run a custom by passing the function handle"""
111 | self.total_timeout += timeout
112 | if not self.contains_get_or_browse:
113 | raise CommandExecutionError("No get or browse request preceding "
114 | "the dump page source command", self)
115 | command = ('RUN_CUSTOM_FUNCTION', function_handle, func_args)
116 | self.commands_with_timeout.append((command, timeout))
117 |
--------------------------------------------------------------------------------
/automation/Commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Commands/__init__.py
--------------------------------------------------------------------------------
/automation/Commands/command_executor.py:
--------------------------------------------------------------------------------
1 | import browser_commands
2 | import profile_commands
3 |
4 |
5 | def execute_command(command, webdriver, proxy_queue, browser_settings, browser_params, manager_params, extension_socket):
6 | """
7 | executes BrowserManager commands by passing command tuples into necessary helper function
8 | commands are of form (COMMAND, ARG0, ARG1, ...)
9 | the only imports in this file should be imports to helper libraries
10 | """
11 | if command[0] == 'GET':
12 | browser_commands.get_website(url=command[1], sleep=command[2], visit_id=command[3],
13 | webdriver=webdriver, proxy_queue=proxy_queue,
14 | browser_params=browser_params, extension_socket=extension_socket)
15 |
16 | if command[0] == 'BROWSE':
17 | browser_commands.browse_website(url=command[1], num_links=command[2], sleep=command[3],
18 | visit_id=command[4], webdriver=webdriver,
19 | proxy_queue=proxy_queue, browser_params=browser_params,
20 | manager_params=manager_params, extension_socket=extension_socket)
21 |
22 | if command[0] == 'DUMP_FLASH_COOKIES':
23 | browser_commands.dump_flash_cookies(start_time=command[1], visit_id=command[2],
24 | webdriver=webdriver, browser_params=browser_params,
25 | manager_params=manager_params)
26 |
27 | if command[0] == 'DUMP_PROFILE_COOKIES':
28 | browser_commands.dump_profile_cookies(start_time=command[1], visit_id=command[2],
29 | webdriver=webdriver, browser_params=browser_params,
30 | manager_params=manager_params)
31 |
32 | if command[0] == 'DUMP_PROF':
33 | profile_commands.dump_profile(browser_profile_folder=browser_params['profile_path'],
34 | manager_params=manager_params,
35 | browser_params=browser_params,
36 | tar_location=command[1], close_webdriver=command[2],
37 | webdriver=webdriver, browser_settings=browser_settings,
38 | compress=command[3],
39 | save_flash=browser_params['disable_flash'] is False)
40 |
41 | if command[0] == 'EXTRACT_LINKS':
42 | browser_commands.extract_links(webdriver, browser_params, manager_params)
43 |
44 | if command[0] == 'SAVE_SCREENSHOT':
45 | browser_commands.save_screenshot(screenshot_name=command[1], webdriver=webdriver,
46 | browser_params=browser_params, manager_params=manager_params)
47 |
48 | if command[0] == 'DUMP_PAGE_SOURCE':
49 | browser_commands.dump_page_source(dump_name=command[1], webdriver=webdriver,
50 | browser_params=browser_params, manager_params=manager_params)
51 |
52 | if command[0] == 'RUN_CUSTOM_FUNCTION':
53 | arg_dict = {"command": command,
54 | "driver": webdriver,
55 | "proxy_queue": proxy_queue,
56 | "browser_settings": browser_settings,
57 | "browser_params": browser_params,
58 | "manager_params": manager_params,
59 | "extension_socket": extension_socket}
60 | command[1](*command[2], **arg_dict)
61 |
--------------------------------------------------------------------------------
/automation/Commands/utils/XPathUtil.py:
--------------------------------------------------------------------------------
1 | # XPathUtil.py
2 | # A collecton of utilities to extract and parse
3 | # XPaths encountered while scraping.
4 | #
5 | # Steven Englehardt (github.com/englehardt)
6 | from bs4 import BeautifulSoup as bs
7 | import bs4
8 | import re
9 |
10 | def is_clickable(xpath):
11 | #We consider any xpath that has an 'a', 'button',
12 | #or 'input' tag to be clickable as it most likely
13 | #contains a link. It may make sense to see check
14 | # or other tags...
15 | index_regex = re.compile(r'\[[^\]]*\]') #match index and id brackets
16 | #check xpath for necessary tags
17 | temp = re.sub(index_regex,'',xpath)
18 | temp = temp.split('/')
19 | if 'a' in temp or 'button' in temp or 'input' in temp:
20 | return True
21 | return False
22 |
23 | # ExtractXPath(element, use_id)
24 | # - element: a bs4 tag node
25 | # - use_id: defaults True
26 | #
27 | # Traverses up the tag tree of a Beautiful Soup node
28 | # to return the XPath of that node.
29 | #
30 | # Use of ids is preferred when the xpath will be used
31 | # outside of BeautifulSoup. Since an id is unique to
32 | # all elements of the tree, it allows the use of a
33 | # wildcard for all parent nodes. This minimizes the
34 | # chances of incorrect indexing (which can occur if
35 | # javascript changes a page during processing).
36 |
37 | class ExtractXPathError(Exception):
38 | def __init__(self, value):
39 | self.value = value
40 | def __str__(self):
41 | return repr(self.value)
42 |
43 | def check_previous_tags(node, use_id=True):
44 | #index of node
45 | counter = 1
46 | for tag in node.previous_siblings:
47 | if type(tag) != bs4.element.Tag:
48 | continue
49 | elif tag.name == node.name:
50 | counter += 1
51 |
52 | #XPath name
53 | if counter > 1:
54 | xpath = node.name + '[' + str(counter) + ']'
55 | else:
56 | xpath = node.name
57 |
58 | return xpath
59 |
60 | def ExtractXPath(element, use_id = True):
61 | # Check that element is a tag node
62 | if type(element) != bs4.element.Tag:
63 | raise ExtractXPathError(str(type(element)) +
64 | ' is not a supported data type. Only tag nodes from the tag tree are accepted.')
65 |
66 | ##### Starting node
67 | #Check id first
68 | if use_id and element.get('id') != None:
69 | return '//*/' + element.name + '[@id=\"' + element.get('id') + '\"]'
70 |
71 | xpath = check_previous_tags(element)
72 |
73 | ##### Parent Nodes
74 | for parent in element.parents:
75 | #End of XPath - exclude from string
76 | if parent.name == '[document]':
77 | break
78 |
79 | #Check id first
80 | if use_id and parent.get('id') != None:
81 | return '//*/' + parent.name + '[@id=\"' + parent.get('id') + '\"]/' + xpath
82 |
83 | xpath = check_previous_tags(parent) + '/' + xpath
84 |
85 | xpath = '/' + xpath
86 | return xpath
87 |
88 | # xp1_wildcard adds wildcard functionality to XPath 1.0
89 | # strings using the limited function set supported by the 1.0
90 | # implementation.
91 | #
92 | # xp1_lowercase likewise adds lowercase functionality
93 | #
94 | # Hopefully you never need these...
95 |
96 | def xp1_lowercase(string):
97 | return 'translate('+ string + ", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
98 |
99 | # Converts a string with a wildcard in it to an XPath 1.0
100 | # compatible string *** ONLY SUPPORTS 1 WILDCARD ***
101 | # string: string w/ wildcard that you are searching for
102 | # attr: tag attribute you are searching for (e.g. 'text()' or '@id' or ...)
103 | def xp1_wildcard(attr, string, normalize=True):
104 | parts = string.split('*')
105 |
106 | if normalize:
107 | attr = 'normalize-space(' + attr + ')'
108 |
109 | if len(parts) != 2:
110 | print "ERROR: This function is meant to support 1 wildcard"
111 | return '[' + attr + '=' + string + ']'
112 | else:
113 | pt1 = ''
114 | pt2 = ''
115 |
116 | if parts[0] != '':
117 | pt1 = 'starts-with(' + attr + ', \'' + parts[0] + '\')'
118 | if parts[1] != '':
119 | pt2 = ('contains(substring(' + attr +
120 | ', string-length(' + attr + ')-'+ str(len(parts[1])-1) +
121 | '), \'' + parts[1] + '\')')
122 |
123 | if pt1 == '' and pt2 != '':
124 | return '[' + pt2 + ']'
125 | elif pt1 != '' and pt2 == '':
126 | return '[' + pt1 + ']'
127 | elif pt1 != '' and pt2 != '':
128 | return ('[' + pt1 + ' and ' + pt2 + ']')
129 | else:
130 | print "ERROR: The string is empty"
131 | return '[' + attr + '=' + string + ']'
132 |
133 | if __name__=='__main__':
134 | #Output some sample XPaths
135 | print "--- Sample XPaths ---"
136 | import urllib2
137 | import re
138 | from random import choice
139 | rsp = urllib2.urlopen('http://www.reddit.com/')
140 | if rsp.getcode() == 200:
141 | soup = bs(rsp.read(), 'lxml')
142 | elements = soup.findAll(text = re.compile('[A-Za-z0-9]{10,}'))
143 | for i in range(0,5):
144 | element = choice(elements).parent
145 | print "HTML"
146 | print element
147 | print "XPath"
148 | print ExtractXPath(element)
149 | print "**************"
150 |
--------------------------------------------------------------------------------
/automation/Commands/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Commands/utils/__init__.py
--------------------------------------------------------------------------------
/automation/Commands/utils/file_utils.py:
--------------------------------------------------------------------------------
1 | # A collection of file utilities
2 | import shutil
3 | import os
4 |
5 | def rmsubtree(location):
6 | """Clears all subfolders and files in location"""
7 | for root, dirs, files in os.walk(location):
8 | for f in files:
9 | os.unlink(os.path.join(root, f))
10 | for d in dirs:
11 | shutil.rmtree(os.path.join(root, d))
12 |
--------------------------------------------------------------------------------
/automation/Commands/utils/firefox_profile.py:
--------------------------------------------------------------------------------
1 | ### This is code adapted from KU Leuven crawler code written by
2 | ### Gunes Acar and Marc Juarez
3 | from glob import glob
4 | import sqlite3
5 | import time
6 | import os
7 |
8 | def tmp_sqlite_files_exist(path):
9 | """Check if temporary sqlite files(wal, shm) exist in a given path."""
10 | return glob(os.path.join(path, '*-wal')) or \
11 | glob(os.path.join(path, '*-shm'))
12 |
13 |
14 | def sleep_until_sqlite_checkpoint(profile_dir, timeout=60):
15 | """
16 | We wait until all the shm and wal files are checkpointed to DB.
17 | https://www.sqlite.org/wal.html#ckpt.
18 | """
19 | while (timeout > 0 and tmp_sqlite_files_exist(profile_dir)):
20 | time.sleep(1)
21 | timeout -= 1
22 | print "Waited for %s seconds for sqlite checkpointing" % (60 - timeout)
23 |
24 |
25 | def get_localStorage(profile_directory, mod_since):
26 | #TODO how to support modified since???
27 | ff_ls_file = os.path.join(profile_directory, 'webappsstore.sqlite')
28 | if not os.path.isfile(ff_ls_file):
29 | print "Cannot find localstorage DB %s" % ff_ls_file
30 | else:
31 | conn = sqlite3.connect(ff_ls_file)
32 | with conn:
33 | cur = conn.cursor()
34 | cur.execute('SELECT scope, KEY, value \
35 | FROM webappsstore2 \
36 | WHERE last;')
37 | rows = cur.fetchall()
38 | return rows
39 |
40 | def get_cookies(profile_directory, mod_since):
41 | cookie_db = os.path.join(profile_directory, 'cookies.sqlite')
42 | if not os.path.isfile(cookie_db):
43 | print "cannot find cookie.db", cookie_db
44 | else:
45 | conn = sqlite3.connect(cookie_db)
46 | with conn:
47 | c = conn.cursor()
48 | c.execute('SELECT baseDomain, name, value, host, path, expiry,\
49 | lastAccessed, creationTime, isSecure, isHttpOnly \
50 | FROM moz_cookies \
51 | WHERE lastAccessed > ?;',(int(mod_since*1000000),))
52 | rows = c.fetchall()
53 | return rows
54 |
--------------------------------------------------------------------------------
/automation/Commands/utils/gen_utils.py:
--------------------------------------------------------------------------------
1 | import smtplib
2 | import os
3 | from time import sleep
4 |
5 |
6 | def get_last_crawled(log_file):
7 | last_line = ""
8 | for line in open(log_file):
9 | if "EXECUTING COMMAND: ('GET'" in line:
10 | last_line = line
11 | return int(last_line.split(", ")[-1].split(")")[0])
12 |
13 |
14 | def poll_openwpm_log(log_file="~/openwpm/openwpm.log"):
15 | POLL_LOG_FREQ = 900 # sec
16 | log_file = os.path.expanduser(log_file)
17 | while True:
18 | last_crawled = get_last_crawled(log_file)
19 | print("last_crawled %s" % last_crawled)
20 | send_alert_email("Crawled %s sites" % last_crawled)
21 | sleep(POLL_LOG_FREQ)
22 |
23 |
24 | def send_alert_email(msg="Cannot reach the phone"):
25 | fromaddr = 'appmonit@gmail.com'
26 | toaddrs = 'appmonit@gmail.com'
27 | msg = 'Subject: %s\n\n%s' % ("[appmonit-alert]", msg)
28 | # Credentials (if needed)
29 | username = 'appmonit'
30 | password = 'appmonit1' # TODO change it
31 | # The actual mail send
32 | server = smtplib.SMTP('smtp.gmail.com:587')
33 | server.starttls()
34 | server.login(username, password)
35 | server.sendmail(fromaddr, toaddrs, msg)
36 | server.quit()
37 |
38 |
39 | if __name__ == '__main__':
40 | poll_openwpm_log()
41 |
42 |
--------------------------------------------------------------------------------
/automation/Commands/utils/lso.py:
--------------------------------------------------------------------------------
1 | ### This is code adapted from KU Leuven crawler code written by
2 | ### Gunes Acar and Marc Juarez
3 | from pyamf import sol
4 | import fnmatch
5 | import os
6 |
7 | #TODO: Linux only
8 | FLASH_DIRS = ['~/.macromedia/Flash_Player/#SharedObjects/']
9 |
10 | class FlashCookie(object):
11 | filename = ''
12 | domain = ''
13 | local_path = ''
14 | key = ''
15 | content = ''
16 |
17 | def gen_find_files(filepat, top):
18 | """
19 | http://www.dabeaz.com/generators/
20 | returns filenames that matches the given pattern under() a given dir
21 | """
22 | for path, _, filelist in os.walk(top):
23 | for name in fnmatch.filter(filelist, filepat):
24 | yield os.path.join(path, name)
25 |
26 | def get_flash_cookies(mod_since=0):
27 | """Return a list of Flash cookies (Local Shared Objects)."""
28 | flash_cookies = list()
29 | for top_dir in FLASH_DIRS:
30 | top_dir = os.path.expanduser(top_dir)
31 | for lso_file in gen_find_files("*.sol", top_dir):
32 | mtime = os.path.getmtime(lso_file)
33 | if mtime > mod_since:
34 | try:
35 | flash_cookies.extend(parse_flash_cookies(lso_file))
36 | except (KeyboardInterrupt, SystemExit):
37 | raise
38 | except Exception as e:
39 | print "Exception reading", lso_file
40 | print e
41 | pass
42 | return flash_cookies
43 |
44 | def parse_flash_cookies(lso_file):
45 | lso_dict = sol.load(lso_file)
46 | flash_cookies = list()
47 | for k, v in lso_dict.iteritems():
48 | flash_cookie = FlashCookie()
49 | flash_cookie.local_path = lso_file.split("#SharedObjects/")[1]
50 | flash_cookie.filename = os.path.basename(lso_file)
51 | flash_cookie.domain = lso_file.split("#SharedObjects/")[1].split("/")[1]
52 | flash_cookie.key = unicode(k)
53 | try:
54 | flash_cookie.content = unicode(v)
55 | except UnicodeDecodeError:
56 | # obj is byte string
57 | ascii_text = str(v).encode('string_escape')
58 | flash_cookie.content = unicode(ascii_text)
59 |
60 | flash_cookies.append(flash_cookie)
61 | return flash_cookies
62 |
--------------------------------------------------------------------------------
/automation/Commands/utils/webdriver_extensions.py:
--------------------------------------------------------------------------------
1 | # A set of extensions to the functions normally provided by the selenium
2 | # webdriver. These are primarily for parsing and searching.
3 | from selenium.webdriver.support import expected_conditions as EC
4 | from selenium.webdriver.support.ui import WebDriverWait
5 | from selenium.common.exceptions import TimeoutException
6 | from selenium.common.exceptions import ElementNotVisibleException
7 | from selenium.common.exceptions import NoSuchElementException
8 | from urlparse import urljoin
9 | import random
10 | import time
11 |
12 | from ...utilities import domain_utils as du
13 | import XPathUtil
14 |
15 | #### Basic functions
16 | def scroll_down(driver):
17 | at_bottom = False
18 | while random.random() > .20 and not at_bottom:
19 | k = str(10 + int(200*random.random()))
20 | driver.execute_script("window.scrollBy(0,"+k+")")
21 | at_bottom = driver.execute_script("return (((window.scrollY + window.innerHeight ) +100 > document.body.clientHeight ))")
22 | time.sleep(0.5 + random.random())
23 |
24 | def scroll_to_bottom(driver):
25 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
26 | return
27 |
28 | def is_loaded(webdriver):
29 | return (webdriver.execute_script("return document.readyState") == "complete")
30 |
31 | def wait_until_loaded(webdriver, timeout, period=0.25):
32 | mustend = time.time() + timeout
33 | while time.time() < mustend:
34 | if is_loaded(webdriver): return True
35 | time.sleep(period)
36 | return False
37 |
38 | def get_intra_links(webdriver, url):
39 | ps1 = du.get_ps_plus_1(url)
40 | links = filter(lambda x: (x.get_attribute("href") and
41 | du.get_ps_plus_1(urljoin(url, x.get_attribute("href"))) == ps1),
42 | webdriver.find_elements_by_tag_name("a"))
43 | return links
44 |
45 | ##### Search/Block Functions
46 | # locator_type: a text representation of the standard
47 | # webdriver.find_element_by_* functions. You can either
48 | # import selenium.webdriver.common.by.By and use By.LINK_TEXT, etc.
49 | # or just remember the string representations. For example:
50 | # By.LINK_TEXT is 'link text'
51 | # By.CSS_SELECTOR is 'css selector'
52 | # By.NAME is 'name' ... and so on
53 | # locator: string that you are looking for
54 | def wait_and_find(driver, locator_type, locator, timeout=3, check_iframes=True):
55 | if is_found(driver, locator_type, locator, timeout):
56 | return driver.find_element(locator_type, locator)
57 | else:
58 | if check_iframes: #this may return the browser with an iframe active
59 | driver.switch_to_default_content()
60 | iframes = driver.find_elements_by_tag_name('iframe')
61 |
62 | for iframe in iframes:
63 | driver.switch_to_default_content()
64 | driver.switch_to_frame(iframe)
65 | if is_found(driver, locator_type, locator, timeout=0):
66 | return driver.find_element(locator_type, locator)
67 |
68 | #If we get here, search also fails in iframes
69 | driver.switch_to_default_content()
70 | raise NoSuchElementException, "Element not found during wait_and_find"
71 |
72 | def is_found(driver, locator_type, locator, timeout=3):
73 | try:
74 | w = WebDriverWait(driver, timeout)
75 | w.until(lambda d: d.find_element(locator_type, locator))
76 | return True
77 | except TimeoutException:
78 | return False
79 |
80 | def is_visible(driver, locator_type, locator, timeout=3):
81 | try:
82 | w = WebDriverWait(driver, timeout)
83 | w.until(EC.visibility_of_element_located((locator_type, locator)))
84 | return True
85 | except TimeoutException:
86 | return False
87 |
88 | def title_is(driver, title, timeout=3):
89 | try:
90 | w = WebDriverWait(driver, timeout)
91 | w.until(EC.title_is(title))
92 | return True
93 | except TimeoutException:
94 | return False
95 |
96 | def title_contains(driver, title, timeout=3):
97 | try:
98 | w = WebDriverWait(driver, timeout)
99 | w.until(EC.title_contains(title))
100 | return True
101 | except TimeoutException:
102 | return False
103 |
104 | #Selenium requires an element to be visible and enabled to be
105 | #clickable. We extend that to require it to have a tag capable
106 | #of containing a link. NOTE: doesn't work 100%
107 | def is_clickable(driver, full_xpath, xpath, timeout = 1):
108 | try:
109 | w = WebDriverWait(driver, timeout)
110 | w.until(EC.element_to_be_clickable(('xpath',xpath)))
111 | return XPathUtil.is_clickable(full_xpath)
112 | except (TimeoutException, ElementNotVisibleException):
113 | return False
114 |
115 | #TODO Update this. No direct access to DB right now
116 | '''
117 | #get and set xpaths into xpath database
118 | def get_xpath(driver, url, name):
119 | cur = self.db.cursor()
120 | cur.execute("SELECT xpath FROM xpath WHERE url = ? AND name = ?",(url, name))
121 | response = cur.fetchone()
122 | if response == None:
123 | return None
124 | else:
125 | return response[0]
126 |
127 | def set_xpath(driver, url, name, xpath, absolute_xpath = None):
128 | cur = self.db.cursor()
129 | if self.mp_lock is not None:
130 | self.mp_lock.acquire()
131 | cur.execute("UPDATE xpath SET xpath = ?, absolute_xpath = ? \
132 | WHERE url = ? AND name = ?", (xpath, absolute_xpath, url, name))
133 | if cur.rowcount == 0: #occurs when record does not already exist
134 | cur.execute("INSERT INTO xpath (name, url, xpath, absolute_xpath) VALUES (?,?,?,?)",
135 | (name, url, xpath, absolute_xpath))
136 | self.db.commit()
137 | if self.mp_lock is not None:
138 | self.mp_lock.release()
139 | return cur.lastrowid
140 | '''
141 |
142 | #Click an xpath using javascript -- not working correctly
143 | #gets around visibility requirements of selenium.
144 | #def click_xpath(driver, xpath):
145 | # driver.execute_script('$(document.evaluate('+xpath+', document, null, 9, null).singleNodeValue).click();')
146 |
--------------------------------------------------------------------------------
/automation/DataAggregator/DataAggregator.py:
--------------------------------------------------------------------------------
1 | from ..SocketInterface import serversocket
2 | from ..MPLogger import loggingclient
3 | from sqlite3 import OperationalError
4 | from sqlite3 import ProgrammingError
5 | import sqlite3
6 | import time
7 | import os
8 |
9 |
10 | def DataAggregator(manager_params, status_queue, commit_batch_size=1000):
11 | """
12 | Receives SQL queries from other processes and writes them to the central database
13 | Executes queries until being told to die (then it will finish work and shut down)
14 | This process should never be terminated un-gracefully
15 | Currently uses SQLite but may move to different platform
16 |
17 | TaskManager configuration parameters
18 | is a queue connect to the TaskManager used for communication
19 | is the number of execution statements that should be made before a commit (used for speedup)
20 | """
21 |
22 | # sets up DB connection
23 | db_path = manager_params['database_name']
24 | db = sqlite3.connect(db_path, check_same_thread=False)
25 | curr = db.cursor()
26 |
27 | # sets up logging connection
28 | logger = loggingclient(*manager_params['logger_address'])
29 |
30 | # sets up the serversocket to start accepting connections
31 | sock = serversocket()
32 | status_queue.put(sock.sock.getsockname()) # let TM know location
33 | sock.start_accepting()
34 |
35 | counter = 0 # number of executions made since last commit
36 | commit_time = 0 # keep track of time since last commit
37 | while True:
38 | # received KILL command from TaskManager
39 | if not status_queue.empty():
40 | status_queue.get()
41 | sock.close()
42 | drain_queue(sock.queue, curr, logger)
43 | break
44 |
45 | # no command for now -> sleep to avoid pegging CPU on blocking get
46 | if sock.queue.empty():
47 | time.sleep(0.001)
48 |
49 | # commit every five seconds to avoid blocking the db for too long
50 | if counter > 0 and time.time() - commit_time > 5:
51 | db.commit()
52 | continue
53 |
54 | # process query
55 | query = sock.queue.get()
56 | process_query(query, curr, logger)
57 |
58 | # batch commit if necessary
59 | counter += 1
60 | if counter >= commit_batch_size:
61 | counter = 0
62 | commit_time = time.time()
63 | db.commit()
64 |
65 | # finishes work and gracefully stops
66 | db.commit()
67 | db.close()
68 |
69 |
70 | def process_query(query, curr, logger):
71 | """
72 | executes a query of form (template_string, arguments)
73 | query is of form (template_string, arguments)
74 | """
75 | if len(query) != 2:
76 | print "ERROR: Query is not the correct length"
77 | return
78 | statement = query[0]
79 | args = list(query[1])
80 | for i in range(len(args)):
81 | if type(args[i]) == str:
82 | args[i] = unicode(args[i], errors='ignore')
83 | elif callable(args[i]):
84 | args[i] = str(args[i])
85 | try:
86 | if len(args) == 0:
87 | curr.execute(statement)
88 | else:
89 | curr.execute(statement,args)
90 | except OperationalError as e:
91 | logger.error("Unsupported query" + '\n' + str(type(e)) + '\n' + str(e) + '\n' + statement + '\n' + str(args))
92 | pass
93 | except ProgrammingError as e:
94 | logger.error("Unsupported query" + '\n' + str(type(e)) + '\n' + str(e) + '\n' + statement + '\n' + str(args))
95 | pass
96 |
97 |
98 | def drain_queue(sock_queue, curr, logger):
99 | """ Ensures queue is empty before closing """
100 | time.sleep(3) # TODO: the socket needs a better way of closing
101 | while not sock_queue.empty():
102 | query = sock_queue.get()
103 | process_query(query, curr, logger)
104 |
--------------------------------------------------------------------------------
/automation/DataAggregator/LevelDBAggregator.py:
--------------------------------------------------------------------------------
1 | from ..SocketInterface import serversocket
2 | from ..MPLogger import loggingclient
3 | import plyvel
4 | import time
5 | import os
6 |
7 |
8 | def LevelDBAggregator(manager_params, status_queue, batch_size=100):
9 | """
10 | Receives pairs from other processes and writes them to the
11 | central database. Executes queries until being told to die (then it will
12 | finish work and shut down).This process should never be terminated
13 | un-gracefully.
14 |
15 | TaskManager configuration parameters
16 | is a queue connect to the TaskManager used for communication
17 | is the size of the write batch
18 | """
19 |
20 | # sets up logging connection
21 | logger = loggingclient(*manager_params['logger_address'])
22 |
23 | # sets up the serversocket to start accepting connections
24 | sock = serversocket()
25 | status_queue.put(sock.sock.getsockname()) # let TM know location
26 | sock.start_accepting()
27 |
28 | # sets up DB connection
29 | db_path = os.path.join(manager_params['data_directory'], 'javascript.ldb')
30 | db = plyvel.DB(db_path,
31 | create_if_missing = True,
32 | lru_cache_size = 10**9,
33 | write_buffer_size = 128*10**4,
34 | compression = 'snappy')
35 | batch = db.write_batch()
36 |
37 | counter = 0 # number of executions made since last write
38 | commit_time = 0 # keep track of time since last write
39 | while True:
40 | # received KILL command from TaskManager
41 | if not status_queue.empty():
42 | status_queue.get()
43 | sock.close()
44 | drain_queue(sock.queue, batch, db, counter, logger)
45 | break
46 |
47 | # no command for now -> sleep to avoid pegging CPU on blocking get
48 | if sock.queue.empty():
49 | time.sleep(0.1)
50 |
51 | # commit every five seconds to avoid blocking the db for too long
52 | if counter > 0 and time.time() - commit_time > 5:
53 | batch.write()
54 | batch = db.write_batch()
55 | continue
56 |
57 | # process record
58 | content, content_hash = sock.queue.get()
59 | counter = process_content(content, content_hash,
60 | batch, db, counter, logger)
61 |
62 | # batch commit if necessary
63 | if counter >= batch_size:
64 | counter = 0
65 | commit_time = time.time()
66 | batch.write()
67 | batch = db.write_batch()
68 |
69 | # finishes work and gracefully stops
70 | batch.write()
71 | db.close()
72 |
73 | def process_content(content, content_hash, batch, db, counter, logger):
74 | """
75 | adds content to the batch
76 | """
77 | content = content.encode('utf-8')
78 | content_hash = str(content_hash)
79 | if db.get(content_hash) is not None:
80 | return counter
81 |
82 | batch.put(content_hash, content)
83 | return counter + 1
84 |
85 | def drain_queue(sock_queue, batch, db, counter, logger):
86 | """ Ensures queue is empty before closing """
87 | time.sleep(3) # TODO: the socket needs a better way of closing
88 | while not sock_queue.empty():
89 | content, content_hash = sock_queue.get()
90 | counter = process_content(content, content_hash,
91 | batch, db, counter, logger)
92 |
--------------------------------------------------------------------------------
/automation/DataAggregator/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/DataAggregator/__init__.py
--------------------------------------------------------------------------------
/automation/DeployBrowsers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/DeployBrowsers/__init__.py
--------------------------------------------------------------------------------
/automation/DeployBrowsers/deploy_browser.py:
--------------------------------------------------------------------------------
1 | import deploy_firefox
2 | from ..Errors import BrowserConfigError
3 |
4 | def deploy_browser(status_queue, browser_params, manager_params, crash_recovery):
5 | """ receives a dictionary of browser parameters and passes it to the relevant constructor """
6 | if browser_params['browser'].lower() == 'chrome':
7 | raise BrowserConfigError("Chrome is not supported. OpenWPM currently "
8 | "only supports measurement with Firefox.")
9 | if browser_params['browser'].lower() == 'firefox':
10 | return deploy_firefox.deploy_firefox(status_queue, browser_params, manager_params, crash_recovery)
11 |
--------------------------------------------------------------------------------
/automation/DeployBrowsers/firefox_extensions/adblock_plus-2.7.xpi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/DeployBrowsers/firefox_extensions/adblock_plus-2.7.xpi
--------------------------------------------------------------------------------
/automation/DeployBrowsers/firefox_extensions/ghostery/ghostery-5.4.10.xpi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/DeployBrowsers/firefox_extensions/ghostery/ghostery-5.4.10.xpi
--------------------------------------------------------------------------------
/automation/DeployBrowsers/firefox_extensions/https_everywhere-5.1.0.xpi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/DeployBrowsers/firefox_extensions/https_everywhere-5.1.0.xpi
--------------------------------------------------------------------------------
/automation/DeployBrowsers/screen_resolutions.txt:
--------------------------------------------------------------------------------
1 | 1920,1080
2 | 1366,768
3 | 1280,1024
4 | 1280,800
5 | 1024,768
6 |
--------------------------------------------------------------------------------
/automation/DeployBrowsers/user_agent_strings.txt:
--------------------------------------------------------------------------------
1 | Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0
2 | Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0
3 | Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0
4 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0
5 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0
6 | Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0
7 | Mozilla/5.0 (Windows NT 6.1; rv:28.0) Gecko/20100101 Firefox/28.0
8 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17
9 | Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36
10 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36
11 | Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0
12 | Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)
13 |
--------------------------------------------------------------------------------
/automation/Errors.py:
--------------------------------------------------------------------------------
1 | """ OpenWPM Custom Errors """
2 |
3 | class CommandExecutionError(Exception):
4 | """ Raise for errors related to executing commands """
5 | def __init__(self, message, command, *args):
6 | self.message = message
7 | self.command = command
8 | super(CommandExecutionError, self).__init__(message, command, *args)
9 |
10 | class ProfileLoadError(Exception):
11 | """ Raise for errors that occur while loading profile """
12 | def __init__(self, message, *args):
13 | self.message = message
14 | super(ProfileLoadError, self).__init__(message, *args)
15 |
16 | class BrowserConfigError(Exception):
17 | """ Raise for errors that occur from a misconfiguration of the browser """
18 | def __init__(self, message, *args):
19 | self.message = message
20 | super(BrowserConfigError, self).__init__(message, *args)
21 |
22 | class BrowserCrashError(Exception):
23 | """ Raise for non-critical crashes within the BrowserManager process """
24 | def __init__(self, message, *args):
25 | self.message = message
26 | super(BrowserCrashError, self).__init__(message, *args)
27 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/data/create_content_policy_table.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS content_policy(
2 | id INTEGER PRIMARY KEY ASC,
3 | crawl_id INTEGER,
4 | content_type INTEGER,
5 | content_location TEXT,
6 | request_origin TEXT,
7 | mime_type_guess TEXT,
8 | page_id INTEGER,
9 | visit_id INTEGER
10 | );
11 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/data/create_cookies_table.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS javascript_cookies(
2 | id INTEGER PRIMARY KEY ASC,
3 | crawl_id INTEGER,
4 | visit_id INTEGER,
5 | change TEXT,
6 | creationTime DATETIME,
7 | expiry DATETIME,
8 | is_http_only INTEGER,
9 | is_session INTEGER,
10 | last_accessed DATETIME,
11 | raw_host TEXT,
12 | expires INTEGER,
13 | host TEXT,
14 | is_domain INTEGER,
15 | is_secure INTEGER,
16 | name TEXT,
17 | path TEXT,
18 | policy INTEGER,
19 | status INTEGER,
20 | value TEXT
21 | );
22 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/data/create_http_requests_table.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS http_requests(
2 | id INTEGER PRIMARY KEY AUTOINCREMENT,
3 | crawl_id INTEGER NOT NULL,
4 | visit_id INTEGER NOT NULL,
5 | url TEXT NOT NULL,
6 | top_level_url TEXT,
7 | method TEXT NOT NULL,
8 | referrer TEXT NOT NULL,
9 | headers TEXT NOT NULL,
10 | is_XHR BOOLEAN,
11 | is_frame_load BOOLEAN,
12 | is_full_page BOOLEAN,
13 | is_third_party_channel BOOLEAN,
14 | is_third_party_window BOOLEAN,
15 | triggering_origin TEXT,
16 | loading_origin TEXT,
17 | loading_href TEXT,
18 | req_call_stack TEXT,
19 | content_policy_type INTEGER NOT NULL,
20 | post_body TEXT,
21 | time_stamp TEXT NOT NULL
22 | );
23 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/data/create_http_responses_table.sql:
--------------------------------------------------------------------------------
1 | /* TODO: link with requests */
2 | CREATE TABLE IF NOT EXISTS http_responses(
3 | id INTEGER PRIMARY KEY AUTOINCREMENT,
4 | crawl_id INTEGER NOT NULL,
5 | visit_id INTEGER NOT NULL,
6 | url TEXT NOT NULL,
7 | method TEXT NOT NULL,
8 | referrer TEXT NOT NULL,
9 | response_status INTEGER NOT NULL,
10 | response_status_text TEXT NOT NULL,
11 | is_cached BOOLEAN NOT NULL,
12 | headers TEXT NOT NULL,
13 | location TEXT NOT NULL,
14 | time_stamp TEXT NOT NULL,
15 | content_hash TEXT
16 | );
17 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/data/create_javascript_table.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS javascript(
2 | id INTEGER PRIMARY KEY,
3 | crawl_id INTEGER,
4 | visit_id INTEGER,
5 | script_url TEXT,
6 | script_line TEXT,
7 | script_col TEXT,
8 | func_name TEXT,
9 | script_loc_eval TEXT,
10 | call_stack TEXT,
11 | symbol TEXT,
12 | operation TEXT,
13 | value TEXT,
14 | arguments TEXT,
15 | time_stamp TEXT NOT NULL
16 | );
17 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/data/create_pages_table.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS pages(
2 | id INTEGER PRIMARY KEY ASC,
3 | crawl_id INTEGER,
4 | visit_id INTEGER,
5 | location TEXT,
6 | parent_id INTEGER
7 | );
8 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/data/remove_webdriver_attributes.js:
--------------------------------------------------------------------------------
1 | // We don't know the order the content scripts will load
2 | // so let's try to remove the attributes now (if they already exist)
3 | // or register an event handler if they don't.
4 | // * https://github.com/SeleniumHQ/selenium/blob/b82512999938d41f6765ce8017284dcabe437d4c/javascript/firefox-driver/extension/content/server.js#L49
5 | // * https://github.com/SeleniumHQ/selenium/blob/b82512999938d41f6765ce8017284dcabe437d4c/javascript/firefox-driver/extension/content/dommessenger.js#L98
6 | function getPageScript() {
7 | // return a string
8 | return "(" + function() {
9 | if ("webdriver" in navigator) {
10 | console.log("Webdriver attributes present, remove immediately");
11 | // Attributes can be removed immediately
12 | document.documentElement.removeAttribute("webdriver");
13 | delete window.navigator["webdriver"];
14 | console.log("Webdriver attributes removed!");
15 | } else {
16 | // Listener for `document` attribute
17 | document.addEventListener("DOMAttrModified", function monitor(ev) {
18 | console.log("Removing webdriver attribute from document");
19 | document.documentElement.removeAttribute("webdriver");
20 | document.removeEventListener("DOMAttrModified", monitor, false);
21 | }, false);
22 |
23 | // Prevent webdriver attribute from getting set on navigator
24 | var originalDefineProperty = Object.defineProperty;
25 | Object.defineProperty(Object, 'defineProperty', {
26 | value: function(obj, prop, descriptor) {
27 | if (obj == window.navigator && prop == 'webdriver') {
28 | console.log("Preventing definition of webdriver property on navigator.");
29 |
30 | // Return Object.defineProperty to original state
31 | Object.defineProperty(Object, 'defineProperty', {
32 | value: originalDefineProperty
33 | });
34 | return undefined;
35 | }
36 | return originalDefineProperty(obj, prop, descriptor);
37 | }
38 | });
39 | console.log("Webdriver attribute handlers started!");
40 | }
41 | } + "());";
42 | }
43 |
44 | function insertScript(text) {
45 | var parent = document.documentElement,
46 | script = document.createElement('script');
47 | script.text = text;
48 | script.async = false;
49 |
50 | parent.insertBefore(script, parent.firstChild);
51 | parent.removeChild(script);
52 | }
53 | insertScript(getPageScript());
54 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/data/trigger_sensor_events.js:
--------------------------------------------------------------------------------
1 | function getPageScript() {
2 | // return a string
3 |
4 | return "(" + function() {
5 | // Triggering sensor events every second after page load
6 | setInterval(trigger_sensor_events, 20);
7 |
8 | function trigger_sensor_events(){
9 | trigger_devicelight_event();
10 | // setTimeout(trigger_lightlevel_event, 100);
11 | setTimeout(trigger_deviceproximity_event, 200);
12 | setTimeout(trigger_userproximity_event, 400);
13 | setTimeout(trigger_deviceorientation_event, 600);
14 | setTimeout(trigger_devicemotion_event, 800);
15 | }
16 |
17 | function trigger_devicelight_event(){
18 | var devicelight_event = new DeviceLightEvent('devicelight', {
19 | 'value': 987,
20 | 'bubbles': true,
21 | 'cancelable': true
22 | });
23 | window.dispatchEvent(devicelight_event)
24 | }
25 |
26 | function trigger_lightlevel_event(){
27 | // This is not supported and causes JS error on Firefox
28 | // Let's not use it
29 | var lightlevel_event = new LightLevelEvent('lightlevel', {
30 | 'value': "bright",
31 | 'bubbles': true,
32 | 'cancelable': true
33 | });
34 | // window.dispatchEvent(lightlevel_event)
35 | }
36 |
37 | function trigger_deviceproximity_event(){
38 | // Firefox and Chrome on Android don't seem to support this event
39 | var deviceproximity_event = new DeviceProximityEvent('deviceproximity', {
40 | 'min': 0,
41 | 'max': 100,
42 | 'value': 3,
43 | 'bubbles': true,
44 | 'cancelable': true
45 | });
46 | window.dispatchEvent(deviceproximity_event)
47 | }
48 |
49 | function trigger_userproximity_event(){
50 | var userproximity_event = new UserProximityEvent('userproximity', {
51 | 'near': true,
52 | 'bubbles': true,
53 | 'cancelable': true
54 | });
55 | window.dispatchEvent(userproximity_event)
56 | }
57 |
58 | function trigger_deviceorientation_event(){
59 | var deviceorientation_event = new DeviceOrientationEvent('deviceorientation', {
60 | 'alpha': 43.1234 + random_fraction(),
61 | 'beta': 32.9876 + random_fraction(),
62 | 'gamma': 21.6543 + random_fraction(),
63 | 'bubbles': true,
64 | 'cancelable': true
65 | });
66 | window.dispatchEvent(deviceorientation_event)
67 | }
68 |
69 | function trigger_devicemotion_event(){
70 | var devicemotion_event = new DeviceMotionEvent('devicemotion', {
71 | 'acceleration':{
72 | 'x':0.1256 + random_fraction(),
73 | 'y':-0.1234 + random_fraction(),
74 | 'z':-0.1845 + random_fraction()
75 | },
76 | 'accelerationIncludingGravity':{
77 | 'x':0.0256 + random_fraction(),
78 | 'y':0.1234 + random_fraction(),
79 | 'z':9.7568 + random_fraction()
80 | },
81 | 'rotationRate':{
82 | 'alpha':0.0005 + random_fraction(),
83 | 'beta':0.0034 + random_fraction(),
84 | 'gamma':-0.0048 + random_fraction()
85 | },
86 | 'interval': 16.6660 + random_fraction(),
87 | 'bubbles': true,
88 | 'cancelable': true
89 | });
90 | window.dispatchEvent(devicemotion_event)
91 | }
92 |
93 | function random_fraction(leading_zeroes){
94 | var leading_zeroes = leading_zeroes || 5;
95 | return Math.random() / Math.pow(10, leading_zeroes);
96 | }
97 |
98 | console.log("Fake sensor events will be dispatched!");
99 |
100 | } + "());";
101 | }
102 |
103 |
104 | function insertScript(text) {
105 | var parent = document.documentElement,
106 | script = document.createElement('script');
107 | script.text = text;
108 | script.async = false;
109 |
110 | parent.insertBefore(script, parent.firstChild);
111 | parent.removeChild(script);
112 | }
113 | insertScript(getPageScript());
114 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/doc/main.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Extension/firefox/doc/main.md
--------------------------------------------------------------------------------
/automation/Extension/firefox/index.js:
--------------------------------------------------------------------------------
1 | const fileIO = require("sdk/io/file");
2 | const system = require("sdk/system");
3 | const pageMod = require("sdk/page-mod");
4 | const data = require("sdk/self").data;
5 | var loggingDB = require("./lib/loggingdb.js");
6 | var pageManager = require("./lib/page-manager.js");
7 | var cookieInstrument = require("./lib/cookie-instrument.js");
8 | var jsInstrument = require("./lib/javascript-instrument.js");
9 | var cpInstrument = require("./lib/content-policy-instrument.js");
10 | var httpInstrument = require("./lib/http-instrument.js");
11 |
12 |
13 | exports.main = function(options, callbacks) {
14 |
15 | // Read the browser configuration from file
16 | var path = system.pathFor("ProfD") + '/browser_params.json';
17 | if (fileIO.exists(path)) {
18 | var config = JSON.parse(fileIO.read(path, 'r'));
19 | console.log("Browser Config:", config);
20 | } else {
21 | console.log("WARNING: config not found. Assuming this is a test run of",
22 | "the extension. Outputting all queries to console.");
23 | var config = {
24 | sqlite_address:null,
25 | leveldb_address:null,
26 | logger_address:null,
27 | disable_webdriver_self_id:true,
28 | cookie_instrument:true,
29 | js_instrument:true,
30 | cp_instrument:true,
31 | http_instrument:true,
32 | save_javascript:true,
33 | testing:true,
34 | crawl_id:''
35 | };
36 | }
37 |
38 | loggingDB.open(config['sqlite_address'],
39 | config['leveldb_address'],
40 | config['logger_address'],
41 | config['crawl_id']);
42 |
43 | // Prevent the webdriver from identifying itself in the DOM. See #91
44 | if (config['disable_webdriver_self_id']) {
45 | loggingDB.logDebug("Disabling webdriver self identification");
46 | pageMod.PageMod({
47 | include: "*",
48 | contentScriptWhen: "start",
49 | contentScriptFile: data.url("remove_webdriver_attributes.js")
50 | });
51 | }
52 | // Trigger artificial sensor events
53 | if (config['trigger_sensor_events']) {
54 | console.log("Enabling fake sensor events");
55 | pageMod.PageMod({
56 | include: "*",
57 | contentScriptWhen: "ready",
58 | contentScriptFile: data.url("trigger_sensor_events.js")
59 | });
60 | }
61 | if (config['cookie_instrument']) {
62 | loggingDB.logDebug("Cookie instrumentation enabled");
63 | cookieInstrument.run(config['crawl_id']);
64 | }
65 | if (config['js_instrument']) {
66 | loggingDB.logDebug("Javascript instrumentation enabled");
67 | jsInstrument.run(config['crawl_id'], config['testing']);
68 | }
69 | if (config['cp_instrument']) {
70 | loggingDB.logDebug("Content Policy instrumentation enabled");
71 | cpInstrument.run(config['crawl_id']);
72 | }
73 | if (config['http_instrument']) {
74 | loggingDB.logDebug("HTTP Instrumentation enabled");
75 | httpInstrument.run(config['crawl_id'], config['save_javascript']);
76 | }
77 | };
78 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/lib/content-policy-instrument.js:
--------------------------------------------------------------------------------
1 | const {Cc, Ci, components} = require("chrome");
2 | const data = require("sdk/self").data;
3 | var { Class } = require('sdk/core/heritage');
4 | var { xpcom, Unknown, Service } = require('sdk/platform/xpcom');
5 | var uuid = require('sdk/util/uuid').uuid();
6 | var loggingDB = require("./loggingdb.js");
7 | var pageManager = require("./page-manager.js");
8 |
9 | exports.run = function(crawlID) {
10 |
11 | // Set up logging
12 | var createContentPolicyTable = data.load("create_content_policy_table.sql");
13 | loggingDB.executeSQL(createContentPolicyTable, false);
14 |
15 | // Instrument content policy API
16 | // Provides additional information about what caused a request and what it's for
17 | var InstrumentContentPolicy = Class({
18 | extends: Unknown,
19 | interfaces: [ "nsIContentPolicy" ],
20 |
21 | shouldLoad: function(contentType, contentLocation, requestOrigin, context, mimeTypeGuess, extra) {
22 | var update = { };
23 | update["crawl_id"] = crawlID;
24 | update["content_type"] = contentType;
25 | update["content_location"] = loggingDB.escapeString(contentLocation.spec);
26 | update["request_origin"] = loggingDB.escapeString(requestOrigin ? requestOrigin.spec : "");
27 | update["page_id"] = -1;
28 | if(context) {
29 | var domNode = null;
30 | var domWindow = null;
31 | try { domNode = context.QueryInterface(Ci.nsIDOMNode); }
32 | catch(error) { }
33 | try { domWindow = context.QueryInterface(Ci.nsIDOMWindow); }
34 | catch(error) { }
35 | var window = null;
36 | if(domNode && domNode.ownerDocument && domNode.ownerDocument.defaultView)
37 | window = domNode.ownerDocument.defaultView;
38 | //document = domNode.ownerDocument;
39 | if(domWindow)
40 | window = domWindow;
41 | if(window) {
42 | update["page_id"] = pageManager.pageIDFromWindow(window);
43 | }
44 | }
45 | update["mime_type_guess"] = loggingDB.escapeString(mimeTypeGuess ? mimeTypeGuess : "");
46 |
47 | loggingDB.executeSQL(loggingDB.createInsert("content_policy", update), true);
48 |
49 | return Ci.nsIContentPolicy.ACCEPT;
50 | },
51 |
52 | // Fires infrequently, instrumentation unused
53 | shouldProcess: function(contentType, contentLocation, requestOrigin, context, mimeType, extra) {
54 | return Ci.nsIContentPolicy.ACCEPT;
55 | }
56 | });
57 |
58 | var contractID = "@stanford.edu/instrument-content-policy;1";
59 |
60 | var instrumentContentPolicyService = Service({
61 | contract: contractID,
62 | Component: InstrumentContentPolicy
63 | });
64 |
65 | var categoryManager = Cc["@mozilla.org/categorymanager;1"].getService(Ci.nsICategoryManager);
66 | categoryManager.addCategoryEntry("content-policy", contractID, contractID, false, false);
67 |
68 | };
69 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/lib/cookie-instrument.js:
--------------------------------------------------------------------------------
1 | const {Cc, Ci} = require("chrome");
2 | var events = require("sdk/system/events");
3 | const data = require("sdk/self").data;
4 | var loggingDB = require("./loggingdb.js");
5 |
6 | exports.run = function(crawlID) {
7 |
8 | // Set up logging
9 | var createCookiesTable = data.load("create_cookies_table.sql");
10 | loggingDB.executeSQL(createCookiesTable, false);
11 |
12 | // Instrument cookie changes
13 | events.on("cookie-changed", function(event) {
14 | var data = event.data;
15 | // TODO: Support other cookie operations
16 | if(data == "deleted" || data == "added" || data == "changed") {
17 | var update = {};
18 | update["change"] = loggingDB.escapeString(data);
19 | update["crawl_id"] = crawlID;
20 |
21 | var cookie = event.subject.QueryInterface(Ci.nsICookie2);
22 |
23 | // Creation time (in microseconds)
24 | var creationTime = new Date(cookie.creationTime / 1000); // requires milliseconds
25 | update["creationTime"] = creationTime.toLocaleFormat('%Y-%m-%d %H:%M:%S');
26 |
27 | // Expiry time (in seconds)
28 | // May return ~Max(int64). I believe this is a session
29 | // cookie which doesn't expire. Sessions cookies with
30 | // non-max expiry time expire after session or at expiry.
31 | var expiryTime = cookie.expiry; // returns seconds
32 | if (expiryTime == 9223372036854776000) {
33 | var expiryTimeString = '9999-12-31 23:59:59';
34 | } else {
35 | var expiryTimeDate = new Date(expiryTime * 1000) // requires milliseconds
36 | var expiryTimeString = expiryTimeDate.toLocaleFormat('%Y-%m-%d %H:%M:%S');
37 | }
38 | update["expiry"] = expiryTimeString;
39 | update["is_http_only"] = loggingDB.boolToInt(cookie.isHttpOnly);
40 | update["is_session"] = loggingDB.boolToInt(cookie.isSession);
41 |
42 | // Accessed time (in microseconds)
43 | var lastAccessedTime = new Date(cookie.lastAccessed / 1000); // requires milliseconds
44 | update["last_accessed"] = lastAccessedTime.toLocaleFormat('%Y-%m-%d %H:%M:%S');
45 | update["raw_host"] = loggingDB.escapeString(cookie.rawHost);
46 |
47 | cookie = cookie.QueryInterface(Ci.nsICookie);
48 | update["expires"] = cookie.expires;
49 | update["host"] = loggingDB.escapeString(cookie.host);
50 | update["is_domain"] = loggingDB.boolToInt(cookie.isDomain);
51 | update["is_secure"] = loggingDB.boolToInt(cookie.isSecure);
52 | update["name"] = loggingDB.escapeString(cookie.name);
53 | update["path"] = loggingDB.escapeString(cookie.path);
54 | update["policy"] = cookie.policy;
55 | update["status"] = cookie.status;
56 | update["value"] = loggingDB.escapeString(cookie.value);
57 |
58 | loggingDB.executeSQL(loggingDB.createInsert("javascript_cookies", update), true);
59 | }
60 | }, true);
61 |
62 | };
63 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/lib/javascript-instrument.js:
--------------------------------------------------------------------------------
1 | var pageMod = require("sdk/page-mod");
2 | const data = require("sdk/self").data;
3 | var loggingDB = require("./loggingdb.js");
4 | var pageManager = require("./page-manager.js");
5 |
6 | exports.run = function(crawlID, testing) {
7 |
8 | // Set up tables
9 | var createJavascriptTable = data.load("create_javascript_table.sql");
10 | loggingDB.executeSQL(createJavascriptTable, false);
11 |
12 | // Inject content script to instrument JavaScript API
13 | pageMod.PageMod({
14 | include: "*",
15 | contentScriptWhen: "start",
16 | contentScriptFile: data.url("./content.js"),
17 | contentScriptOptions: {
18 | 'testing': testing
19 | },
20 | onAttach: function onAttach(worker) {
21 | var url = worker.url;
22 | function processCallsAndValues(data) {
23 | var update = {};
24 | update["crawl_id"] = crawlID;
25 | update["script_url"] = loggingDB.escapeString(data.scriptUrl);
26 | update["script_line"] = loggingDB.escapeString(data.scriptLine);
27 | update["script_col"] = loggingDB.escapeString(data.scriptCol);
28 | update["func_name"] = loggingDB.escapeString(data.funcName);
29 | update["script_loc_eval"] = loggingDB.escapeString(data.scriptLocEval);
30 | update["call_stack"] = loggingDB.escapeString(data.callStack);
31 | update["symbol"] = loggingDB.escapeString(data.symbol);
32 | update["operation"] = loggingDB.escapeString(data.operation);
33 | update["value"] = loggingDB.escapeString(data.value);
34 | update["time_stamp"] = data.timeStamp;
35 |
36 | // Create a json object for function arguments
37 | // We create an object that maps array positon to argument
38 | // e.g. someFunc('a',123,'b') --> {0: a, 1: 123, 2: 'b'}
39 | // to make it easier to query the data, using something like the
40 | // sqlite3 json1 extension.
41 | var args = {};
42 | if (data.operation == 'call' && data.args.length > 0) {
43 | for(var i = 0; i < data.args.length; i++) {
44 | args[i] = data.args[i]
45 | }
46 | update["arguments"] = loggingDB.escapeString(JSON.stringify(args));
47 | }
48 |
49 | loggingDB.executeSQL(loggingDB.createInsert("javascript", update), true);
50 | }
51 | worker.port.on("logCall", function(data){processCallsAndValues(data)});
52 | worker.port.on("logValue", function(data){processCallsAndValues(data)});
53 | }
54 | });
55 | };
56 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/lib/page-manager.js:
--------------------------------------------------------------------------------
1 | const {Cc, Ci} = require("chrome");
2 | const data = require("sdk/self").data;
3 | var loggingDB = require("./loggingdb.js");
4 | var events = require("sdk/system/events");
5 |
6 | var crawlID = null;
7 |
8 | exports.setup = function(crawl_ID) {
9 | crawlID = crawl_ID;
10 |
11 | // Set up logging
12 | var createPagesTable = data.load("create_pages_table.sql");
13 | loggingDB.executeSQL(createPagesTable, false);
14 |
15 | // Log new windows
16 | events.on("content-document-global-created", function(event) {
17 | var window = event.subject;
18 | var pageID = pageIDFromWindow(window);
19 | var parentID = window.parent ? pageIDFromWindow(window.parent) : -1;
20 | var location = window.document && window.document.location ? window.document.location : "";
21 | insertPage(pageID, location, parentID);
22 | }, true);
23 |
24 | };
25 |
26 | var insertPage = function(pageID, location, parentID) {
27 | var update = { };
28 | update["crawl_id"] = crawlID;
29 | update["id"] = pageID;
30 | update["location"] = loggingDB.escapeString(location);
31 | update["parent_id"] = parentID;
32 | loggingDB.executeSQL(loggingDB.createInsert("pages", update), true);
33 | };
34 | exports.insertPage = insertPage;
35 |
36 | var pageIDFromWindow = function (window) {
37 | try {
38 | return window.QueryInterface(Ci.nsIInterfaceRequestor).getInterface(Ci.nsIDOMWindowUtils).currentInnerWindowID;
39 | }
40 | catch(error) {
41 | }
42 | return -1;
43 | };
44 | exports.pageIDFromWindow = pageIDFromWindow;
45 |
46 | exports.pageIDFromHttpChannel = function(httpChannel) {
47 | try {
48 | var notificationCallbacks = null;
49 | if(httpChannel.notificationCallbacks)
50 | notificationCallbacks = httpChannel.notificationCallbacks;
51 | else if(httpChannel.loadGroup)
52 | notificationCallbacks = httpChannel.loadGroup.notificationCallbacks;
53 | if(notificationCallbacks) {
54 | var loadContext = notificationCallbacks.getInterface(Ci.nsILoadContext)
55 | var window = loadContext.associatedWindow;
56 | return pageIDFromWindow(window);
57 | }
58 | }
59 | catch(error) {
60 | //console.log("Error getting page ID: " + httpChannel.URI.spec);
61 | }
62 | return -1;
63 | };
64 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/lib/socket.js:
--------------------------------------------------------------------------------
1 | const {Cc, Ci} = require("chrome");
2 |
3 | var bufferpack = require("bufferpack/bufferpack");
4 |
5 | var tm = Cc["@mozilla.org/thread-manager;1"].getService();
6 | var socketService = Cc["@mozilla.org/network/socket-transport-service;1"]
7 | .getService(Ci.nsISocketTransportService);
8 |
9 | class ListeningSocket {
10 | // Socket which feeds incomming messages to a queue
11 | constructor() {
12 |
13 | console.log("Initializing a listening sever socket...");
14 | this._serverSocket = Cc["@mozilla.org/network/server-socket;1"]
15 | .createInstance(Ci.nsIServerSocket);
16 | this._inputStream = null;
17 | this._serverSocket.init(-1, true, -1); // init with random port
18 |
19 | this.port = this._serverSocket.port;
20 | this.queue = []; // stores messages sent to socket
21 | console.log("...serverSocket listening on port:",this.port);
22 |
23 | }
24 |
25 | startListening() {
26 | var thisSocket = this; // self reference for closure
27 | this._serverSocket.asyncListen({
28 | onSocketAccepted: function(sock, transport) {
29 | thisSocket._inputStream = transport.openInputStream(0, 0, 0);
30 | thisSocket._inputStream.asyncWait({
31 | onInputStreamReady: function() {
32 | thisSocket._updateQueue();
33 | }
34 | }, 0, 0, tm.mainThread);
35 | }
36 | });
37 | }
38 |
39 | _updateQueue() {
40 | var bInputStream = Cc["@mozilla.org/binaryinputstream;1"]
41 | .createInstance(Ci.nsIBinaryInputStream);
42 | bInputStream.setInputStream(this._inputStream);
43 |
44 | var buff = bInputStream.readByteArray(5);
45 | var meta = bufferpack.unpack('>Lc', buff);
46 | var string = bInputStream.readBytes(meta[0]);
47 | if (meta[1] != 'n' && meta[1] == 'j') {
48 | string = JSON.parse(string);
49 | } else if (meta[1] != 'n') {
50 | console.error("Unsupported serialization type (",meta[1],").");
51 | return;
52 | }
53 | this.queue.push(string);
54 |
55 | var thisSocket = this; // self reference for closure
56 | this._inputStream.asyncWait({
57 | onInputStreamReady: function(){
58 | thisSocket._updateQueue();
59 | }
60 | }, 0, 0, tm.mainThread);
61 | }
62 | }
63 | exports.ListeningSocket = ListeningSocket;
64 |
65 | class SendingSocket {
66 | // Socket which encodes messages and sets to specified (host, port)
67 | constructor() {
68 | this._stream = null;
69 | this._bOutputStream = Cc["@mozilla.org/binaryoutputstream;1"]
70 | .createInstance(Ci.nsIBinaryOutputStream);
71 | }
72 |
73 | connect(host, port) {
74 | // Open socket connection to remote host
75 | try {
76 | var transport = socketService.createTransport(null, 0, host, port, null);
77 | this._stream = transport.openOutputStream(1, 4096, 1048575);
78 | this._bOutputStream.setOutputStream(this._stream)
79 | return true;
80 | } catch (err) {
81 | console.error(err,err.message);
82 | return false;
83 | }
84 | }
85 |
86 | send(query) {
87 | // Format: [sql_query, [arg1, arg2, arg3]]
88 | // e.g. ["INSERT INTO table (item1, item2) VALUES (?,?)", [val1, val2]]
89 | try {
90 | var msg = JSON.stringify(query);
91 | var buff = bufferpack.pack('>Lc',[msg.length,'j']);
92 | this._bOutputStream.writeByteArray(buff, buff.length);
93 | this._stream.write(msg, msg.length);
94 | return true;
95 | } catch (err) {
96 | console.error(err,err.message);
97 | return false;
98 | }
99 | }
100 |
101 | close() {
102 | this._stream.close();
103 | }
104 | }
105 | exports.SendingSocket = SendingSocket;
106 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/node_modules/bufferpack/.npmignore:
--------------------------------------------------------------------------------
1 |
2 | .gitignore
3 | *.md
4 | test/
--------------------------------------------------------------------------------
/automation/Extension/firefox/node_modules/bufferpack/CHANGELOG:
--------------------------------------------------------------------------------
1 |
2 |
3 | - 0.0.6 (1/25/12)
4 |
5 | Fixed issue with unpacking empty null term string
6 |
7 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/node_modules/bufferpack/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2008, Fair Oaks Labs, Inc.
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification, are
5 | permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this list
8 | of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice, this
11 | list of conditions and the following disclaimer in the documentation and/or other
12 | materials provided with the distribution.
13 |
14 | * Neither the name of Fair Oaks Labs, Inc. nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without specific
16 | prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
19 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
20 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
21 | THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
25 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
26 | THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/node_modules/bufferpack/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "bufferpack",
3 | "description": "Module to pack/unpack primitives and c strings into/out of a Node.js buffer",
4 | "version": "0.0.6",
5 | "keywords": [
6 | "jspack",
7 | "buffer",
8 | "octet",
9 | "primitive",
10 | "string"
11 | ],
12 | "homepage": "https://github.com/ryanrolds/bufferpack",
13 | "repository": {
14 | "type": "git",
15 | "url": "git://github.com/ryanrolds/bufferpack.git"
16 | },
17 | "main": "./bufferpack.js",
18 | "author": {
19 | "name": "Ryan Olds",
20 | "email": "ryanrolds@gmail.com"
21 | },
22 | "maintainers": [
23 | {
24 | "name": "Peter Griess",
25 | "url": "https://github.com/pgriess"
26 | },
27 | {
28 | "name": "Peter Magnusson",
29 | "email": "peter@birchroad.net",
30 | "url": "http://github.com/birchroad/node-jspack"
31 | },
32 | {
33 | "name": "Ryan Olds",
34 | "email": "ryanrolds@gmail.com",
35 | "url": "https://github.com/ryanrolds"
36 | }
37 | ],
38 | "devDependencies": {
39 | "mocha": "= 0.10.2",
40 | "should": "= 0.5.1"
41 | },
42 | "scripts": {
43 | "test": "./node_modules/.bin/mocha test/*.test.js --reporter spec"
44 | },
45 | "bugs": {
46 | "url": "https://github.com/ryanrolds/bufferpack/issues"
47 | },
48 | "readme": "ERROR: No README data found!",
49 | "_id": "bufferpack@0.0.6",
50 | "dist": {
51 | "shasum": "a7bf3619848f1f74e33bd9cb4b5909dc93ba0b30"
52 | },
53 | "_from": "bufferpack@",
54 | "_resolved": "https://registry.npmjs.org/bufferpack/-/bufferpack-0.0.6.tgz"
55 | }
56 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "openwpm",
3 | "title": "openwpm",
4 | "description": "Extension with socket interface into OpenWPM",
5 | "author": "Steven Englehardt",
6 | "license": "GPL v3",
7 | "version": "0.0.1",
8 | "dependencies": {
9 | "bufferpack": "0.0.6"
10 | },
11 | "permissions": {
12 | "unsafe-content-script": true
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/automation/Extension/firefox/test/test-main.js:
--------------------------------------------------------------------------------
1 | var main = require("./main");
2 |
3 | exports["test main"] = function(assert) {
4 | assert.pass("Unit test running!");
5 | };
6 |
7 | exports["test main async"] = function(assert, done) {
8 | assert.pass("async Unit test running!");
9 | done();
10 | };
11 |
12 | require("sdk/test").run(exports);
13 |
--------------------------------------------------------------------------------
/automation/MPLogger.py:
--------------------------------------------------------------------------------
1 | """ Support for logging with the multiprocessing module """
2 | from SocketInterface import serversocket
3 |
4 | from Queue import Empty as EmptyQueue
5 | import logging.handlers
6 | import logging
7 | import struct
8 | import json
9 | import time
10 | import sys
11 | import os
12 |
13 | class ClientSocketHandler(logging.handlers.SocketHandler):
14 | """
15 | Make SocketHandler compatible with SocketInterface.py
16 | """
17 | def makePickle(self, record):
18 | """
19 | Serializes the record via json and prepends a length/serialization
20 | flag. Returns it ready for transmission across the socket.
21 | """
22 | ei = record.exc_info
23 | if ei:
24 | # just to get traceback text into record.exc_text ...
25 | dummy = self.format(record) # noqa
26 | record.exc_info = None # to avoid Unpickleable error
27 | d = dict(record.__dict__)
28 | d['msg'] = record.getMessage()
29 | d['args'] = None
30 | s = json.dumps(d)
31 | if ei:
32 | record.exc_info = ei # for next handler
33 | return struct.pack('>Ic', len(s), 'j') + s
34 |
35 | def loggingclient(logger_address, logger_port, level=logging.DEBUG):
36 | """ Establishes a logger that sends log records to loggingserver """
37 | logger = logging.getLogger(__name__)
38 | logger.setLevel(level)
39 |
40 | # Logger object shared, so we only want to connect handlers once
41 | if not len(logger.handlers):
42 |
43 | # Set up the SocketHandler - formatted server-side
44 | socketHandler = ClientSocketHandler(logger_address, logger_port)
45 | socketHandler.setLevel(level)
46 | logger.addHandler(socketHandler)
47 |
48 | # Set up logging to console
49 | consoleHandler = logging.StreamHandler(sys.stdout)
50 | consoleHandler.setLevel(logging.INFO)
51 | formatter = logging.Formatter('%(module)-20s - %(levelname)-8s - %(message)s')
52 | consoleHandler.setFormatter(formatter)
53 | logger.addHandler(consoleHandler)
54 |
55 | return logger
56 |
57 | def loggingserver(log_file, status_queue):
58 | """
59 | A logging server to serialize writes to the log file from multiple
60 | processes.
61 |
62 | location of the log file on disk
63 | is a queue connect to the TaskManager used for communication
64 | """
65 | # Configure the log file
66 | logging.basicConfig(filename=os.path.expanduser(log_file),
67 | format= '%(asctime)s - %(processName)-11s[%(threadName)-10s]' +
68 | ' - %(module)-20s - %(levelname)-8s: %(message)s',
69 | level=logging.INFO)
70 |
71 | # Sets up the serversocket to start accepting connections
72 | sock = serversocket()
73 | status_queue.put(sock.sock.getsockname()) # let TM know location
74 | sock.start_accepting()
75 |
76 | while True:
77 | # Check for KILL command from TaskManager
78 | if not status_queue.empty():
79 | status_queue.get()
80 | sock.close()
81 | _drain_queue(sock.queue)
82 | break
83 |
84 | # Process logs
85 | try:
86 | obj = sock.queue.get(True, 10)
87 | _handleLogRecord(obj)
88 | except EmptyQueue:
89 | pass
90 |
91 | def _handleLogRecord(obj):
92 | """ Handle log, logs everything sent. Should filter client-side """
93 |
94 | # Log message came from browser extension: requires special handling
95 | if len(obj) == 2 and obj[0] == 'EXT':
96 | obj = json.loads(obj[1])
97 | record = logging.LogRecord(name=__name__,
98 | level=obj['level'],
99 | pathname=obj['pathname'],
100 | lineno=obj['lineno'],
101 | msg=obj['msg'],
102 | args=obj['args'],
103 | exc_info=obj['exc_info'],
104 | func=obj['func'])
105 | else:
106 | record = logging.makeLogRecord(obj)
107 | logger = logging.getLogger(record.name)
108 | logger.handle(record)
109 |
110 | def _drain_queue(sock_queue):
111 | """ Ensures queue is empty before closing """
112 | time.sleep(3) # TODO: the socket needs a better way of closing
113 | while not sock_queue.empty():
114 | obj = sock_queue.get()
115 | _handleLogRecord(obj)
116 |
117 | if __name__ == '__main__':
118 | # Some tests
119 | import logging, logging.handlers
120 | import multiprocess as mp
121 |
122 | # Set up loggingserver
123 | log_file = '~/mplogger.log'
124 | status_queue = mp.Queue()
125 | loggingserver = mp.Process(target=loggingserver, args=(log_file, status_queue))
126 | loggingserver.daemon = True
127 | loggingserver.start()
128 | server_address = status_queue.get()
129 |
130 | # Connect main process to logging server
131 | rootLogger = logging.getLogger('')
132 | rootLogger.setLevel(logging.DEBUG)
133 | socketHandler = ClientSocketHandler(*server_address)
134 | rootLogger.addHandler(socketHandler)
135 |
136 | # Send some sample logs
137 | logging.info('Test1')
138 | logging.error('Test2')
139 | logging.critical('Test3')
140 | logging.debug('Test4')
141 | logging.warning('Test5')
142 |
143 | logger1 = logging.getLogger('test1')
144 | logger2 = logging.getLogger('test2')
145 | logger1.info('asdfasdfsa')
146 | logger2.info('1234567890')
147 |
148 | # Close the logging server
149 | status_queue.put('DIE')
150 | loggingserver.join()
151 | print "Server closed, exiting..."
152 |
--------------------------------------------------------------------------------
/automation/Proxy/MITMProxy.py:
--------------------------------------------------------------------------------
1 | from ..SocketInterface import clientsocket
2 | from ..MPLogger import loggingclient
3 | import mitm_commands
4 |
5 | from libmproxy import controller
6 | import Queue
7 | import sys
8 | import traceback
9 |
10 |
11 | class InterceptingMaster (controller.Master):
12 | """
13 | Customized MITMProxy
14 | Extends the proxy controller to add some additional
15 | functionality for handling /logging requests and responses
16 |
17 | Inspired by the following example. Note the gist has a lot of bugs.
18 | https://gist.github.com/dannvix/5285924
19 | """
20 |
21 | def __init__(self, server, visit_id_queue, browser_params, manager_params, status_queue):
22 | self.browser_params = browser_params
23 | self.manager_params = manager_params
24 |
25 | # Attributes used to flag the first-party domain
26 | self.visit_id_queue = visit_id_queue # first-party domain provided by BrowserManager
27 | self.prev_visit_id, self.curr_visit_id = None, None # previous and current top level domains
28 | self.prev_requests, self.curr_requests = set(), set() # set of requests for previous and current site
29 |
30 | # Open a socket to communicate with DataAggregator
31 | self.db_socket = clientsocket(serialization='dill')
32 | self.db_socket.connect(*manager_params['aggregator_address'])
33 |
34 | # Open a socket to communicate with LevelDBAggregator
35 | self.ldb_socket = None
36 | if browser_params['save_javascript_proxy']:
37 | self.ldb_socket = clientsocket(serialization='dill')
38 | self.ldb_socket.connect(*manager_params['ldb_address'])
39 |
40 | # Open a socket to communicate with MPLogger
41 | self.logger = loggingclient(*manager_params['logger_address'])
42 |
43 | # Store status_queue for communication back to TaskManager
44 | self.status_queue = status_queue
45 |
46 | controller.Master.__init__(self, server)
47 |
48 | def load_process_message(self, q, timeout):
49 | """ Tries to read and process a message from the proxy queue, returns True iff this succeeds """
50 | try:
51 | msg = q.get(timeout=timeout)
52 | controller.Master.handle(self, *msg)
53 | return True
54 | except Queue.Empty:
55 | return False
56 |
57 | def tick(self, q, timeout=0.01):
58 | """ new tick function used to label first-party domains and avoid race conditions when doing so """
59 | if self.curr_visit_id is None: # proxy is fresh, need to get first-party domain right away
60 | self.curr_visit_id = self.visit_id_queue.get()
61 | elif not self.visit_id_queue.empty(): # new FP has been visited
62 | # drains the queue to get rid of stale messages from previous site
63 | while self.load_process_message(q, timeout):
64 | pass
65 |
66 | self.prev_requests, self.curr_requests = self.curr_requests, set()
67 | self.prev_visit_id, self.curr_visit_id = self.curr_visit_id, self.visit_id_queue.get()
68 |
69 | self.load_process_message(q, timeout)
70 |
71 | def run(self):
72 | """ Light wrapper around run with error printing """
73 | try:
74 | controller.Master.run(self)
75 | except KeyboardInterrupt:
76 | print 'KeyboardInterrupt received. Shutting down'
77 | self.shutdown()
78 | sys.exit(0)
79 | except Exception:
80 | excp = traceback.format_exception(*sys.exc_info())
81 | self.logger.critical('BROWSER %i: Exception. Shutting down proxy!\n%s' % (self.browser_params['crawl_id'], excp))
82 | self.status_queue.put(('FAILED', None))
83 | self.shutdown()
84 | raise
85 |
86 | def handle_request(self, msg):
87 | """ Receives HTTP request, and sends it to logging function """
88 | msg.reply()
89 | self.curr_requests.add(msg.request)
90 | mitm_commands.process_general_mitm_request(self.db_socket,
91 | self.browser_params,
92 | self.curr_visit_id,
93 | msg)
94 |
95 | # Record data from HTTP responses
96 | def handle_response(self, msg):
97 | """ Receives HTTP response, and sends it to logging function """
98 | msg.reply()
99 |
100 | # attempts to get the top url visit id, based on the request object
101 | if msg.request in self.prev_requests:
102 | visit_id = self.prev_visit_id
103 | self.prev_requests.remove(msg.request)
104 | elif msg.request in self.curr_requests:
105 | visit_id = self.curr_visit_id
106 | self.curr_requests.remove(msg.request)
107 | else: # ignore responses for which we cannot match the request
108 | return
109 | mitm_commands.process_general_mitm_response(self.db_socket,
110 | self.ldb_socket,
111 | self.logger,
112 | self.browser_params,
113 | visit_id, msg)
114 |
--------------------------------------------------------------------------------
/automation/Proxy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Proxy/__init__.py
--------------------------------------------------------------------------------
/automation/Proxy/cert/mitmproxy-ca-cert.cer:
--------------------------------------------------------------------------------
1 | -----BEGIN CERTIFICATE-----
2 | MIIDoTCCAomgAwIBAgIGDUd8Ol4xMA0GCSqGSIb3DQEBCwUAMCgxEjAQBgNVBAMM
3 | CW1pdG1wcm94eTESMBAGA1UECgwJbWl0bXByb3h5MB4XDTE2MDQwNTIyMjMyM1oX
4 | DTIxMDQwNjIyMjMyM1owKDESMBAGA1UEAwwJbWl0bXByb3h5MRIwEAYDVQQKDAlt
5 | aXRtcHJveHkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDpWVI/DZBn
6 | Zt4BHGBpsGAkLwVfJPf4p1mSVvSwtZp25ghCQi+GMqaAcjk/rYG9k/BqhKFN43NC
7 | ggr9/hhm73oy2JxVOMbMtEKOc+z0plajxtROQnItoSTIYilbx9VPXYmYXTb5o0v5
8 | npc0huIuKGRq5KVI1hFnj6GdXO/EWJioUpUC1c3zyz2vsbK3nE0Ze0lmWQ9Aur+6
9 | cvQrkvidLeF9irfV/PatIUYQ6KmVDPGS865xH8nh3aDbK7vGS4UnM8YwCXpPz4R+
10 | 0QCNCrzCdvT+TkuvFJNFS8o5+zT1mEi+2LBUc+AApXaoCFNc4tZ20kgvsXeIJiRo
11 | hOjYR90tzc89AgMBAAGjgdAwgc0wDwYDVR0TAQH/BAUwAwEB/zARBglghkgBhvhC
12 | AQEEBAMCAgQweAYDVR0lBHEwbwYIKwYBBQUHAwEGCCsGAQUFBwMCBggrBgEFBQcD
13 | BAYIKwYBBQUHAwgGCisGAQQBgjcCARUGCisGAQQBgjcCARYGCisGAQQBgjcKAwEG
14 | CisGAQQBgjcKAwMGCisGAQQBgjcKAwQGCWCGSAGG+EIEATAOBgNVHQ8BAf8EBAMC
15 | AQYwHQYDVR0OBBYEFKpUAZXAaEWlCENC0uzof2rZsfQfMA0GCSqGSIb3DQEBCwUA
16 | A4IBAQBSceM4F6o0mDlxdxyq0Kn8QAQSaSPR0Mc0cgbIlisZ/TArBdM4hP/io0pG
17 | 9O2/xSVfggVELsWFsA447V/0dRN/544wXjLv0D6O/hLvDrLdxeV/EGzwh98TSt9p
18 | jT/lw7TD+9r/RQg95RKorsX+IdnEd201/DNc/lc3SMV6RQaZMXFqwvc8RKgie7r9
19 | L0lLDfpPVQufOXGpUakgiQyju/qnnMQeZgw8qCubmdcwFVSQ9HkeSiRyvzQwYNT1
20 | FvxFP9p0pG9pdZLvzV1EzLtVFqH0X6la5dNYQUX9YSm1HyfSxgwPOprAstnB8xaI
21 | e1WOBDqrvIfVypJFB0IFMlmfs2Pk
22 | -----END CERTIFICATE-----
23 |
--------------------------------------------------------------------------------
/automation/Proxy/cert/mitmproxy-ca-cert.p12:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Proxy/cert/mitmproxy-ca-cert.p12
--------------------------------------------------------------------------------
/automation/Proxy/cert/mitmproxy-ca-cert.pem:
--------------------------------------------------------------------------------
1 | -----BEGIN CERTIFICATE-----
2 | MIIDoTCCAomgAwIBAgIGDUd8Ol4xMA0GCSqGSIb3DQEBCwUAMCgxEjAQBgNVBAMM
3 | CW1pdG1wcm94eTESMBAGA1UECgwJbWl0bXByb3h5MB4XDTE2MDQwNTIyMjMyM1oX
4 | DTIxMDQwNjIyMjMyM1owKDESMBAGA1UEAwwJbWl0bXByb3h5MRIwEAYDVQQKDAlt
5 | aXRtcHJveHkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDpWVI/DZBn
6 | Zt4BHGBpsGAkLwVfJPf4p1mSVvSwtZp25ghCQi+GMqaAcjk/rYG9k/BqhKFN43NC
7 | ggr9/hhm73oy2JxVOMbMtEKOc+z0plajxtROQnItoSTIYilbx9VPXYmYXTb5o0v5
8 | npc0huIuKGRq5KVI1hFnj6GdXO/EWJioUpUC1c3zyz2vsbK3nE0Ze0lmWQ9Aur+6
9 | cvQrkvidLeF9irfV/PatIUYQ6KmVDPGS865xH8nh3aDbK7vGS4UnM8YwCXpPz4R+
10 | 0QCNCrzCdvT+TkuvFJNFS8o5+zT1mEi+2LBUc+AApXaoCFNc4tZ20kgvsXeIJiRo
11 | hOjYR90tzc89AgMBAAGjgdAwgc0wDwYDVR0TAQH/BAUwAwEB/zARBglghkgBhvhC
12 | AQEEBAMCAgQweAYDVR0lBHEwbwYIKwYBBQUHAwEGCCsGAQUFBwMCBggrBgEFBQcD
13 | BAYIKwYBBQUHAwgGCisGAQQBgjcCARUGCisGAQQBgjcCARYGCisGAQQBgjcKAwEG
14 | CisGAQQBgjcKAwMGCisGAQQBgjcKAwQGCWCGSAGG+EIEATAOBgNVHQ8BAf8EBAMC
15 | AQYwHQYDVR0OBBYEFKpUAZXAaEWlCENC0uzof2rZsfQfMA0GCSqGSIb3DQEBCwUA
16 | A4IBAQBSceM4F6o0mDlxdxyq0Kn8QAQSaSPR0Mc0cgbIlisZ/TArBdM4hP/io0pG
17 | 9O2/xSVfggVELsWFsA447V/0dRN/544wXjLv0D6O/hLvDrLdxeV/EGzwh98TSt9p
18 | jT/lw7TD+9r/RQg95RKorsX+IdnEd201/DNc/lc3SMV6RQaZMXFqwvc8RKgie7r9
19 | L0lLDfpPVQufOXGpUakgiQyju/qnnMQeZgw8qCubmdcwFVSQ9HkeSiRyvzQwYNT1
20 | FvxFP9p0pG9pdZLvzV1EzLtVFqH0X6la5dNYQUX9YSm1HyfSxgwPOprAstnB8xaI
21 | e1WOBDqrvIfVypJFB0IFMlmfs2Pk
22 | -----END CERTIFICATE-----
23 |
--------------------------------------------------------------------------------
/automation/Proxy/cert/mitmproxy-ca.pem:
--------------------------------------------------------------------------------
1 | -----BEGIN PRIVATE KEY-----
2 | MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDpWVI/DZBnZt4B
3 | HGBpsGAkLwVfJPf4p1mSVvSwtZp25ghCQi+GMqaAcjk/rYG9k/BqhKFN43NCggr9
4 | /hhm73oy2JxVOMbMtEKOc+z0plajxtROQnItoSTIYilbx9VPXYmYXTb5o0v5npc0
5 | huIuKGRq5KVI1hFnj6GdXO/EWJioUpUC1c3zyz2vsbK3nE0Ze0lmWQ9Aur+6cvQr
6 | kvidLeF9irfV/PatIUYQ6KmVDPGS865xH8nh3aDbK7vGS4UnM8YwCXpPz4R+0QCN
7 | CrzCdvT+TkuvFJNFS8o5+zT1mEi+2LBUc+AApXaoCFNc4tZ20kgvsXeIJiRohOjY
8 | R90tzc89AgMBAAECggEBANHxFek6t+gACc7wC+Iq2GTVlTyj0yH41sKtz1pJGV7N
9 | 2TJbv8w6N8kqe01KOoco2c3L5CBieemlfGRGPwglf0jNz+FwwpObvqHRtKfsG9kf
10 | 4xVB/N4GtERA5Lr5kHXmTwwxGWwCOjGsOJf+nvIuTyJI8lgUmPYWqK4e1m1N1JLi
11 | eaxr3zRPK0s+wGXCUOAsd4cjNARpxEBJEkexTr7lWpGG5byIDrfVrQxDctP4h8Ji
12 | CbLCT8kpqjtBecJSj9uBDoDqthvJ+Py/PmKWK80Zeco+d+0qH40wTBUAvhLMP3VK
13 | 8assIuaOVZX0Pww4f2Frz9OrInwLdbZNOZJSuuml5RUCgYEA/Y0NgMPghRB4E4tg
14 | ZlrIQlRus3dHqWOdgbXFGVLQdvtsr/bIbG7lu+RcVPWTD1a9dTv/FW3GzSovTG1b
15 | FF2H/ic4bAnKrtPSR9xphDyM5O8jk/oVkMHUEGEZy7OdiOP4C9sIZz2WK0YUUwIt
16 | W2RCNqkZEeZtJviKdGWvHmEBK+cCgYEA65pQxYnZbymwySjB7VMnXofxCi1y283Y
17 | 4zuP3uqNPOhpA+Sdq0Mh9XvfkcCaDXdIFtgD9vvUpQOWmsG1ILRo+bvrdAYHzQ4j
18 | CO8ha8aYiN9tvn6kYMDVgXFacc787qSpL6AeR7ybYQU7fe1uHixi6wEKTin92/Ns
19 | e00CWzVTpzsCgYEAz/rp/puuCbhupqmHU65X4oDbpX7MW5gI1SNDH/icY1zt1JE7
20 | 6iY1cCBr1Iz0KnreQdIK9YrsrdJSpgB124i1SrblQ0ns5ed+789PBlecwxWeO33C
21 | PtGfoCfmPv+A048cIq1ygS01hx2fAlAg4HynC6s9kz9Ofc8V01Ctit/LVDUCgYAc
22 | 0h7JW2iW2aG/qdW3Q3HQdY698PtY+iBrA7FA0q5+YevexwumlKrFzeZ2fPobZZkS
23 | +k/Z2cqUeRDmU4Xlv0wMKLnP0qEHq5ALmr0a4wtryvEw2WsgTtaPZB9tRqXYR5pO
24 | siaiHedgAfTaHb5XwJRFLTZmg2qDio6dsrj0EVzvWwKBgGPKGjNN4orOvgTwRTNB
25 | +9drtwB3hxYmGXa7Tbzq62SNN6exJUvuW9sVeSssv058Nk1hTEQ7DJAXQOfkFWQf
26 | XX5VMyumF9xqIZ0c793m54VY7hd+SkwziwvVrtTuMiy9wjPHOxUd81gdW+OKARTO
27 | A2Z4tVG+hinAI7cZM4yR5Van
28 | -----END PRIVATE KEY-----
29 | -----BEGIN CERTIFICATE-----
30 | MIIDoTCCAomgAwIBAgIGDUd8Ol4xMA0GCSqGSIb3DQEBCwUAMCgxEjAQBgNVBAMM
31 | CW1pdG1wcm94eTESMBAGA1UECgwJbWl0bXByb3h5MB4XDTE2MDQwNTIyMjMyM1oX
32 | DTIxMDQwNjIyMjMyM1owKDESMBAGA1UEAwwJbWl0bXByb3h5MRIwEAYDVQQKDAlt
33 | aXRtcHJveHkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDpWVI/DZBn
34 | Zt4BHGBpsGAkLwVfJPf4p1mSVvSwtZp25ghCQi+GMqaAcjk/rYG9k/BqhKFN43NC
35 | ggr9/hhm73oy2JxVOMbMtEKOc+z0plajxtROQnItoSTIYilbx9VPXYmYXTb5o0v5
36 | npc0huIuKGRq5KVI1hFnj6GdXO/EWJioUpUC1c3zyz2vsbK3nE0Ze0lmWQ9Aur+6
37 | cvQrkvidLeF9irfV/PatIUYQ6KmVDPGS865xH8nh3aDbK7vGS4UnM8YwCXpPz4R+
38 | 0QCNCrzCdvT+TkuvFJNFS8o5+zT1mEi+2LBUc+AApXaoCFNc4tZ20kgvsXeIJiRo
39 | hOjYR90tzc89AgMBAAGjgdAwgc0wDwYDVR0TAQH/BAUwAwEB/zARBglghkgBhvhC
40 | AQEEBAMCAgQweAYDVR0lBHEwbwYIKwYBBQUHAwEGCCsGAQUFBwMCBggrBgEFBQcD
41 | BAYIKwYBBQUHAwgGCisGAQQBgjcCARUGCisGAQQBgjcCARYGCisGAQQBgjcKAwEG
42 | CisGAQQBgjcKAwMGCisGAQQBgjcKAwQGCWCGSAGG+EIEATAOBgNVHQ8BAf8EBAMC
43 | AQYwHQYDVR0OBBYEFKpUAZXAaEWlCENC0uzof2rZsfQfMA0GCSqGSIb3DQEBCwUA
44 | A4IBAQBSceM4F6o0mDlxdxyq0Kn8QAQSaSPR0Mc0cgbIlisZ/TArBdM4hP/io0pG
45 | 9O2/xSVfggVELsWFsA447V/0dRN/544wXjLv0D6O/hLvDrLdxeV/EGzwh98TSt9p
46 | jT/lw7TD+9r/RQg95RKorsX+IdnEd201/DNc/lc3SMV6RQaZMXFqwvc8RKgie7r9
47 | L0lLDfpPVQufOXGpUakgiQyju/qnnMQeZgw8qCubmdcwFVSQ9HkeSiRyvzQwYNT1
48 | FvxFP9p0pG9pdZLvzV1EzLtVFqH0X6la5dNYQUX9YSm1HyfSxgwPOprAstnB8xaI
49 | e1WOBDqrvIfVypJFB0IFMlmfs2Pk
50 | -----END CERTIFICATE-----
51 |
--------------------------------------------------------------------------------
/automation/Proxy/cert/mitmproxy-dhparam.pem:
--------------------------------------------------------------------------------
1 |
2 | -----BEGIN DH PARAMETERS-----
3 | MIICCAKCAgEAyT6LzpwVFS3gryIo29J5icvgxCnCebcdSe/NHMkD8dKJf8suFCg3
4 | O2+dguLakSVif/t6dhImxInJk230HmfC8q93hdcg/j8rLGJYDKu3ik6H//BAHKIv
5 | j5O9yjU3rXCfmVJQic2Nne39sg3CreAepEts2TvYHhVv3TEAzEqCtOuTjgDv0ntJ
6 | Gwpj+BJBRQGG9NvprX1YGJ7WOFBP/hWU7d6tgvE6Xa7T/u9QIKpYHMIkcN/l3ZFB
7 | chZEqVlyrcngtSXCROTPcDOQ6Q8QzhaBJS+Z6rcsd7X+haiQqvoFcmaJ08Ks6LQC
8 | ZIL2EtYJw8V8z7C0igVEBIADZBI6OTbuuhDwRw//zU1uq52Oc48CIZlGxTYG/Evq
9 | o9EWAXUYVzWkDSTeBH1r4z/qLPE2cnhtMxbFxuvK53jGB0emy2y1Ei6IhKshJ5qX
10 | IB/aE7SSHyQ3MDHHkCmQJCsOd4Mo26YX61NZ+n501XjqpCBQ2+DfZCBh8Va2wDyv
11 | A2Ryg9SUz8j0AXViRNMJgJrr446yro/FuJZwnQcO3WQnXeqSBnURqKjmqkeFP+d8
12 | 6mk2tqJaY507lRNqtGlLnj7f5RNoBFJDCLBNurVgfvq9TCVWKDIFD4vZRjCrnl6I
13 | rD693XKIHUCWOjMh1if6omGXKHH40QuME2gNa50+YPn1iYDl88uDbbMCAQI=
14 | -----END DH PARAMETERS-----
15 |
--------------------------------------------------------------------------------
/automation/Proxy/cert8.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Proxy/cert8.db
--------------------------------------------------------------------------------
/automation/Proxy/deploy_mitm_proxy.py:
--------------------------------------------------------------------------------
1 | from ..MPLogger import loggingclient
2 | import MITMProxy
3 |
4 | from libmproxy import proxy
5 | from libmproxy.proxy.server import ProxyServer
6 | import threading
7 | import socket
8 | import Queue
9 | import os
10 |
11 |
12 | def init_proxy(browser_params, manager_params, status_queue):
13 | """
14 | Uses mitmproxy used to log HTTP Requests and Responses
15 | configuration parameters of host browser
16 | configuration parameters of the TaskManager
17 | a Queue to report proxy status back to TaskManager
18 | """
19 | logger = loggingclient(*manager_params['logger_address'])
20 | proxy_site_queue = Queue.Queue() # queue for crawler to communicate with proxy
21 |
22 | # gets local port from one of the free ports
23 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
24 | sock.bind(('', 0))
25 | proxy_port = sock.getsockname()[1]
26 | sock.close()
27 |
28 | config = proxy.ProxyConfig(cadir=os.path.join(os.path.dirname(__file__), 'cert'),port=proxy_port)
29 | server = ProxyServer(config)
30 | logger.info('BROWSER %i: Intercepting Proxy listening on %i' % (browser_params['crawl_id'], proxy_port))
31 | m = MITMProxy.InterceptingMaster(server, proxy_site_queue, browser_params, manager_params, status_queue)
32 | thread = threading.Thread(target=m.run, args=())
33 | thread.daemon = True
34 | thread.start()
35 | return proxy_port, proxy_site_queue
36 |
--------------------------------------------------------------------------------
/automation/Proxy/key3.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/Proxy/key3.db
--------------------------------------------------------------------------------
/automation/Proxy/mitm_commands.py:
--------------------------------------------------------------------------------
1 | # This module parses MITM Proxy requests/responses into (command, data pairs)
2 | # This should mean that the MITMProxy code should simply pass the messages + its own data to this module
3 |
4 | from urlparse import urlparse
5 | import datetime
6 | import mmh3
7 | import json
8 | import zlib
9 |
10 | def encode_to_unicode(msg):
11 | """
12 | Tries different encodings before setting on utf8 ignoring any errors
13 | We can likely inspect the headers for an encoding as well, though it
14 | won't always be correct.
15 | """
16 | try:
17 | msg = unicode(msg, 'utf8')
18 | except UnicodeDecodeError:
19 | try:
20 | msg = unicode(msg, 'ISO-8859-1')
21 | except UnicodeDecodeError:
22 | msg = unicode(msg, 'utf8', 'ignore')
23 | return msg
24 |
25 |
26 | def process_general_mitm_request(db_socket, browser_params, visit_id, msg):
27 | """ Logs a HTTP request object """
28 | referrer = msg.request.headers['referer'][0] if len(msg.request.headers['referer']) > 0 else ''
29 |
30 | data = (browser_params['crawl_id'],
31 | encode_to_unicode(msg.request.url),
32 | msg.request.method,
33 | encode_to_unicode(referrer),
34 | json.dumps(msg.request.headers.get_state()),
35 | visit_id,
36 | str(datetime.datetime.now()))
37 |
38 | db_socket.send(("INSERT INTO http_requests_proxy (crawl_id, url, method, "
39 | "referrer, headers, visit_id, time_stamp) VALUES (?,?,?,?,?,?,?)", data))
40 |
41 |
42 | def process_general_mitm_response(db_socket, ldb_socket, logger, browser_params, visit_id, msg):
43 | """ Logs a HTTP response object and, if necessary, """
44 | referrer = msg.request.headers['referer'][0] if len(msg.request.headers['referer']) > 0 else ''
45 | location = msg.response.headers['location'][0] if len(msg.response.headers['location']) > 0 else ''
46 |
47 | content_hash = save_javascript_content(ldb_socket, logger, browser_params, msg)
48 |
49 | data = (browser_params['crawl_id'],
50 | encode_to_unicode(msg.request.url),
51 | encode_to_unicode(msg.request.method),
52 | encode_to_unicode(referrer),
53 | msg.response.code,
54 | msg.response.msg,
55 | json.dumps(msg.response.headers.get_state()),
56 | encode_to_unicode(location),
57 | visit_id,
58 | str(datetime.datetime.now()),
59 | content_hash)
60 |
61 | db_socket.send(("INSERT INTO http_responses_proxy (crawl_id, url, method, "
62 | "referrer, response_status, response_status_text, headers, "
63 | "location, visit_id, time_stamp, content_hash) "
64 | "VALUES (?,?,?,?,?,?,?,?,?,?,?)", data))
65 |
66 |
67 | def save_javascript_content(ldb_socket, logger, browser_params, msg):
68 | """ Save javascript files de-duplicated and compressed on disk """
69 | if not browser_params['save_javascript_proxy']:
70 | return
71 |
72 | # Check if this response is javascript content
73 | is_js = False
74 | if (len(msg.response.headers['Content-Type']) > 0 and
75 | 'javascript' in msg.response.headers['Content-Type'][0]):
76 | is_js = True
77 | if not is_js and urlparse(msg.request.url).path.split('.')[-1] == 'js':
78 | is_js = True
79 | if not is_js:
80 | return
81 |
82 | # Decompress any content with compression
83 | # We want files to hash to the same value
84 | # Firefox currently only accepts gzip/deflate
85 | script = ''
86 | content_encoding = msg.response.headers['Content-Encoding']
87 | if (len(content_encoding) == 0 or
88 | content_encoding[0].lower() == 'utf-8' or
89 | content_encoding[0].lower() == 'identity' or
90 | content_encoding[0].lower() == 'none' or
91 | content_encoding[0].lower() == 'ansi_x3.4-1968' or
92 | content_encoding[0].lower() == 'utf8' or
93 | content_encoding[0] == ''):
94 | script = msg.response.content
95 | elif 'gzip' in content_encoding[0].lower():
96 | try:
97 | script = zlib.decompress(msg.response.content, zlib.MAX_WBITS|16)
98 | except zlib.error as e:
99 | logger.error('BROWSER %i: Received zlib error when trying to decompress gzipped javascript: %s' % (browser_params['crawl_id'],str(e)))
100 | return
101 | elif 'deflate' in content_encoding[0].lower():
102 | try:
103 | script = zlib.decompress(msg.response.content, -zlib.MAX_WBITS)
104 | except zlib.error as e:
105 | logger.error('BROWSER %i: Received zlib error when trying to decompress deflated javascript: %s' % (browser_params['crawl_id'],str(e)))
106 | return
107 | else:
108 | logger.error('BROWSER %i: Received Content-Encoding %s. Not supported by Firefox, skipping archive.' % (browser_params['crawl_id'], str(content_encoding)))
109 | return
110 | script = encode_to_unicode(script)
111 |
112 | # Hash script for deduplication on disk
113 | hasher = mmh3.hash128
114 | script_hash = str(hasher(script.encode('utf-8')) >> 64)
115 |
116 | ldb_socket.send((script, script_hash))
117 |
118 | return script_hash
119 |
--------------------------------------------------------------------------------
/automation/SocketInterface.py:
--------------------------------------------------------------------------------
1 | import Queue
2 | import threading
3 | import traceback
4 | import socket
5 | import struct
6 | import json
7 | import dill
8 |
9 | #TODO - Implement a cleaner shutdown for server socket
10 | # see: https://stackoverflow.com/questions/1148062/python-socket-accept-blocks-prevents-app-from-quitting
11 |
12 | class serversocket:
13 | """
14 | A server socket to recieve and process string messages
15 | from client sockets to a central queue
16 | """
17 | def __init__(self, verbose=False):
18 | self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
19 | self.sock.bind(('localhost', 0))
20 | self.sock.listen(10) # queue a max of n connect requests
21 | self.verbose = verbose
22 | self.queue = Queue.Queue()
23 | if self.verbose:
24 | print "Server bound to: " + str(self.sock.getsockname())
25 |
26 | def start_accepting(self):
27 | """ Start the listener thread """
28 | thread = threading.Thread(target=self._accept, args=())
29 | thread.daemon = True # stops from blocking shutdown
30 | thread.start()
31 |
32 | def _accept(self):
33 | """ Listen for connections and pass handling to a new thread """
34 | while True:
35 | (client, address) = self.sock.accept()
36 | thread = threading.Thread(target=self._handle_conn, args=(client, address))
37 | thread.daemon = True
38 | thread.start()
39 |
40 | def _handle_conn(self, client, address):
41 | """
42 | Recieve messages and pass to queue. Messages are prefixed with
43 | a 4-byte integer to specify the message length and 1-byte character
44 | to indicate the type of serialization applied to the message.
45 |
46 | Supported serialization formats:
47 | 'n' : no serialization
48 | 'd' : dill pickle
49 | 'j' : json
50 | """
51 | if self.verbose:
52 | print "Thread: " + str(threading.current_thread()) + " connected to: " + str(address)
53 | try:
54 | while True:
55 | msg = self.receive_msg(client, 5)
56 | msglen, serialization = struct.unpack('>Lc', msg)
57 | if self.verbose:
58 | print "Msglen: " + str(msglen) + " is_serialized: " + str(serialization != 'n')
59 | msg = self.receive_msg(client, msglen)
60 | if serialization != 'n':
61 | try:
62 | if serialization == 'd': # dill serialization
63 | msg = dill.loads(msg)
64 | elif serialization == 'j': # json serialization
65 | msg = json.loads(msg)
66 | else:
67 | print "Unrecognized serialization type: %s" % serialization
68 | continue
69 | except (UnicodeDecodeError, ValueError) as e:
70 | print "Error de-serializing message: %s \n %s" % (
71 | msg, traceback.format_exc(e))
72 | continue
73 | self.queue.put(msg)
74 | except RuntimeError:
75 | if self.verbose:
76 | print "Client socket: " + str(address) + " closed"
77 |
78 | def receive_msg(self, client, msglen):
79 | msg = ''
80 | while len(msg) < msglen:
81 | chunk = client.recv(msglen-len(msg))
82 | if chunk == '':
83 | raise RuntimeError("socket connection broken")
84 | msg = msg + chunk
85 | return msg
86 |
87 | def close(self):
88 | self.sock.close()
89 |
90 | class clientsocket:
91 | """A client socket for sending messages"""
92 | def __init__(self, serialization='json', verbose=False):
93 | """ `serialization` specifies the type of serialization to use for
94 | non-str messages. Supported formats:
95 | * 'json' uses the json module. Cross-language support. (default)
96 | * 'dill' uses the dill pickle module. Python only.
97 | """
98 | self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
99 | if serialization != 'json' and serialization != 'dill':
100 | raise ValueError("Unsupported serialization type: %s" % serialization)
101 | self.serialization = serialization
102 | self.verbose = verbose
103 |
104 | def connect(self, host, port):
105 | if self.verbose: print "Connecting to: %s:%i" % (host, port)
106 | self.sock.connect((host, port))
107 |
108 | def send(self, msg):
109 | """
110 | Sends an arbitrary python object to the connected socket. Serializes
111 | using dill if not str, and prepends msg len (4-bytes) and
112 | serialization type (1-byte).
113 | """
114 | #if input not string, serialize to string
115 | if type(msg) is not str:
116 | if self.serialization == 'dill':
117 | msg = dill.dumps(msg)
118 | serialization = 'd'
119 | elif self.serialization == 'json':
120 | msg = json.dumps(msg)
121 | serialization = 'j'
122 | else:
123 | raise ValueError("Unsupported serialization type set: %s" % serialization)
124 | else:
125 | serialization = 'n'
126 |
127 | if self.verbose: print "Sending message with serialization %s" % serialization
128 |
129 | #prepend with message length
130 | msg = struct.pack('>Lc', len(msg), serialization) + msg
131 | totalsent = 0
132 | while totalsent < len(msg):
133 | sent = self.sock.send(msg[totalsent:])
134 | if sent == 0:
135 | raise RuntimeError("socket connection broken")
136 | totalsent = totalsent + sent
137 |
138 | def close(self):
139 | self.sock.close()
140 |
141 | if __name__ == '__main__':
142 | import sys
143 |
144 | #Just for testing
145 | if sys.argv[1] == 's':
146 | sock = serversocket(verbose=True)
147 | sock.start_accepting()
148 | raw_input("Press enter to exit...")
149 | sock.close()
150 | elif sys.argv[1] == 'c':
151 | host = raw_input("Enter the host name:\n")
152 | port = raw_input("Enter the port:\n")
153 | serialization = raw_input("Enter the serialization type (default: 'json'):\n")
154 | if serialization == '':
155 | serialization = 'json'
156 | sock = clientsocket(serialization=serialization)
157 | sock.connect(host, int(port))
158 | msg = None
159 |
160 | # some predefined messages
161 | tuple_msg = ('hello','world')
162 | list_msg = ['hello','world']
163 | dict_msg = {'hello':'world'}
164 | def function_msg(x): return x
165 |
166 | # read user input
167 | while msg != "quit":
168 | msg = raw_input("Enter a message to send:\n")
169 | if msg == 'tuple':
170 | sock.send(tuple_msg)
171 | elif msg == 'list':
172 | sock.send(list_msg)
173 | elif msg == 'dict':
174 | sock.send(dict_msg)
175 | elif msg == 'function':
176 | sock.send(function_msg)
177 | else:
178 | sock.send(msg)
179 | sock.close()
180 |
--------------------------------------------------------------------------------
/automation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/__init__.py
--------------------------------------------------------------------------------
/automation/default_browser_params.json:
--------------------------------------------------------------------------------
1 | {
2 | "extension_enabled": true,
3 | "disable_webdriver_self_id": true,
4 | "cookie_instrument": false,
5 | "js_instrument": true,
6 | "cp_instrument": false,
7 | "http_instrument": false,
8 | "save_javascript": true,
9 |
10 | "random_attributes": false,
11 | "bot_mitigation": false,
12 | "disable_flash": true,
13 | "profile_tar": null,
14 | "profile_archive_dir": null,
15 | "headless": true,
16 | "browser": "firefox",
17 | "tp_cookies": "always",
18 | "donottrack": false,
19 | "ghostery": false,
20 | "https-everywhere": false,
21 | "adblock-plus": false,
22 | "tracking-protection": false,
23 | "proxy": false,
24 | "save_javascript_proxy": false,
25 | "mobile_platform": "android"
26 | }
27 |
--------------------------------------------------------------------------------
/automation/default_manager_params.json:
--------------------------------------------------------------------------------
1 | {
2 | "data_directory": "~/openwpm/",
3 | "log_directory": "~/openwpm/",
4 | "database_name": "crawl-data.sqlite",
5 | "log_file": "openwpm.log",
6 | "failure_limit": null,
7 | "testing": false
8 | }
9 |
--------------------------------------------------------------------------------
/automation/schema.sql:
--------------------------------------------------------------------------------
1 | /* This file is sourced during the initialization
2 | * of the crawler. Make sure everything is CREATE
3 | * IF NOT EXISTS, otherwise there will be errors
4 | */
5 |
6 | /* Crawler Tables */
7 |
8 | CREATE TABLE IF NOT EXISTS task (
9 | task_id INTEGER PRIMARY KEY AUTOINCREMENT,
10 | start_time DATETIME DEFAULT CURRENT_TIMESTAMP,
11 | manager_params TEXT NOT NULL,
12 | openwpm_version TEXT NOT NULL,
13 | browser_version TEXT NOT NULL);
14 |
15 | CREATE TABLE IF NOT EXISTS crawl (
16 | crawl_id INTEGER PRIMARY KEY AUTOINCREMENT,
17 | task_id INTEGER NOT NULL,
18 | browser_params TEXT NOT NULL,
19 | screen_res TEXT,
20 | ua_string TEXT,
21 | finished BOOLEAN NOT NULL DEFAULT 0,
22 | start_time DATETIME DEFAULT CURRENT_TIMESTAMP,
23 | FOREIGN KEY(task_id) REFERENCES task(task_id));
24 |
25 | CREATE TABLE IF NOT EXISTS xpath (
26 | id INTEGER PRIMARY KEY AUTOINCREMENT,
27 | name VARCHAR(100) NOT NULL,
28 | url VARCHAR(500) NOT NULL,
29 | xpath VARCHAR(500) NOT NULL,
30 | absolute_xpath VARCHAR(500),
31 | ctime DATETIME DEFAULT CURRENT_TIMESTAMP,
32 | UNIQUE(name, url));
33 |
34 | CREATE TABLE IF NOT EXISTS site_visits (
35 | visit_id INTEGER PRIMARY KEY,
36 | crawl_id INTEGER NOT NULL,
37 | site_url VARCHAR(500) NOT NULL,
38 | FOREIGN KEY(crawl_id) REFERENCES crawl(id));
39 |
40 | /* Proxy Tables */
41 |
42 | /* TODO: add publix_suffix to db structure */
43 | /* TODO: link with headers */
44 | CREATE TABLE IF NOT EXISTS http_requests_proxy (
45 | id INTEGER PRIMARY KEY AUTOINCREMENT,
46 | crawl_id INTEGER NOT NULL,
47 | url VARCHAR(500) NOT NULL,
48 | method VARCHAR(500) NOT NULL,
49 | referrer VARCHAR(500) NOT NULL,
50 | headers VARCHAR(500) NOT NULL,
51 | visit_id INTEGER NOT NULL,
52 | time_stamp VARCHAR(500) NOT NULL);
53 |
54 | /* TODO: add publix_suffix to db structure */
55 | /* TODO: link with headers */
56 | /* TODO: link with requests */
57 | CREATE TABLE IF NOT EXISTS http_responses_proxy (
58 | id INTEGER PRIMARY KEY AUTOINCREMENT,
59 | crawl_id INTEGER NOT NULL,
60 | url VARCHAR(500) NOT NULL,
61 | method VARCHAR(500) NOT NULL,
62 | referrer VARCHAR(500) NOT NULL,
63 | response_status INTEGER NOT NULL,
64 | response_status_text VARCHAR(500) NOT NULL,
65 | headers VARCHAR(500) NOT NULL,
66 | location VARCHAR(500) NOT NULL,
67 | visit_id INTEGER NOT NULL,
68 | time_stamp VARCHAR(500) NOT NULL,
69 | content_hash VARCHAR(50));
70 |
71 | /* Firefox Storage Vector Dumps */
72 |
73 | CREATE TABLE IF NOT EXISTS flash_cookies (
74 | id INTEGER PRIMARY KEY AUTOINCREMENT,
75 | crawl_id INTEGER NOT NULL,
76 | visit_id INTEGER NOT NULL,
77 | domain VARCHAR(500),
78 | filename VARCHAR(500),
79 | local_path VARCHAR(1000),
80 | key TEXT,
81 | content TEXT,
82 | FOREIGN KEY(crawl_id) REFERENCES crawl(id),
83 | FOREIGN KEY(visit_id) REFERENCES site_visits(id));
84 |
85 | CREATE TABLE IF NOT EXISTS profile_cookies (
86 | id INTEGER PRIMARY KEY AUTOINCREMENT,
87 | crawl_id INTEGER NOT NULL,
88 | visit_id INTEGER NOT NULL,
89 | baseDomain TEXT,
90 | name TEXT,
91 | value TEXT,
92 | host TEXT,
93 | path TEXT,
94 | expiry INTEGER,
95 | accessed INTEGER,
96 | creationTime INTEGER,
97 | isSecure INTEGER,
98 | isHttpOnly INTEGER,
99 | FOREIGN KEY(crawl_id) REFERENCES crawl(id),
100 | FOREIGN KEY(visit_id) REFERENCES site_visits(id));
101 |
102 | CREATE TABLE IF NOT EXISTS localStorage (
103 | id INTEGER PRIMARY KEY AUTOINCREMENT,
104 | crawl_id INTEGER NOT NULL,
105 | page_url VARCHAR(500) NOT NULL,
106 | scope TEXT,
107 | KEY TEXT,
108 | value TEXT,
109 | FOREIGN KEY(crawl_id) REFERENCES crawl(id));
110 |
111 | /* Crawl History table */
112 | CREATE TABLE IF NOT EXISTS CrawlHistory (
113 | crawl_id INTEGER,
114 | command TEXT,
115 | arguments TEXT,
116 | bool_success INTEGER,
117 | dtg DATETIME DEFAULT (CURRENT_TIMESTAMP),
118 | FOREIGN KEY(crawl_id) REFERENCES crawl(id));
119 |
120 |
--------------------------------------------------------------------------------
/automation/utilities/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/automation/utilities/__init__.py
--------------------------------------------------------------------------------
/automation/utilities/db_utils.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import os
3 | import plyvel
4 |
5 |
6 | def query_db(db, query, params=None):
7 | """Run a query against the given db.
8 |
9 | If params is not None, securely construct a query from the given
10 | query string and params.
11 | """
12 | with sqlite3.connect(db) as con:
13 | if params is None:
14 | rows = con.execute(query).fetchall()
15 | else:
16 | rows = con.execute(query, params).fetchall()
17 | return rows
18 |
19 |
20 | def get_javascript_content(data_directory):
21 | """Yield key, value pairs from the deduplicated leveldb content database
22 |
23 | Parameters
24 | ----------
25 | data_directory : str
26 | root directory of the crawl files containing `javascript.ldb`
27 | """
28 | db_path = os.path.join(data_directory, 'javascript.ldb')
29 | db = plyvel.DB(db_path,
30 | create_if_missing=False,
31 | compression='snappy')
32 | for content_hash, content in db.iterator():
33 | yield content_hash, content
34 | db.close()
35 |
36 |
37 | def get_javascript_entries(db, all_columns=False):
38 | if all_columns:
39 | select_columns = "*"
40 | else:
41 | select_columns = "script_url, symbol, operation, value, arguments"
42 |
43 | return query_db(db, "SELECT %s FROM javascript" % select_columns)
44 |
45 |
46 | def any_command_failed(db):
47 | """Returns True if any command in a given database failed"""
48 | rows = query_db(db, "SELECT * FROM CrawlHistory;")
49 | for row in rows:
50 | if row[3] != 1:
51 | return True
52 | return False
53 |
--------------------------------------------------------------------------------
/automation/utilities/domain_utils.py:
--------------------------------------------------------------------------------
1 | from publicsuffix import PublicSuffixList, fetch
2 | from ipaddress import ip_address
3 | from urlparse import urlparse
4 | from functools import wraps
5 | import tempfile
6 | import codecs
7 | import os
8 |
9 | # We cache the Public Suffix List in temp directory
10 | PSL_CACHE_LOC = os.path.join(tempfile.gettempdir(),'public_suffix_list.dat')
11 |
12 | def get_psl():
13 | """
14 | Grabs an updated public suffix list.
15 | """
16 | if not os.path.isfile(PSL_CACHE_LOC):
17 | print "%s does not exist, downloading a copy." % PSL_CACHE_LOC
18 | psl_file = fetch()
19 | with codecs.open(PSL_CACHE_LOC, 'w', encoding='utf8') as f:
20 | f.write(psl_file.read())
21 | print "Using psl from cache: %s" % PSL_CACHE_LOC
22 | psl_cache = codecs.open(PSL_CACHE_LOC, encoding='utf8')
23 | return PublicSuffixList(psl_cache)
24 |
25 | def load_psl(function):
26 | @wraps(function)
27 | def wrapper(*args, **kwargs):
28 | if not kwargs.has_key('psl'):
29 | if wrapper.psl is None:
30 | wrapper.psl = get_psl()
31 | return function(*args, psl=wrapper.psl, **kwargs)
32 | else:
33 | return function(*args, **kwargs)
34 | wrapper.psl = None
35 | return wrapper
36 |
37 | def is_ip_address(hostname):
38 | """
39 | Check if the given string is a valid IP address
40 | """
41 | try:
42 | ip_address(unicode(hostname))
43 | return True
44 | except ValueError:
45 | return False
46 |
47 | @load_psl
48 | def get_ps_plus_1(url, **kwargs):
49 | """
50 | Returns the PS+1 of the url. This will also return
51 | an IP address if the hostname of the url is a valid
52 | IP address.
53 |
54 | An (optional) PublicSuffixList object can be passed with keyword arg 'psl',
55 | otherwise a version cached in the system temp directory is used.
56 | """
57 | if not kwargs.has_key('psl'):
58 | raise ValueError("A PublicSuffixList must be passed as a keyword argument.")
59 | hostname = urlparse(url).hostname
60 | if is_ip_address(hostname):
61 | return hostname
62 | elif hostname is None:
63 | # Possible reasons hostname is None, `url` is:
64 | # * malformed
65 | # * a relative url
66 | # * a `javascript:` or `data:` url
67 | # * many others
68 | return
69 | else:
70 | return kwargs['psl'].get_public_suffix(hostname)
71 |
72 | @load_psl
73 | def hostname_subparts(url, include_ps=False, **kwargs):
74 | """
75 | Returns a list of slices of a url's hostname down to the PS+1 (or PS if include_ps)
76 |
77 | For example: http://a.b.c.d.com/path?query#frag would yield:
78 | [a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False
79 | [a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True
80 |
81 | An (optional) PublicSuffixList object can be passed with keyword arg 'psl'.
82 | otherwise a version cached in the system temp directory is used.
83 | """
84 | if not kwargs.has_key('psl'):
85 | raise ValueError("A PublicSuffixList must be passed as a keyword argument.")
86 | hostname = urlparse(url).hostname
87 |
88 | # If an IP address, just return a single item list with the IP
89 | if is_ip_address(hostname):
90 | return [hostname]
91 |
92 | subparts = list()
93 | ps_plus_1 = kwargs['psl'].get_public_suffix(hostname)
94 |
95 | # We expect all ps_plus_1s to have at least one '.'
96 | # If they don't, the url was likely malformed, so we'll just return an
97 | # empty list
98 | if '.' not in ps_plus_1:
99 | return []
100 | subdomains = hostname[:-(len(ps_plus_1)+1)].split('.')
101 | if subdomains == ['']:
102 | subdomains = []
103 | for i in range(len(subdomains)):
104 | subparts.append('.'.join(subdomains[i:])+'.'+ps_plus_1)
105 | subparts.append(ps_plus_1)
106 | if include_ps:
107 | try:
108 | subparts.append(ps_plus_1[ps_plus_1.index('.')+1:])
109 | except:
110 | pass
111 | return subparts
112 |
113 | def get_stripped_url(url, scheme=False):
114 | """Returns a url stripped to (scheme)?+hostname+path"""
115 | purl = urlparse(url)
116 | surl = ''
117 | if scheme:
118 | surl += purl.scheme + '://'
119 | try:
120 | surl += purl.hostname + purl.path
121 | except TypeError:
122 | surl += purl.hostname
123 | return surl
124 |
125 | def get_stripped_urls(urls, scheme=False):
126 | """ Returns a set (or list) of urls stripped to (scheme)?+hostname+path """
127 | new_urls = list()
128 | for url in urls:
129 | get_stripped_url(url, scheme)
130 | if type(urls) == set:
131 | return set(new_urls)
132 | return new_urls
133 |
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | from automation import TaskManager, CommandSequence
2 |
3 | # The list of sites that we wish to crawl
4 | NUM_BROWSERS = 15
5 | # sites = ["https://securehomes.esat.kuleuven.be/~gacar/dev/test/sensor/"]
6 | sites = []
7 | csv_name = "top-1m.csv"
8 | no_of_sites = 100000
9 | for l in open(csv_name).readlines()[1:no_of_sites]:
10 | url = l.split(",")[-1].rstrip()
11 | sites.append("http://%s" % url)
12 | #sites = ['http://www.example.com',
13 | #'http://www.princeton.edu',
14 | #'http://citp.princeton.edu/']
15 |
16 | # Loads the manager preference and 3 copies of the default browser dictionaries
17 | manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
18 |
19 | # Update browser configuration (use this for per-browser settings)
20 | for i in xrange(NUM_BROWSERS):
21 | browser_params[i]['http_instrument'] = True # Record HTTP Requests and Responses
22 | browser_params[i]['disable_flash'] = False #Enable flash for all three browsers
23 | browser_params[0]['headless'] = True #Launch only browser 0 headless
24 |
25 | # Update TaskManager configuration (use this for crawl-wide settings)
26 | manager_params['data_directory'] = '~/openwpm/'
27 | manager_params['log_directory'] = '~/openwpm/'
28 |
29 | # Instantiates the measurement platform
30 | # Commands time out by default after 60 seconds
31 | manager = TaskManager.TaskManager(manager_params, browser_params)
32 |
33 | # Visits the sites with all browsers simultaneously
34 | for site in sites:
35 | command_sequence = CommandSequence.CommandSequence(site)
36 |
37 | # Start by visiting the page
38 | command_sequence.get(sleep=0, timeout=60)
39 |
40 | # dump_profile_cookies/dump_flash_cookies closes the current tab.
41 | command_sequence.dump_profile_cookies(120)
42 |
43 | manager.execute_command_sequence(command_sequence, index='**') # ** = synchronized browsers
44 |
45 | # Shuts down the browsers and waits for the data to finish logging
46 | manager.close()
47 |
--------------------------------------------------------------------------------
/feature_extraction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/feature_extraction/__init__.py
--------------------------------------------------------------------------------
/feature_extraction/utils.py:
--------------------------------------------------------------------------------
1 | from tld import get_fld
2 | from urlparse import urlparse
3 | import ipaddress
4 | import json
5 |
6 | DISCONNECT_JSON = "adblock/disconnect.json"
7 |
8 |
9 | def get_ps1_or_host(url):
10 | if not url.startswith("http"):
11 | url = 'http://' + url
12 |
13 | try:
14 | return get_fld(url, fail_silently=False)
15 | except Exception:
16 | hostname = urlparse(url).hostname
17 | try:
18 | ipaddress.ip_address(hostname)
19 | return hostname
20 | except Exception:
21 | return None
22 |
23 |
24 | def is_third_party(url, site_url):
25 | # !!!: We return False when we have missing information
26 | if not site_url:
27 | return False
28 |
29 | site_ps1 = get_ps1_or_host(site_url)
30 | if site_ps1 is None:
31 | return False
32 |
33 | req_ps1 = get_ps1_or_host(url)
34 | if req_ps1 is None:
35 | # print url
36 | return False
37 | if (req_ps1 == site_ps1):
38 | return False
39 |
40 | return True
41 |
42 |
43 | def get_disconnect_blocked_hosts(disconnect_json=DISCONNECT_JSON):
44 | blocked_hosts = set()
45 | disconnect = json.loads(open(disconnect_json).read())
46 | categories = disconnect["categories"]
47 | for _, entries in categories.iteritems():
48 | for entry in entries:
49 | adresses = entry.values()
50 | for address in adresses:
51 | address.pop("dnt", None) # there's one such entry
52 | # and it's not a domain/host
53 | hosts_list = address.values()
54 | blocked_hosts.update(hosts_list[0])
55 |
56 | print len(blocked_hosts), "blocked hosts"
57 | # note that disconnect keep a list of blocked hosts, not PS+1s
58 | assert "adwords.google.com" in blocked_hosts
59 | assert "facebook.com" in blocked_hosts
60 | return list(blocked_hosts)
61 |
62 |
63 | def is_blocked_by_disconnect_old(url, disconnect_blocked_hosts):
64 | return urlparse(url).hostname in disconnect_blocked_hosts
65 |
66 |
67 | def is_blocked_by_disconnect(url, disconnect_blocked_hosts):
68 | host = urlparse(url).hostname
69 | if host in disconnect_blocked_hosts:
70 | return True
71 | while True:
72 | # strip one subdomain at a time
73 | host = host.split(".", 1)[-1] # take foo.com from bar.foo.com
74 | if "." not in host:
75 | return False
76 | if host in disconnect_blocked_hosts:
77 | return True
78 | return False # this shouldn't happen unless we are provided a corrupt hostname
79 |
80 |
81 | if __name__ == '__main__':
82 | # Test for the is_blocked_by_disconnect
83 | # TODO: move to a separate file
84 | assert is_blocked_by_disconnect("http://adwords.google.com", ["facebook.com", "adwords.google.com"])
85 | assert not is_blocked_by_disconnect("http://example.com", ["facebook.com", "google.com"])
86 | assert not is_blocked_by_disconnect("http://8.8.8.8", ["facebook.com", "google.com"])
87 | disconnect_blocked_hosts = get_disconnect_blocked_hosts()
88 | assert is_blocked_by_disconnect("https://tps40.doubleverify.com/visit.js",
89 | disconnect_blocked_hosts)
90 | assert is_blocked_by_disconnect("https://pagead2.googlesyndication.com/bg/CI_hqThbQjBwoUSK10cIsovHByRI4InaU0wolTzGCLU.js",
91 | disconnect_blocked_hosts)
92 | assert not is_blocked_by_disconnect("http://bar-foo.com", ["foo.com"])
93 | assert not is_blocked_by_disconnect("http://oo.com", ["foo.com"])
94 | assert is_blocked_by_disconnect("http://bar.foo.com", ["foo.com"])
95 | assert is_blocked_by_disconnect("http://sub.bar.foo.com", ["foo.com"])
96 |
--------------------------------------------------------------------------------
/install-analysis.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # Download adblock lists
5 | mkdir -p feature_extraction/adblock
6 | wget https://easylist.to/easylist/easylist.txt -P adblock
7 | wget https://easylist.to/easylist/easyprivacy.txt -P adblock
8 | wget https://github.com/disconnectme/disconnect-tracking-protection/blob/master/services.json -O adblock/disconnect.json
9 | sudo pip install adblockparser
10 |
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | echo "Would you like to install Adobe Flash Player? (Only required for crawls with Flash) [y,N]"
5 | read -s -n 1 response
6 | if [[ $response = "" ]] || [ $response == 'n' ] || [ $response == 'N' ]; then
7 | flash=false
8 | echo Not installing Adobe Flash Plugin
9 | elif [ $response == 'y' ] || [ $response == 'Y' ]; then
10 | flash=true
11 | echo Installing Adobe Flash Plugin
12 | sudo sh -c 'echo "deb http://archive.canonical.com/ubuntu/ trusty partner" >> /etc/apt/sources.list.d/canonical_partner.list'
13 | else
14 | echo Unrecognized response, exiting
15 | exit 1
16 | fi
17 |
18 | sudo apt-get update
19 |
20 | # npm is required for compiling Firefox extension
21 | sudo apt-get install -y firefox htop git python-dev libxml2-dev libxslt-dev libffi-dev libssl-dev build-essential xvfb libboost-python-dev libleveldb-dev libjpeg-dev libgtk2.0-0 npm
22 |
23 | # For some versions of ubuntu, the package libleveldb1v5 isn't available. Use libleveldb1 instead.
24 | sudo apt-get install -y libleveldb1v5 || sudo apt-get install -y libleveldb1
25 |
26 | if [ "$flash" = true ]; then
27 | sudo apt-get install -y adobe-flashplugin
28 | fi
29 |
30 | # Check if we're running on continuous integration
31 | # Python requirements are already installed by .travis.yml on Travis
32 | if [ "$TRAVIS" != "true" ]; then
33 | wget https://bootstrap.pypa.io/get-pip.py
34 | sudo -H python get-pip.py
35 | rm get-pip.py
36 | sudo pip install -U -r requirements.txt
37 | fi
38 |
39 | # Install specific version of Firefox known to work well with the selenium version above
40 | if [ $(uname -m) == 'x86_64' ]; then
41 | echo Downloading 64-bit Firefox
42 | wget https://ftp.mozilla.org/pub/firefox/releases/45.9.0esr/linux-x86_64/en-US/firefox-45.9.0esr.tar.bz2
43 | else
44 | echo Downloading 32-bit Firefox
45 | wget https://ftp.mozilla.org/pub/firefox/releases/45.9.0esr/linux-i686/en-US/firefox-45.9.0esr.tar.bz2
46 | fi
47 | tar jxf firefox*.tar.bz2
48 | rm -rf firefox-bin
49 | mv firefox firefox-bin
50 | rm firefox*.tar.bz2
51 |
52 |
53 | # Fix naming issue (exists in 14.04 and 16.04)
54 | if [ ! -f /usr/bin/node ]; then
55 | sudo ln -s /usr/bin/nodejs /usr/bin/node
56 | fi
57 |
58 | # install jpm
59 | sudo npm install jpm -g
60 |
--------------------------------------------------------------------------------
/mobile_sensor_crawl.py:
--------------------------------------------------------------------------------
1 | from automation import TaskManager, CommandSequence
2 |
3 | # number of browsers to run in parallel
4 | NUM_BROWSERS = 10
5 |
6 |
7 | sites = []
8 | csv_name = "top-1m.csv"
9 | no_of_sites = 100000 # crawl 100K sites
10 | for l in open(csv_name).readlines()[0:no_of_sites]:
11 | site = l.split(",")[-1].rstrip()
12 | sites.append(site)
13 |
14 | # Loads the manager preference and 3 copies of the default browser dictionaries
15 | manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
16 |
17 | # Update browser configuration (use this for per-browser settings)
18 | for i in xrange(NUM_BROWSERS):
19 | browser_params[i]['http_instrument'] = True # Record HTTP Requests and Responses
20 | browser_params[i]['disable_flash'] = True # Disable flash for all browsers
21 | browser_params[i]['js_instrument'] = True # Enable JS instrumentation
22 | browser_params[i]['save_javascript'] = True # save JS files
23 | browser_params[i]['headless'] = True # headless
24 | browser_params[i]['trigger_sensor_events'] = True # fake sensor events
25 | browser_params[i]['mobile_platform'] = "android" # or "iphone"
26 |
27 | # Update TaskManager configuration (use this for crawl-wide settings)
28 | manager_params['data_directory'] = '~/openwpm_mobile_100k/'
29 | manager_params['log_directory'] = '~/openwpm_mobile_100k/'
30 |
31 | # Instantiates the measurement platform
32 | # Commands time out by default after 60 seconds
33 | manager = TaskManager.TaskManager(manager_params, browser_params)
34 |
35 | # Visits the sites with all browsers simultaneously
36 | for rank, site in enumerate(sites, 1):
37 | url = "http://%s" % site
38 | command_sequence = CommandSequence.CommandSequence(url, reset=True)
39 |
40 | # Start by visiting the page
41 | command_sequence.get(sleep=10, timeout=60)
42 | # command_sequence.save_screenshot('%d_%s_screenshot' % (rank, site))
43 | # dump_profile_cookies/dump_flash_cookies closes the current tab.
44 | command_sequence.dump_profile_cookies(120)
45 |
46 | manager.execute_command_sequence(command_sequence)
47 |
48 | # Shuts down the browsers and waits for the data to finish logging
49 | manager.close()
50 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | pyvirtualdisplay
3 | beautifulsoup4
4 | python-dateutil
5 | tld
6 | multiprocess
7 | dill
8 | pyamf
9 | psutil
10 | plyvel
11 | tblib
12 | tabulate
13 | pytest
14 | publicsuffix
15 | # Install specific version of selenium known to work well with the Firefox install we use
16 | selenium==2.53.0
17 | mmh3
18 | # IPython 6.0+ does not support python 2.7
19 | IPython>=5.0,<6.0
20 | # See https://github.com/ActiveState/appdirs/issues/89
21 | appdirs>=1.4.3
22 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/__init__.py
--------------------------------------------------------------------------------
/test/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import utilities
3 | from ..automation.utilities.platform_utils import create_xpi
4 |
5 |
6 | @pytest.fixture(scope="session", autouse=True)
7 | def prepare_test_setup(request):
8 | """Run an HTTP server during the tests."""
9 | print "\nCalling create_xpi", create_xpi()
10 | print "\nStarting local_http_server"
11 | server, server_thread = utilities.start_server()
12 |
13 | def local_http_server_stop():
14 | print "\nClosing server thread..."
15 | server.shutdown()
16 | server_thread.join()
17 |
18 | request.addfinalizer(local_http_server_stop)
19 |
--------------------------------------------------------------------------------
/test/manual_test.py:
--------------------------------------------------------------------------------
1 | from utilities import BASE_TEST_URL, start_server
2 | from conftest import create_xpi
3 | from os.path import dirname, join, realpath
4 | from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
5 | from selenium import webdriver
6 | import subprocess
7 | import atexit
8 |
9 | OPENWPM_LOG_PREFIX = "console.log: openwpm: "
10 | INSERT_PREFIX = "Array"
11 | BASE_DIR = dirname(dirname(realpath(__file__)))
12 | EXT_PATH = join(BASE_DIR, 'automation', 'Extension', 'firefox')
13 | FF_BIN_PATH = join(BASE_DIR, 'firefox-bin', 'firefox')
14 |
15 |
16 | class bcolors:
17 | HEADER = '\033[95m'
18 | OKBLUE = '\033[94m'
19 | OKGREEN = '\033[92m'
20 | WARNING = '\033[93m'
21 | FAIL = '\033[91m'
22 | ENDC = '\033[0m'
23 | BOLD = '\033[1m'
24 | UNDERLINE = '\033[4m'
25 |
26 |
27 | def get_command_output(command, cwd=None):
28 | popen = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
29 | stderr=subprocess.STDOUT, cwd=cwd)
30 | return iter(popen.stdout.readline, b"")
31 |
32 |
33 | def colorize(line):
34 | if INSERT_PREFIX in line: # print long DB insert lines in blue
35 | line = line.replace(INSERT_PREFIX, bcolors.OKBLUE + INSERT_PREFIX)
36 | if OPENWPM_LOG_PREFIX in line:
37 | line = line.replace(OPENWPM_LOG_PREFIX,
38 | OPENWPM_LOG_PREFIX + bcolors.OKGREEN)
39 | return line
40 |
41 |
42 | def start_webdriver(with_extension=False):
43 | """ Open a webdriver instance and a server for the test pages
44 |
45 | This is meant to be imported and run manually from a python or
46 | ipython shell. A webdriver instance is returned and both the webdriver
47 | and server will automatically clean up when the shell is exited.
48 |
49 | Parameters
50 | ----------
51 | with_extension : boolean
52 | Set to True to also load OpenWPM extension instrumentation
53 |
54 | Returns
55 | -------
56 | webdriver
57 | A selenium webdriver instance.
58 | """
59 | fb = FirefoxBinary(FF_BIN_PATH)
60 | server, thread = start_server()
61 |
62 | def register_cleanup(driver):
63 | driver.get(BASE_TEST_URL)
64 |
65 | def cleanup_server():
66 | print "Cleanup before shutdown..."
67 | server.shutdown()
68 | thread.join()
69 | print "...sever shutdown"
70 | driver.quit()
71 | print "...webdriver closed"
72 |
73 | atexit.register(cleanup_server)
74 | return driver
75 |
76 | if not with_extension:
77 | return register_cleanup(webdriver.Firefox(firefox_binary=fb))
78 |
79 | # add openwpm extension to profile
80 | create_xpi()
81 | fp = webdriver.FirefoxProfile()
82 | ext_xpi = join(EXT_PATH, 'openwpm.xpi')
83 | fp.add_extension(extension=ext_xpi)
84 | fp.set_preference("extensions.@openwpm.sdk.console.logLevel", "all")
85 |
86 | return register_cleanup(
87 | webdriver.Firefox(firefox_binary=fb, firefox_profile=fp))
88 |
89 |
90 | def start_jpm():
91 | cmd_jpm_run = "jpm run --binary-args 'url %s' -b %s" % (BASE_TEST_URL,
92 | FF_BIN_PATH)
93 | server, thread = start_server()
94 | try:
95 | # http://stackoverflow.com/a/4417735/3104416
96 | for line in get_command_output(cmd_jpm_run, cwd=EXT_PATH):
97 | print colorize(line), bcolors.ENDC,
98 | except KeyboardInterrupt:
99 | print "Keyboard Interrupt detected, shutting down..."
100 | print "\nClosing server thread..."
101 | server.shutdown()
102 | thread.join()
103 |
104 |
105 | if __name__ == '__main__':
106 | import IPython
107 | import sys
108 |
109 | # TODO use some real parameter handling library
110 | if len(sys.argv) == 1:
111 | start_jpm()
112 | elif len(sys.argv) >= 2 and sys.argv[1] == '--selenium':
113 | if len(sys.argv) == 3 and sys.argv[2] == '--no-extension':
114 | driver = start_webdriver(False)
115 | else:
116 | driver = start_webdriver(True)
117 | print "\nDropping into ipython shell...."
118 | print " * Interact with the webdriver instance using `driver`"
119 | print " * The webdriver and test page server will close automatically"
120 | print " * Use `exit` to quit the ipython shell\n"
121 | IPython.embed()
122 | else:
123 | print ("Unrecognized arguments. Usage:\n"
124 | "python manual_test.py ('--selenium')? ('--no-extension')?")
125 |
--------------------------------------------------------------------------------
/test/openwpmtest.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 | import utilities
3 | import pytest
4 | import commands
5 | from ..automation import TaskManager
6 |
7 |
8 | class OpenWPMTest(object):
9 | NUM_BROWSERS = 1
10 |
11 | @pytest.fixture(autouse=True)
12 | def set_tmpdir(self, tmpdir):
13 | """Create a tmpdir fixture to be used in `get_test_config`.
14 |
15 | Based on:
16 | https://mail.python.org/pipermail/pytest-dev/2014-April/002484.html
17 | """
18 | self.tmpdir = str(tmpdir)
19 |
20 | def visit(self, page_url, data_dir="", sleep_after=0):
21 | """Visit a test page with the given parameters."""
22 | manager_params, browser_params = self.get_config(data_dir)
23 | manager = TaskManager.TaskManager(manager_params, browser_params)
24 | if not page_url.startswith("http"):
25 | page_url = utilities.BASE_TEST_URL + page_url
26 | manager.get(url=page_url, sleep=sleep_after)
27 | manager.close()
28 | return manager_params['db']
29 |
30 | def get_test_config(self, data_dir="",
31 | num_browsers=NUM_BROWSERS):
32 | """Load and return the default test parameters."""
33 | if not data_dir:
34 | data_dir = self.tmpdir
35 | manager_params, browser_params = TaskManager.\
36 | load_default_params(num_browsers)
37 | manager_params['data_directory'] = data_dir
38 | manager_params['log_directory'] = data_dir
39 | browser_params[0]['headless'] = True
40 | manager_params['db'] = join(manager_params['data_directory'],
41 | manager_params['database_name'])
42 | return manager_params, browser_params
43 |
44 | def is_installed(self, pkg_name):
45 | """Check if a Linux package is installed."""
46 | cmd = 'which %s' % pkg_name
47 | status, _ = commands.getstatusoutput(cmd)
48 | return False if status else True
49 |
50 | def assert_is_installed(self, pkg):
51 | assert self.is_installed(pkg), 'Cannot find %s in your system' % pkg
52 |
53 | def assert_py_pkg_installed(self, pkg):
54 | # some modules are imported using a different name than the ones used
55 | # at the installation.
56 | pkg_name_mapping = {"pyopenssl": "OpenSSL",
57 | "mitmproxy": "libmproxy",
58 | "beautifulsoup4": "bs4",
59 | "python-dateutil": "dateutil"
60 | }
61 | # get the mapped name if it exists.
62 | pkg_importable = pkg_name_mapping.get(pkg.lower(), pkg)
63 | try:
64 | __import__(pkg_importable)
65 | except ImportError:
66 | pytest.fail("Cannot find python package %s in your system" % pkg)
67 |
--------------------------------------------------------------------------------
/test/test_adblock_plus.py:
--------------------------------------------------------------------------------
1 | from urlparse import urlparse
2 | import pytest
3 | import os
4 |
5 | from ..automation import TaskManager
6 | from ..automation.Errors import BrowserConfigError
7 | from ..automation.utilities.platform_utils import fetch_adblockplus_list
8 | from ..automation.utilities import domain_utils, db_utils
9 |
10 | import utilities
11 | import expected
12 | from openwpmtest import OpenWPMTest
13 |
14 | psl = domain_utils.get_psl()
15 |
16 |
17 | class TestABP(OpenWPMTest):
18 |
19 | def get_config(self, data_dir=""):
20 | manager_params, browser_params = self.get_test_config(data_dir)
21 | browser_params[0]['http_instrument'] = True
22 | browser_params[0]['adblock-plus'] = True
23 | return manager_params, browser_params
24 |
25 | def test_list_fetch(self, tmpdir):
26 | data_dir = str(tmpdir)
27 | fetch_adblockplus_list(data_dir)
28 | assert os.path.isfile(os.path.join(data_dir, 'patterns.ini'))
29 | assert os.path.isfile(os.path.join(data_dir, 'elemhide.css'))
30 |
31 | def test_blocks_includes(self, tmpdir):
32 | data_dir = str(tmpdir)
33 | list_loc = os.path.join(data_dir, 'adblock_plus')
34 | manager_params, browser_params = self.get_config(data_dir)
35 | fetch_adblockplus_list(list_loc)
36 | browser_params[0]['adblock-plus_list_location'] = list_loc
37 | manager = TaskManager.TaskManager(manager_params, browser_params)
38 | manager.get(utilities.BASE_TEST_URL + '/abp/adblock_plus_test.html')
39 | manager.close()
40 |
41 | db = os.path.join(data_dir, manager_params['database_name'])
42 | rows = db_utils.query_db(db, "SELECT url FROM http_requests")
43 | urls = set()
44 | for url, in rows:
45 | ps1 = psl.get_public_suffix(urlparse(url).hostname)
46 | # exclude requests to safebrowsing and tracking protection backends
47 | if ps1 not in ("mozilla.com", "mozilla.net"):
48 | urls.add(url)
49 | assert urls == expected.adblockplus
50 |
51 | def test_error_with_missing_option(self):
52 | manager_params, browser_params = self.get_config()
53 | with pytest.raises(BrowserConfigError):
54 | manager = TaskManager.TaskManager(manager_params, browser_params)
55 | manager.close()
56 |
57 | def test_error_with_missing_list(self, tmpdir):
58 | data_dir = str(tmpdir)
59 | list_loc = os.path.join(data_dir, 'adblock_plus')
60 | manager_params, browser_params = self.get_config(data_dir)
61 | browser_params[0]['adblock-plus_list_location'] = list_loc
62 | with pytest.raises(BrowserConfigError):
63 | manager = TaskManager.TaskManager(manager_params, browser_params)
64 | manager.close()
65 |
--------------------------------------------------------------------------------
/test/test_crawl.py:
--------------------------------------------------------------------------------
1 | from urlparse import urlparse
2 | import tarfile
3 | import pytest
4 | import os
5 |
6 | from ..automation import TaskManager
7 | from ..automation.utilities import domain_utils, db_utils
8 | from openwpmtest import OpenWPMTest
9 |
10 |
11 | TEST_SITES = [
12 | 'http://google.com',
13 | 'http://facebook.com',
14 | 'http://youtube.com',
15 | 'http://yahoo.com',
16 | 'http://baidu.com',
17 | 'http://wikipedia.org',
18 | 'http://qq.com',
19 | 'http://linkedin.com',
20 | 'http://taobao.com',
21 | 'http://twitter.com',
22 | 'http://live.com',
23 | 'http://amazon.com',
24 | 'http://sina.com.cn',
25 | 'http://google.co.in',
26 | 'http://hao123.com',
27 | 'http://blogspot.com',
28 | 'http://weibo.com',
29 | 'http://wordpress.com',
30 | 'http://yandex.ru',
31 | 'http://yahoo.co.jp'
32 | ]
33 |
34 | psl = domain_utils.get_psl()
35 |
36 |
37 | class TestCrawl(OpenWPMTest):
38 | """ Runs a short test crawl.
39 |
40 | This should be used to test any features that require real
41 | crawl data. This should be avoided if possible, as controlled
42 | tests will be easier to debug
43 | """
44 |
45 | def get_config(self, data_dir=""):
46 | manager_params, browser_params = self.get_test_config(data_dir)
47 | browser_params[0]['profile_archive_dir'] =\
48 | os.path.join(manager_params['data_directory'], 'browser_profile')
49 | browser_params[0]['http_instrument'] = True
50 | return manager_params, browser_params
51 |
52 | @pytest.mark.slow
53 | def test_browser_profile_coverage(self, tmpdir):
54 | """ Test the coverage of the browser's profile
55 |
56 | This verifies that Firefox's places.sqlite database contains
57 | all visited sites (with a few exceptions). If it does not,
58 | it is likely the profile is lost at some point during the crawl
59 | """
60 | # Run the test crawl
61 | data_dir = os.path.join(str(tmpdir), 'data_dir')
62 | manager_params, browser_params = self.get_config(data_dir)
63 | manager = TaskManager.TaskManager(manager_params, browser_params)
64 | for site in TEST_SITES:
65 | manager.get(site)
66 | ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'],
67 | 'profile.tar.gz')
68 | manager.close()
69 |
70 | # Extract crawl profile
71 | with tarfile.open(ff_db_tar) as tar:
72 | tar.extractall(browser_params[0]['profile_archive_dir'])
73 |
74 | # Output databases
75 | ff_db = os.path.join(browser_params[0]['profile_archive_dir'],
76 | 'places.sqlite')
77 | crawl_db = manager_params['db']
78 |
79 | # Grab urls from crawl database
80 | rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
81 | req_ps = set() # visited domains from http_requests table
82 | for url, in rows:
83 | req_ps.add(psl.get_public_suffix(urlparse(url).hostname))
84 |
85 | hist_ps = set() # visited domains from CrawlHistory Table
86 | successes = dict()
87 | rows = db_utils.query_db(crawl_db, "SELECT arguments, bool_success "
88 | "FROM CrawlHistory WHERE command='GET'")
89 | for url, success in rows:
90 | ps = psl.get_public_suffix(urlparse(url).hostname)
91 | hist_ps.add(ps)
92 | successes[ps] = success
93 |
94 | # Grab urls from Firefox database
95 | profile_ps = set() # visited domains from firefox profile
96 | rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
97 | for host, in rows:
98 | try:
99 | profile_ps.add(psl.get_public_suffix(urlparse(host).hostname))
100 | except AttributeError:
101 | pass
102 |
103 | # We expect urls to be in the Firefox profile if:
104 | # 1. We've made requests to it
105 | # 2. The url is a top_url we entered into the address bar
106 | # 3. The url successfully loaded (see: Issue #40)
107 | # 4. The site does not respond to the initial request with a 204 (won't show in FF DB)
108 | missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
109 | unexpected_missing_urls = set()
110 | for url in missing_urls:
111 | if successes[url] == 0 or successes[url] == -1:
112 | continue
113 |
114 | # Get the visit id for the url
115 | rows = db_utils.query_db(crawl_db,
116 | "SELECT visit_id FROM site_visits "
117 | "WHERE site_url = ?",
118 | ('http://' + url,))
119 |
120 | try:
121 | visit_id = rows[0][0]
122 | except Exception:
123 | visit_id = rows[0]
124 |
125 | rows = db_utils.query_db(crawl_db,
126 | "SELECT COUNT(*) FROM http_responses "
127 | "WHERE visit_id = ?",
128 | (visit_id,))
129 | if rows[0] > 1:
130 | continue
131 |
132 | rows = db_utils.query_db(crawl_db,
133 | "SELECT response_status, location FROM "
134 | "http_responses WHERE visit_id = ?",
135 | (visit_id,))
136 | response_status, location = rows[0]
137 | if response_status == 204:
138 | continue
139 | if location == 'http://': # site returned a blank redirect
140 | continue
141 | unexpected_missing_urls.add(url)
142 |
143 | assert len(unexpected_missing_urls) == 0
144 |
--------------------------------------------------------------------------------
/test/test_custom_function_command.py:
--------------------------------------------------------------------------------
1 | import expected
2 | import utilities
3 | from ..automation import CommandSequence
4 | from ..automation import TaskManager
5 | from ..automation.utilities import db_utils
6 | from openwpmtest import OpenWPMTest
7 |
8 | url_a = utilities.BASE_TEST_URL + '/simple_a.html'
9 |
10 |
11 | class TestCustomFunctionCommand(OpenWPMTest):
12 | """Test `custom_function` command's ability to handle various inline functions"""
13 |
14 | def get_config(self, data_dir=""):
15 | return self.get_test_config(data_dir)
16 |
17 | def test_custom_function(self):
18 | """ Test `custom_function` with an inline function that collects links """
19 |
20 | from ..automation.SocketInterface import clientsocket
21 | def collect_links(table_name, scheme, **kwargs):
22 | """ Collect links with matching `scheme` and save in table `table_name` """
23 | driver = kwargs['driver']
24 | manager_params = kwargs['manager_params']
25 | link_elements = driver.find_elements_by_tag_name('a')
26 | link_urls = [element.get_attribute("href") for element in link_elements]
27 | link_urls = filter(lambda x: x.startswith(scheme+'://'), link_urls)
28 | current_url = driver.current_url
29 |
30 | sock = clientsocket()
31 | sock.connect(*manager_params['aggregator_address'])
32 |
33 | query = ("CREATE TABLE IF NOT EXISTS %s ("
34 | "top_url TEXT, link TEXT);" % table_name)
35 | sock.send((query, ()))
36 |
37 | for link in link_urls:
38 | query = ("INSERT INTO %s (top_url, link) "
39 | "VALUES (?, ?)" % table_name)
40 | sock.send((query, (current_url, link)))
41 | sock.close()
42 |
43 | manager_params, browser_params = self.get_config()
44 | manager = TaskManager.TaskManager(manager_params, browser_params)
45 | cs = CommandSequence.CommandSequence(url_a)
46 | cs.get(sleep=0, timeout=60)
47 | cs.run_custom_function(collect_links, ('page_links', 'http'))
48 | manager.execute_command_sequence(cs)
49 | manager.close()
50 | query_result = db_utils.query_db(manager_params['db'],
51 | "SELECT top_url, link FROM page_links;")
52 | assert expected.page_links == set(query_result)
53 |
--------------------------------------------------------------------------------
/test/test_env.py:
--------------------------------------------------------------------------------
1 | import re
2 | from os.path import realpath, dirname, join, isfile, isdir
3 | from openwpmtest import OpenWPMTest
4 |
5 |
6 | class TestDependencies(OpenWPMTest):
7 |
8 | BASE_DIR = dirname(dirname(realpath(__file__)))
9 |
10 | def test_dependencies(self):
11 | self.assert_is_installed("npm")
12 | self.assert_is_installed("jpm")
13 | self.assert_is_installed('mitmdump')
14 | self.assert_is_installed('firefox')
15 | ff_bin_dir = join(self.BASE_DIR, "firefox-bin")
16 | assert isdir(ff_bin_dir)
17 | ff_binary = join(ff_bin_dir, "firefox")
18 | assert isfile(ff_binary)
19 |
20 | def test_py_pkgs(self):
21 | PY_REQUIREMENTS_TXT = join(self.BASE_DIR, "requirements.txt")
22 | assert isfile(PY_REQUIREMENTS_TXT)
23 | for line in open(PY_REQUIREMENTS_TXT):
24 | if line.startswith("#"):
25 | continue
26 | pkg = re.split(r'[>=<]', line.strip())[0]
27 | print "Checking Python package", pkg
28 | self.assert_py_pkg_installed(pkg)
29 |
--------------------------------------------------------------------------------
/test/test_js_instrument.py:
--------------------------------------------------------------------------------
1 | from openwpmtest import OpenWPMTest
2 | from ..automation.utilities import db_utils
3 |
4 | GETS_AND_SETS = {
5 | ("window.test.prop1", "get", "prop1"),
6 | ("window.test.prop1", "set", "blah1"),
7 | ("window.test.prop1", "get", "blah1"),
8 | ("window.test.prop2", "get", "prop2"),
9 | ("window.test.prop2", "set", "blah2"),
10 | ("window.test.prop2", "get", "blah2"),
11 | ("window.test.objProp", "get", "{\"hello\":\"world\"}"),
12 | ("window.test.objProp", "set", "{\"key\":\"value\"}"),
13 | ("window.test.objProp", "get", "{\"key\":\"value\"}"),
14 | ("window.test.prop3", "get", "default-value"),
15 | ("window.test.prop3", "set", "blah3"),
16 | ("window.test.prop3", "get", "blah3"),
17 | ('window.test.method1', 'set', 'FUNCTION'),
18 | ('window.test.method1', 'set', 'now static'),
19 | ('window.test.method1', 'get', 'now static'),
20 | ('window.test.prop1', 'set', 'FUNCTION'),
21 | ('window.test.nestedObj', 'get',
22 | '{"prop1":"default1","prop2":"default2","method1":"FUNCTION"}')
23 | }
24 |
25 | METHOD_CALLS = {
26 | ('window.test.prop1', 'call', '{"0":"now accepting arugments"}'),
27 | ('window.test.method1', 'call', '{"0":"hello","1":"{\\"world\\":true}"}'),
28 | ('window.test.method1', 'call', '{"0":"new argument"}')
29 | }
30 |
31 | RECURSIVE_GETS_AND_SETS = {
32 | ("window.test2.nestedObj.prop1", "get", "default1"),
33 | ("window.test2.nestedObj.prop1", "set", "updatedprop1"),
34 | ("window.test2.nestedObj.prop1", "get", "updatedprop1"),
35 | ("window.test2.nestedObj.prop2", "get", "default2"),
36 | ("window.test2.nestedObj.method1", "set", "FUNCTION"),
37 | ("window.test2.nestedObj.doubleNested.prop1", "get", "double default"),
38 | ("window.test2.nestedObj.doubleNested.prop1", "set", "doubleprop1"),
39 | ("window.test2.nestedObj.doubleNested.prop1", "get", "doubleprop1"),
40 | ("window.test2.nestedObj.doubleNested.method1", "set", "FUNCTION")
41 | }
42 |
43 | RECURSIVE_METHOD_CALLS = {
44 | ('window.test2.nestedObj.method1', 'call', '{"0":"arg-before"}'),
45 | ('window.test2.nestedObj.method1', 'call', '{"0":"arg-after"}'),
46 | ('window.test2.nestedObj.doubleNested.method1', 'call', '{"0":"blah"}')
47 | }
48 |
49 | RECURSIVE_PROP_SET = {
50 | ('window.test2.l1.l2.l3.l4.l5.prop', 'get', 'level5prop'),
51 | ('window.test2.l1.l2.l3.l4.l5.l6', 'get', '{"prop":"level6prop"}')
52 | }
53 |
54 | SET_PREVENT_CALLS = {
55 | (u'window.test3.method1', u'call', None),
56 | ('window.test3.obj1.method2', 'call', None)
57 | }
58 |
59 | SET_PREVENT_GETS_AND_SETS = {
60 | (u'window.test3.prop1', u'set', u'newprop1'),
61 | ('window.test3.method1', 'set(prevented)', 'FUNCTION'),
62 | ('window.test3.obj1', 'set(prevented)', '{"new":"object"}'),
63 | (u'window.test3.obj1.prop2', u'set', u'newprop2'),
64 | ('window.test3.obj1.method2', 'set(prevented)', 'FUNCTION'),
65 | ('window.test3.obj1.obj2', 'set(prevented)', '{"new":"object2"}'),
66 | (u'window.test3.prop1', u'get', u'newprop1'),
67 | ('window.test3.obj1.obj2', 'get', '{"testobj":"nested"}'),
68 | ('window.test3.obj1.prop2', 'get', 'newprop2'),
69 | }
70 |
71 |
72 | class TestJSInstrument(OpenWPMTest):
73 |
74 | def get_config(self, data_dir=""):
75 | manager_params, browser_params = self.get_test_config(data_dir)
76 | browser_params[0]['js_instrument'] = True
77 | manager_params['testing'] = True
78 | return manager_params, browser_params
79 |
80 | def test_instrument_object(self):
81 | """ Ensure instrumentObject logs all property gets, sets, and calls """
82 | db = self.visit('/instrument_object.html')
83 | rows = db_utils.get_javascript_entries(db)
84 |
85 | # Check calls of non-recursive instrumentation
86 | observed_gets_and_sets = set()
87 | observed_calls = set()
88 | for script_url, symbol, operation, value, arguments in rows:
89 | if not symbol.startswith('window.test.'):
90 | continue
91 | if operation == 'get' or operation == 'set':
92 | observed_gets_and_sets.add((symbol, operation, value))
93 | else:
94 | observed_calls.add((symbol, operation, arguments))
95 | assert observed_calls == METHOD_CALLS
96 | assert observed_gets_and_sets == GETS_AND_SETS
97 |
98 | # Check calls of recursive instrumentation
99 | observed_gets_and_sets = set()
100 | observed_calls = set()
101 | for script_url, symbol, operation, value, arguments in rows:
102 | if not symbol.startswith('window.test2.nestedObj'):
103 | continue
104 | if operation == 'get' or operation == 'set':
105 | observed_gets_and_sets.add((symbol, operation, value))
106 | else:
107 | observed_calls.add((symbol, operation, arguments))
108 | assert observed_calls == RECURSIVE_METHOD_CALLS
109 | assert observed_gets_and_sets == RECURSIVE_GETS_AND_SETS
110 |
111 | # Check that calls not present after default recursion limit (5)
112 | # We should only see the window.test2.l1.l2.l3.l4.l5.prop access
113 | # and not window.test2.l1.l2.l3.l4.l5.l6.prop access.
114 | prop_access = set()
115 | for script_url, symbol, operation, value, arguments in rows:
116 | if not symbol.startswith('window.test2.l1'):
117 | continue
118 | prop_access.add((symbol, operation, value))
119 | assert prop_access == RECURSIVE_PROP_SET
120 |
121 | # Check calls of object with sets prevented
122 | observed_gets_and_sets = set()
123 | observed_calls = set()
124 | for script_url, symbol, operation, value, arguments in rows:
125 | if not symbol.startswith('window.test3'):
126 | continue
127 | if operation == 'call':
128 | observed_calls.add((symbol, operation, arguments))
129 | else:
130 | observed_gets_and_sets.add((symbol, operation, value))
131 | assert observed_calls == SET_PREVENT_CALLS
132 | assert observed_gets_and_sets == SET_PREVENT_GETS_AND_SETS
133 |
--------------------------------------------------------------------------------
/test/test_pages/abp/adblock_plus_test.html:
--------------------------------------------------------------------------------
1 |
2 | AdBlock Plus Test Page
3 | If functioning correctly, we expect AdBlock Plus to prevent the included
4 | requests from succeeding.
5 |
6 |
7 | Script 1:
8 | Script 2:
9 | Script 3:
10 |
11 |
12 |
13 |
14 |
15 |
16 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/test/test_pages/abp/adspot/1.js:
--------------------------------------------------------------------------------
1 | var include_1 = true;
2 |
--------------------------------------------------------------------------------
/test/test_pages/abp/adsystem/3.js:
--------------------------------------------------------------------------------
1 | var include_3 = true;
2 |
--------------------------------------------------------------------------------
/test/test_pages/abp/bannerads/2.js:
--------------------------------------------------------------------------------
1 | var include_2 = true;
2 |
--------------------------------------------------------------------------------
/test/test_pages/audio_fingerprinting.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | AudioContext Fingerprint Test Page
6 | AudioContext Fingerprint Test Page
7 |
8 |
9 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/test/test_pages/battery_fingerprinting.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | AudioContext Fingerprint Test Page
4 |
5 |
6 |
7 | Battery Fingerprinting Test Page
8 | Charging?
9 | Charging Level:
10 | Charging Time:
11 | Discharging Time:
12 |
13 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/test/test_pages/canvas_fingerprinting.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Canvas Fingerprinting Test Page
4 | Canvas Fingerprinting Test Page
5 |
6 |
7 |
8 |
9 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/test/test_pages/expected_source.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Simple Page A
4 |
10 |
11 |
12 | Click me!
13 | Click me also!
14 | Click me for a JS alert!
15 | Go to google.com
16 | Go to example.com
17 |
18 |
19 |
--------------------------------------------------------------------------------
/test/test_pages/http_stacktrace.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | async load by script
5 |
25 |
26 |
27 | The scripts on this page inject an image, an invisible pixel and a script.
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/test/test_pages/http_test_page.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Simple HTTP Test page
5 |
6 |
7 |
8 |
9 | This test page sources an image, script, and css and favicon resource.
10 | It also includes an iframe which contains the same image and a different
11 | script.
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/test/test_pages/http_test_page_2.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Simple HTTP Test page 2
5 |
6 |
7 | This test page sources two different images and a script.
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/test/test_pages/js_call_stack.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Simple Page A
5 |
12 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/test/test_pages/js_cookie.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | localStorage example
5 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/test/test_pages/lso/FlashCookie.swf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/test_pages/lso/FlashCookie.swf
--------------------------------------------------------------------------------
/test/test_pages/lso/flash-cookie.js:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/test_pages/lso/flash-cookie.js
--------------------------------------------------------------------------------
/test/test_pages/lso/setlso.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Flash cookie example
5 |
6 |
7 |
8 |
9 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/test/test_pages/post_file_upload.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | File upload form test page
6 |
12 |
13 |
14 | Submit a form with the given encoding type in the URL params.
15 |
16 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/test/test_pages/post_request.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | POST request test page
6 |
19 |
20 |
21 | Submit a form with the given encoding type in the URL params.
22 |
23 |
31 | Automated tests uses the following URLs to submit data in different encodings
32 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/test/test_pages/post_request_ajax.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Test page - POST request using AJAX
5 |
8 |
9 |
51 |
52 |
53 | Automated tests uses the following URLs to submit data in different formats using AJAX POST
54 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/test/test_pages/property_enumeration.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Property Enumeration Test Page
4 | Property Enumeration Test Page
5 |
6 |
7 |
8 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/test/test_pages/sensor_value_test.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Sensor Value Ranges
6 |
7 | Ambient light
8 |
9 |
10 |
11 | Proximity
12 |
13 |
14 |
15 |
16 | Battery
17 | Is battery in charge? unavailable
18 | Battery will be charged in unavailable seconds
19 | Battery will be discharged in unavailable seconds
20 | Current battery level: unavailable
21 |
22 | Orientation
23 |
24 |
25 |
26 |
27 | Accelerometer
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 | Gyro
37 |
38 |
39 |
40 |
41 |
42 |
43 |
138 |
139 |
140 |
--------------------------------------------------------------------------------
/test/test_pages/sensors.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | localStorage example
5 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/test/test_pages/shared/test_favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/test_pages/shared/test_favicon.ico
--------------------------------------------------------------------------------
/test/test_pages/shared/test_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/test_pages/shared/test_image.png
--------------------------------------------------------------------------------
/test/test_pages/shared/test_image_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sensor-js/OpenWPM-mobile/1834f4c5b8fd0d3976b2e57f5310fd72860a681f/test/test_pages/shared/test_image_2.png
--------------------------------------------------------------------------------
/test/test_pages/shared/test_script.js:
--------------------------------------------------------------------------------
1 | //A simple script
2 | window.test_script_loaded = true;
3 | console.log("test script loaded");
4 |
--------------------------------------------------------------------------------
/test/test_pages/shared/test_script_2.js:
--------------------------------------------------------------------------------
1 | // A second simple script
2 | window.test_script_2_loaded = true;
3 | console.log("test script 2 loaded");
4 |
5 | var test = 1;
6 |
7 | function test_function() {
8 | test = test + 1;
9 | console.log(test);
10 | }
11 |
12 | test_function();
13 | test_function();
14 | test_function();
15 |
--------------------------------------------------------------------------------
/test/test_pages/shared/test_style.css:
--------------------------------------------------------------------------------
1 | p {
2 | color: red;
3 | }
4 |
--------------------------------------------------------------------------------
/test/test_pages/shared/utils.js:
--------------------------------------------------------------------------------
1 | function getParameterByName(name) {
2 | // http://stackoverflow.com/a/901144
3 | name = name.replace(/[\[]/, "\\[").replace(/[\]]/, "\\]");
4 | var regex = new RegExp("[\\?&]" + name + "=([^]*)"),
5 | results = regex.exec(location.search);
6 | return results === null ? "" : decodeURIComponent(results[1].replace(/\+/g, " "));
7 | }
8 |
--------------------------------------------------------------------------------
/test/test_pages/simple_a.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Simple Page A
5 |
11 |
12 |
13 | Click me!
14 | Click me also!
15 | Click me for a JS alert!
16 | Go to google.com
17 | Go to example.com
18 |
19 |
--------------------------------------------------------------------------------
/test/test_pages/simple_b.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Simple Page B
5 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/test/test_pages/simple_c.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Simple Page C
5 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/test/test_pages/simple_d.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Simple Page D
5 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/test/test_pages/stack.js:
--------------------------------------------------------------------------------
1 | // A function in an external script
2 | function js_check_navigator() {
3 | console.log(window.navigator.userAgent);
4 | var foo = eval("window.navigator.platform");
5 | }
6 |
7 | // call the above function
8 | js_check_navigator();
9 |
10 | // use eval
11 | var bar = eval("window.navigator.buildID");
12 |
13 | //use Function
14 | new Function("window.navigator.appVersion")();
15 |
--------------------------------------------------------------------------------
/test/test_pages/webrtc_localip.html:
--------------------------------------------------------------------------------
1 |
2 | WebRTC Local IP Test Page
3 | WebRTC Local IP Test Page
4 |
5 |
6 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/test/test_profile.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from os.path import join, isfile
3 | from ..automation import TaskManager
4 | from ..automation.Errors import CommandExecutionError, ProfileLoadError
5 | from openwpmtest import OpenWPMTest
6 |
7 |
8 | # TODO update these tests to make use of blocking commands
9 | class TestProfile(OpenWPMTest):
10 |
11 | def get_config(self, data_dir=""):
12 | manager_params, browser_params = self.get_test_config(data_dir)
13 | browser_params[0]['profile_archive_dir'] =\
14 | join(manager_params['data_directory'], 'browser_profile')
15 | return manager_params, browser_params
16 |
17 | def test_saving(self):
18 | manager_params, browser_params = self.get_config()
19 | manager = TaskManager.TaskManager(manager_params, browser_params)
20 | manager.get('http://example.com')
21 | manager.close()
22 | assert isfile(join(browser_params[0]['profile_archive_dir'],'profile.tar.gz'))
23 |
24 | def test_crash(self):
25 | manager_params, browser_params = self.get_config()
26 | manager_params['failure_limit'] = 0
27 | manager = TaskManager.TaskManager(manager_params, browser_params)
28 | with pytest.raises(CommandExecutionError):
29 | manager.get('http://example.com') # So we have a profile
30 | manager.get('example.com') # Selenium requires scheme prefix
31 | manager.get('example.com') # Requires two commands to shut down
32 |
33 | def test_crash_profile(self):
34 | manager_params, browser_params = self.get_config()
35 | manager_params['failure_limit'] = 2
36 | manager = TaskManager.TaskManager(manager_params, browser_params)
37 | try:
38 | manager.get('http://example.com') # So we have a profile
39 | manager.get('example.com') # Selenium requires scheme prefix
40 | manager.get('example.com') # Selenium requires scheme prefix
41 | manager.get('example.com') # Selenium requires scheme prefix
42 | manager.get('example.com') # Requires two commands to shut down
43 | except CommandExecutionError:
44 | pass
45 | assert isfile(join(browser_params[0]['profile_archive_dir'],'profile.tar.gz'))
46 |
47 | def test_profile_error(self):
48 | manager_params, browser_params = self.get_config()
49 | browser_params[0]['profile_tar'] = '/tmp/NOTREAL'
50 | with pytest.raises(ProfileLoadError):
51 | TaskManager.TaskManager(manager_params, browser_params) # noqa
52 |
53 | def test_profile_saved_when_launch_crashes(self):
54 | manager_params, browser_params = self.get_config()
55 | browser_params[0]['proxy'] = True
56 | browser_params[0]['save_javascript'] = True
57 | manager = TaskManager.TaskManager(manager_params, browser_params)
58 | manager.get('http://example.com')
59 |
60 | # Kill the LevelDBAggregator
61 | # This will cause the proxy launch to crash
62 | manager.ldb_status_queue.put("DIE")
63 | manager.browsers[0]._SPAWN_TIMEOUT = 2 # Have timeout occur quickly
64 | manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2 # Have timeout occur quickly
65 | manager.get('example.com') # Cause a selenium crash to force browser to restart
66 |
67 | # The browser will fail to launch due to the proxy crashes
68 | try:
69 | manager.get('http://example.com')
70 | except CommandExecutionError:
71 | pass
72 | manager.close()
73 | assert isfile(join(browser_params[0]['profile_archive_dir'],'profile.tar.gz'))
74 |
75 | #TODO Check for Flash
76 | #TODO Check contents of profile (tests should fail anyway if profile doesn't contain everything)
77 |
--------------------------------------------------------------------------------
/test/test_sensors.py:
--------------------------------------------------------------------------------
1 | import utilities
2 | from openwpmtest import OpenWPMTest
3 | from ..automation import TaskManager
4 | from ..automation.utilities import db_utils
5 | import json
6 | # TODO: add test for setter instrumentation
7 |
8 |
9 | class TestExtension(OpenWPMTest):
10 | NUM_BROWSERS = 1
11 |
12 | def get_config(self, data_dir=""):
13 | manager_params, browser_params = self.get_test_config(data_dir)
14 | browser_params[0]['js_instrument'] = True
15 | return manager_params, browser_params
16 |
17 | def test_sensor_probing(self, tmpdir):
18 | test_url = utilities.BASE_TEST_URL + '/sensors.html'
19 | db = self.visit(test_url, str(tmpdir))
20 | rows = db_utils.get_javascript_entries(db, all_columns=True)
21 | observed_sensor_apis = set()
22 | expected_apis = set(['deviceorientation', 'devicemotion',
23 | 'deviceproximity', 'devicelight'])
24 | for row in rows:
25 | if row[9] == "window.addEventListener":
26 | observed_sensor_apis.add(json.loads(row[12])["0"])
27 | assert row[3] == test_url
28 | assert observed_sensor_apis == expected_apis
29 |
--------------------------------------------------------------------------------
/test/test_storage_vectors.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import utilities
3 | from ..automation import CommandSequence
4 | from ..automation import TaskManager
5 | from ..automation.utilities import db_utils
6 | from openwpmtest import OpenWPMTest
7 |
8 | expected_lso_content_a = [
9 | 1, # visit id
10 | u'localtest.me',
11 | u'FlashCookie.sol',
12 | u'localtest.me/FlashCookie.sol',
13 | u'test_key',
14 | u'REPLACEME']
15 |
16 | expected_lso_content_b = [
17 | 2, # visit id
18 | u'localtest.me',
19 | u'FlashCookie.sol',
20 | u'localtest.me/FlashCookie.sol',
21 | u'test_key',
22 | u'REPLACEME']
23 |
24 | expected_js_cookie = (
25 | 1, # visit id
26 | u'%s' % utilities.BASE_TEST_URL_DOMAIN,
27 | u'test_cookie',
28 | u'Test-0123456789',
29 | u'%s' % utilities.BASE_TEST_URL_DOMAIN,
30 | u'/')
31 |
32 |
33 | class TestStorageVectors(OpenWPMTest):
34 | """ Runs some basic tests to check that the saving of
35 | storage vectors (i.e. Flash LSOs, profile cookies) works.
36 |
37 | NOTE: These tests are very basic and should be expanded
38 | on to check for completeness and correctness.
39 | """
40 |
41 | def get_config(self, data_dir=""):
42 | return self.get_test_config(data_dir)
43 |
44 | @pytest.mark.skip("Flash is not used for mobile crawls")
45 | def test_flash_cookies(self):
46 | """ Check that some Flash LSOs are saved and
47 | are properly keyed in db."""
48 | # Run the test crawl
49 | manager_params, browser_params = self.get_config()
50 | browser_params[0]['disable_flash'] = False
51 | manager = TaskManager.TaskManager(manager_params, browser_params)
52 |
53 | # Get a site we know sets Flash cookies and visit it twice
54 | lso_value_a = utilities.rand_str(8)
55 | expected_lso_content_a[5] = lso_value_a # we'll expect this to be present
56 | qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
57 | lso_value_a)
58 | test_url_a = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
59 | cs = CommandSequence.CommandSequence(test_url_a)
60 | cs.get(sleep=3, timeout=120)
61 | cs.dump_flash_cookies()
62 | manager.execute_command_sequence(cs)
63 |
64 | lso_value_b = utilities.rand_str(8)
65 | expected_lso_content_b[5] = lso_value_b # we'll expect this to be present
66 | qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
67 | lso_value_b)
68 | test_url_b = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
69 | cs = CommandSequence.CommandSequence(test_url_b)
70 | cs.get(sleep=3, timeout=120)
71 | cs.dump_flash_cookies()
72 | manager.execute_command_sequence(cs)
73 |
74 | manager.close()
75 |
76 | # Check that some flash cookies are recorded
77 | qry_res = db_utils.query_db(manager_params['db'],
78 | "SELECT * FROM flash_cookies")
79 | lso_count = len(qry_res)
80 | assert lso_count == 2
81 | lso_content_a = list(qry_res[0][2:]) # Remove first two items
82 | lso_content_b = list(qry_res[1][2:]) # Remove first two items
83 | # remove randomly generated LSO directory name
84 | # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
85 | lso_content_a[3] = lso_content_a[3].split("/", 1)[-1] # remove LSO dirname
86 | lso_content_b[3] = lso_content_b[3].split("/", 1)[-1] # remove LSO dirname
87 | assert lso_content_a == expected_lso_content_a
88 | assert lso_content_b == expected_lso_content_b
89 |
90 | def test_profile_cookies(self):
91 | """ Check that some profile cookies are saved """
92 | # Run the test crawl
93 | manager_params, browser_params = self.get_config()
94 | manager = TaskManager.TaskManager(manager_params, browser_params)
95 | # TODO update this to local test site
96 | url = 'http://www.yahoo.com'
97 | cs = CommandSequence.CommandSequence(url)
98 | cs.get(sleep=3, timeout=120)
99 | cs.dump_profile_cookies()
100 | manager.execute_command_sequence(cs)
101 | manager.close()
102 |
103 | # Check that some flash cookies are recorded
104 | qry_res = db_utils.query_db(manager_params['db'],
105 | "SELECT COUNT(*) FROM profile_cookies")
106 | prof_cookie_count = qry_res[0]
107 | assert prof_cookie_count > 0
108 |
109 | def test_js_profile_cookies(self):
110 | """ Check that profile cookies set by JS are saved """
111 | # Run the test crawl
112 | manager_params, browser_params = self.get_config()
113 | manager = TaskManager.TaskManager(manager_params, browser_params)
114 | url = utilities.BASE_TEST_URL + "/js_cookie.html"
115 | cs = CommandSequence.CommandSequence(url)
116 | cs.get(sleep=3, timeout=120)
117 | cs.dump_profile_cookies()
118 | manager.execute_command_sequence(cs)
119 | manager.close()
120 | # Check that the JS cookie we stored is recorded
121 | qry_res = db_utils.query_db(manager_params['db'], "SELECT * FROM profile_cookies")
122 | assert len(qry_res) == 1 # we store only one cookie
123 | cookies = qry_res[0] # take the first cookie
124 | # compare URL, domain, name, value, origin, path
125 | assert cookies[2:8] == expected_js_cookie
126 |
--------------------------------------------------------------------------------
/test/test_trigger_sensor_events.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import utilities
3 | from openwpmtest import OpenWPMTest
4 | from ..automation import TaskManager
5 | from ..automation import CommandSequence
6 | from ..automation.utilities import db_utils
7 |
8 |
9 | class TestTriggerSensorEvents(OpenWPMTest):
10 | """Make sure that we trigger fake sensor events."""
11 |
12 | def get_config(self, data_dir=""):
13 | return self.get_test_config(data_dir)
14 |
15 | def test_trigger_sensor_events(self):
16 | manager_params, browser_params = self.get_config()
17 | browser_params[0]['trigger_sensor_events'] = True
18 | manager = TaskManager.TaskManager(manager_params, browser_params)
19 | test_url = utilities.BASE_TEST_URL + '/sensor_value_test.html'
20 |
21 | def get_text_from_el(driver, element_id):
22 | js_str = 'return document.getElementById("%s").innerHTML' %\
23 | element_id
24 | return driver.execute_script(js_str)
25 |
26 | def check_trigger_sensor_events(**kwargs):
27 | """Check if we find the sensor values on the page"""
28 | driver = kwargs['driver']
29 |
30 | device_light_str = get_text_from_el(driver, "DeviceLight")
31 | assert "AmbientLight current Value: " in device_light_str
32 | assert "Max:" in device_light_str
33 | assert "Min:" in device_light_str
34 |
35 | device_proximity_str = get_text_from_el(driver, "DeviceProximity")
36 | assert "DeviceProximity current Value: " in device_proximity_str
37 | assert "Max:" in device_proximity_str
38 | assert "Min:" in device_proximity_str
39 |
40 | user_proximity_str = get_text_from_el(driver, "UserProximity")
41 | assert user_proximity_str == "UserProximity: true"
42 |
43 | batt_in_charge_str = get_text_from_el(driver, "in-charge")
44 | assert batt_in_charge_str != "unavailable"
45 |
46 | batt_charging_time_str = get_text_from_el(driver, "charging-time")
47 | assert batt_charging_time_str != "unavailable"
48 |
49 | batt_discharging_time_str = get_text_from_el(driver,
50 | "discharging-time")
51 | assert batt_discharging_time_str != "unavailable"
52 |
53 | batt_level_str = get_text_from_el(driver, "battery-level")
54 | assert batt_level_str != "unavailable"
55 |
56 | assert "Z-axis: " in get_text_from_el(driver, "Orientation_a")
57 | assert "X-axis: " in get_text_from_el(driver, "Orientation_b")
58 | assert "Y-axis: " in get_text_from_el(driver, "Orientation_g")
59 |
60 | assert "AccelerometerIncludingGravity X-axis:" in\
61 | get_text_from_el(driver, "Accelerometer_gx")
62 | assert "AccelerometerIncludingGravity Y-axis:" in\
63 | get_text_from_el(driver, "Accelerometer_gy")
64 | assert "AccelerometerIncludingGravity Z-axis:" in\
65 | get_text_from_el(driver, "Accelerometer_gz")
66 |
67 | assert "Accelerometer X-axis: " in\
68 | get_text_from_el(driver, "Accelerometer_x")
69 | assert "Accelerometer Y-axis: " in\
70 | get_text_from_el(driver, "Accelerometer_y")
71 | assert "Accelerometer Z-axis: " in\
72 | get_text_from_el(driver, "Accelerometer_z")
73 | assert "Data Interval: " in\
74 | get_text_from_el(driver, "Accelerometer_i")
75 |
76 | assert "Gyro X-axis: " in\
77 | get_text_from_el(driver, "Gyro_x")
78 | assert "Gyro Y-axis: " in\
79 | get_text_from_el(driver, "Gyro_y")
80 | assert "Gyro Z-axis: " in\
81 | get_text_from_el(driver, "Gyro_z")
82 |
83 | cs = CommandSequence.CommandSequence(test_url, blocking=True)
84 | cs.get(sleep=5, timeout=60)
85 | cs.run_custom_function(check_trigger_sensor_events)
86 | manager.execute_command_sequence(cs)
87 | manager.close()
88 | assert not db_utils.any_command_failed(manager_params['db'])
89 |
--------------------------------------------------------------------------------
/test/utilities.py:
--------------------------------------------------------------------------------
1 | import SimpleHTTPServer
2 | import SocketServer
3 | import threading
4 | import os
5 | from random import choice
6 | from os.path import realpath, dirname
7 | LOCAL_WEBSERVER_PORT = 8000
8 | BASE_TEST_URL_DOMAIN = "localtest.me"
9 | BASE_TEST_URL_NOPATH = "http://%s:%s" % (BASE_TEST_URL_DOMAIN,
10 | LOCAL_WEBSERVER_PORT)
11 | BASE_TEST_URL = "%s/test_pages" % BASE_TEST_URL_NOPATH
12 |
13 |
14 | class MyTCPServer(SocketServer.TCPServer):
15 | """Subclass TCPServer to be able to reuse the same port (Errno 98)."""
16 | allow_reuse_address = True
17 |
18 |
19 | def start_server():
20 | """ Start a simple HTTP server to run local tests.
21 |
22 | We need this since page-mod events in the extension
23 | don't fire on `file://*`. Instead, point test code to
24 | `http://localtest.me:8000/test_pages/...`
25 | """
26 | print "Starting HTTP Server in a separate thread"
27 | # switch to test dir, this is where the test files are
28 | os.chdir(dirname(realpath(__file__)))
29 | Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
30 | server = MyTCPServer(("localhost", LOCAL_WEBSERVER_PORT), Handler)
31 | thread = threading.Thread(target=server.serve_forever)
32 | thread.daemon = True
33 | thread.start()
34 | print "...serving at port", LOCAL_WEBSERVER_PORT
35 | return server, thread
36 |
37 |
38 | def rand_str(size=8):
39 | """Return random string with the given size."""
40 | RAND_CHARS = "abcdefghijklmnopqrstuvwxyz0123456789"
41 | return ''.join(choice(RAND_CHARS) for _ in range(size))
42 |
--------------------------------------------------------------------------------