├── .github └── workflows │ ├── publish.yml │ └── run-tests.yaml ├── .gitignore ├── .readthedocs.yaml ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── conf.py ├── index.md └── requirements.txt ├── pyproject.toml ├── schema.png ├── tests ├── Makefile ├── frontpages.warc.gz ├── google.warc ├── google.warc.gz ├── no-warc-info.warc ├── scoop.wacz └── test_warcdb.py └── warcdb ├── __init__.py └── migrations.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] 16 | steps: 17 | - uses: actions/checkout@v3 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | cache: pip 23 | cache-dependency-path: setup.py 24 | - name: Install dependencies 25 | run: | 26 | pip install '.[test]' 27 | - name: Run tests 28 | run: | 29 | pytest 30 | deploy: 31 | runs-on: ubuntu-latest 32 | needs: [test] 33 | steps: 34 | - uses: actions/checkout@v3 35 | - name: Set up Python 36 | uses: actions/setup-python@v4 37 | with: 38 | python-version: "3.11" 39 | cache: pip 40 | cache-dependency-path: setup.py 41 | - name: Install dependencies 42 | run: | 43 | pip install setuptools wheel twine build 44 | - name: Publish 45 | env: 46 | TWINE_USERNAME: __token__ 47 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 48 | run: | 49 | python -m build 50 | twine upload dist/* 51 | 52 | -------------------------------------------------------------------------------- /.github/workflows/run-tests.yaml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | on: 3 | push: 4 | pull_request: 5 | 6 | jobs: 7 | run-tests: 8 
| runs-on: ubuntu-latest 9 | steps: 10 | - name: Check out repository 11 | uses: actions/checkout@v3 12 | 13 | - name: Setup python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: "3.x" 17 | 18 | - name: Install test requirements 19 | run: | 20 | pip install poetry 21 | poetry install 22 | 23 | - name: Check formatting 24 | run: poetry run black --check . 25 | 26 | - name: Run tests 27 | run: | 28 | poetry run pytest 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | warcio_sqlite.egg-info/ 4 | .idea/ 5 | *.db 6 | poetry.lock 7 | __pycache__ 8 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | sphinx: 9 | configuration: docs/conf.py 10 | 11 | formats: 12 | - pdf 13 | - epub 14 | 15 | python: 16 | install: 17 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ### WarcDB v0.?.0 (Month DD, YYYY) ### 2 | 3 | * Add Black formatting ([#23](https://github.com/Florents-Tselai/WarcDB/pull/23)) 4 | * Add a view for HTTP headers ([#25](https://github.com/Florents-Tselai/WarcDB/pull/25)) 5 | * Model response status ([#26](https://github.com/Florents-Tselai/WarcDB/pull/26)) 6 | 7 | ### WarcDB v0.2.2 (October 21, 2023) ### 8 | 9 | Thanks to Ed Summers ([@edsu](https://github.com/edsu)) for his work on this 10 | release 11 | * Switch to `pyproject.toml` ([#11](https://github.com/Florents-Tselai/WarcDB/pull/11)) 12 | * Support for `WACZ` files ([#16](https://github.com/Florents-Tselai/WarcDB/pull/16)) 13 | * Support for schema 
versioning and migrations ([#13](https://github.com/Florents-Tselai/WarcDB/pull/13)) ([#20](https://github.com/Florents-Tselai/WarcDB/pull/20)) 14 | 15 | ### WarcDB v0.1.0 (June 19, 2022) ### 16 | 17 | * First version with `warcdb import` functionality -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Version 2.0, January 2004 2 | http://www.apache.org/licenses/ 3 | 4 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 5 | 6 | 1. Definitions. 7 | 8 | "License" shall mean the terms and conditions for use, reproduction, 9 | and distribution as defined by Sections 1 through 9 of this document. 10 | 11 | "Licensor" shall mean the copyright owner or entity authorized by 12 | the copyright owner that is granting the License. 13 | 14 | "Legal Entity" shall mean the union of the acting entity and all 15 | other entities that control, are controlled by, or are under common 16 | control with that entity. For the purposes of this definition, 17 | "control" means (i) the power, direct or indirect, to cause the 18 | direction or management of such entity, whether by contract or 19 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity 23 | exercising permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, 26 | including but not limited to software source code, documentation 27 | source, and configuration files. 28 | 29 | "Object" form shall mean any form resulting from mechanical 30 | transformation or translation of a Source form, including but 31 | not limited to compiled object code, generated documentation, 32 | and conversions to other media types. 
33 | 34 | "Work" shall mean the work of authorship, whether in Source or 35 | Object form, made available under the License, as indicated by a 36 | copyright notice that is included in or attached to the work 37 | (an example is provided in the Appendix below). 38 | 39 | "Derivative Works" shall mean any work, whether in Source or Object 40 | form, that is based on (or derived from) the Work and for which the 41 | editorial revisions, annotations, elaborations, or other modifications 42 | represent, as a whole, an original work of authorship. For the purposes 43 | of this License, Derivative Works shall not include works that remain 44 | separable from, or merely link (or bind by name) to the interfaces of, 45 | the Work and Derivative Works thereof. 46 | 47 | "Contribution" shall mean any work of authorship, including 48 | the original version of the Work and any modifications or additions 49 | to that Work or Derivative Works thereof, that is intentionally 50 | submitted to Licensor for inclusion in the Work by the copyright owner 51 | or by an individual or Legal Entity authorized to submit on behalf of 52 | the copyright owner. For the purposes of this definition, "submitted" 53 | means any form of electronic, verbal, or written communication sent 54 | to the Licensor or its representatives, including but not limited to 55 | communication on electronic mailing lists, source code control systems, 56 | and issue tracking systems that are managed by, or on behalf of, the 57 | Licensor for the purpose of discussing and improving the Work, but 58 | excluding communication that is conspicuously marked or otherwise 59 | designated in writing by the copyright owner as "Not a Contribution." 60 | 61 | "Contributor" shall mean Licensor and any individual or Legal Entity 62 | on behalf of whom a Contribution has been received by Licensor and 63 | subsequently incorporated within the Work. 64 | 65 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 66 | this License, each Contributor hereby grants to You a perpetual, 67 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 68 | copyright license to reproduce, prepare Derivative Works of, 69 | publicly display, publicly perform, sublicense, and distribute the 70 | Work and such Derivative Works in Source or Object form. 71 | 72 | 3. Grant of Patent License. Subject to the terms and conditions of 73 | this License, each Contributor hereby grants to You a perpetual, 74 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 75 | (except as stated in this section) patent license to make, have made, 76 | use, offer to sell, sell, import, and otherwise transfer the Work, 77 | where such license applies only to those patent claims licensable 78 | by such Contributor that are necessarily infringed by their 79 | Contribution(s) alone or by combination of their Contribution(s) 80 | with the Work to which such Contribution(s) was submitted. If You 81 | institute patent litigation against any entity (including a 82 | cross-claim or counterclaim in a lawsuit) alleging that the Work 83 | or a Contribution incorporated within the Work constitutes direct 84 | or contributory patent infringement, then any patent licenses 85 | granted to You under this License for that Work shall terminate 86 | as of the date such litigation is filed. 87 | 88 | 4. Redistribution. 
You may reproduce and distribute copies of the 89 | Work or Derivative Works thereof in any medium, with or without 90 | modifications, and in Source or Object form, provided that You 91 | meet the following conditions: 92 | 93 | (a) You must give any other recipients of the Work or 94 | Derivative Works a copy of this License; and 95 | 96 | (b) You must cause any modified files to carry prominent notices 97 | stating that You changed the files; and 98 | 99 | (c) You must retain, in the Source form of any Derivative Works 100 | that You distribute, all copyright, patent, trademark, and 101 | attribution notices from the Source form of the Work, 102 | excluding those notices that do not pertain to any part of 103 | the Derivative Works; and 104 | 105 | (d) If the Work includes a "NOTICE" text file as part of its 106 | distribution, then any Derivative Works that You distribute must 107 | include a readable copy of the attribution notices contained 108 | within such NOTICE file, excluding those notices that do not 109 | pertain to any part of the Derivative Works, in at least one 110 | of the following places: within a NOTICE text file distributed 111 | as part of the Derivative Works; within the Source form or 112 | documentation, if provided along with the Derivative Works; or, 113 | within a display generated by the Derivative Works, if and 114 | wherever such third-party notices normally appear. The contents 115 | of the NOTICE file are for informational purposes only and 116 | do not modify the License. You may add Your own attribution 117 | notices within Derivative Works that You distribute, alongside 118 | or as an addendum to the NOTICE text from the Work, provided 119 | that such additional attribution notices cannot be construed 120 | as modifying the License. 
121 | 122 | You may add Your own copyright statement to Your modifications and 123 | may provide additional or different license terms and conditions 124 | for use, reproduction, or distribution of Your modifications, or 125 | for any such Derivative Works as a whole, provided Your use, 126 | reproduction, and distribution of the Work otherwise complies with 127 | the conditions stated in this License. 128 | 129 | 5. Submission of Contributions. Unless You explicitly state otherwise, 130 | any Contribution intentionally submitted for inclusion in the Work 131 | by You to the Licensor shall be under the terms and conditions of 132 | this License, without any additional terms or conditions. 133 | Notwithstanding the above, nothing herein shall supersede or modify 134 | the terms of any separate license agreement you may have executed 135 | with Licensor regarding such Contributions. 136 | 137 | 6. Trademarks. This License does not grant permission to use the trade 138 | names, trademarks, service marks, or product names of the Licensor, 139 | except as required for reasonable and customary use in describing the 140 | origin of the Work and reproducing the content of the NOTICE file. 141 | 142 | 7. Disclaimer of Warranty. Unless required by applicable law or 143 | agreed to in writing, Licensor provides the Work (and each 144 | Contributor provides its Contributions) on an "AS IS" BASIS, 145 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 146 | implied, including, without limitation, any warranties or conditions 147 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 148 | PARTICULAR PURPOSE. You are solely responsible for determining the 149 | appropriateness of using or redistributing the Work and assume any 150 | risks associated with Your exercise of permissions under this License. 151 | 152 | 8. Limitation of Liability. 
In no event and under no legal theory, 153 | whether in tort (including negligence), contract, or otherwise, 154 | unless required by applicable law (such as deliberate and grossly 155 | negligent acts) or agreed to in writing, shall any Contributor be 156 | liable to You for damages, including any direct, indirect, special, 157 | incidental, or consequential damages of any character arising as a 158 | result of this License or out of the use or inability to use the 159 | Work (including but not limited to damages for loss of goodwill, 160 | work stoppage, computer failure or malfunction, or any and all 161 | other commercial damages or losses), even if such Contributor 162 | has been advised of the possibility of such damages. 163 | 164 | 9. Accepting Warranty or Additional Liability. While redistributing 165 | the Work or Derivative Works thereof, You may choose to offer, 166 | and charge a fee for, acceptance of support, warranty, indemnity, 167 | or other liability obligations and/or rights consistent with this 168 | License. However, in accepting such obligations, You may act only 169 | on Your own behalf and on Your sole responsibility, not on behalf 170 | of any other Contributor, and only if You agree to indemnify, 171 | defend, and hold each Contributor harmless for any liability 172 | incurred by, or claims asserted against, such Contributor by reason 173 | of your accepting any such warranty or additional liability. 174 | 175 | END OF TERMS AND CONDITIONS 176 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE NOTICE README.md schema.png CHANGELOG.md 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WarcDB: Web crawl data as SQLite databases. 
2 | 3 | [![PyPI](https://img.shields.io/pypi/v/warcdb.svg)](https://pypi.org/project/warcdb/) 4 | [![Tests](https://github.com/Florents-Tselai/WarcDB/actions/workflows/run-tests.yaml/badge.svg?branch=main)](https://github.com/Florents-Tselai/WarcDB/actions/workflows/run-tests.yaml) 5 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/Florents-Tselai/WarcDB/blob/main/LICENSE) 6 | ![GitHub Stars](https://img.shields.io/github/stars/Florents-Tselai/WarcDB) 7 | [![Linkedin](https://img.shields.io/badge/LinkedIn-0077B5?logo=linkedin&logoColor=white)](https://www.linkedin.com/in/florentstselai/) 8 | [![Github Sponsors](https://img.shields.io/static/v1?label=Sponsor&message=%E2%9D%A4&logo=GitHub&link=https://github.com/sponsors/Florents-Tselai/)](https://github.com/sponsors/Florents-Tselai/) 9 | 10 | 11 | `WarcDB` is an `SQLite`-based file format that makes web crawl data easier to share and query. 12 | 13 | It is based on the standardized [Web ARChive format](https://en.wikipedia.org/wiki/Web_ARChive), 14 | used by web archives, and defined in [ISO 28500:2017](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/). 15 | 16 | ## Usage 17 | 18 | ```shell 19 | pip install warcdb 20 | ``` 21 | 22 | ```shell 23 | # Load the `archive.warcdb` file with data. 24 | warcdb import archive.warcdb ./tests/google.warc ./tests/frontpages.warc.gz "https://tselai.com/data/google.warc" 25 | 26 | warcdb enable-fts ./archive.warcdb response payload 27 | 28 | # Search for records that mention "stocks" in their response body 29 | warcdb search ./archive.warcdb response "stocks" -c "WARC-Record-ID" 30 | ``` 31 | As you can see you can use any mix of local/remote and raw/compressed archives. 
32 | 33 | For example, to get a part of the [Common Crawl January 2022 Crawl Archive](https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/index.html) in a streaming fashion: 34 | 35 | ```shell 36 | warcdb import archive.warcdb "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/segments/1642320306346.64/warc/CC-MAIN-20220128212503-20220129002503-00719.warc.gz" 37 | ``` 38 | 39 | You can also import WARC files contained in [WACZ](https://specs.webrecorder.net/wacz/latest) files, which are created by tools like [ArchiveWeb.Page](https://archiveweb.page), [Browsertrix-Crawler](https://github.com/webrecorder/browsertrix-crawler), and [Scoop](https://github.com/harvard-lil/scoop). 40 | 41 | ```shell 42 | warcdb import archive.warcdb tests/scoop.wacz 43 | ``` 44 | 45 | ## How It Works 46 | 47 | Individual `.warc` files are read and parsed and their data is inserted into an SQLite database with the relational schema seen below. 48 | 49 | ## Schema 50 | 51 | If there is a new major or minor version of warcdb you may need to migrate existing databases to use the new database schema (if there have been any changes). To do this you first upgrade warcdb, and then import into the database, which will make sure all migrations have been run. If you want to migrate the database explicitly you can: 52 | 53 | ```shell 54 | warcdb migrate archive.warcdb 55 | ``` 56 | 57 | If there are no migrations to run the `migrate` command will do nothing. 58 | 59 | Here's the relational schema of the `.warcdb` file. 
60 | 61 | ![WarcDB Schema](schema.png) 62 | 63 | ### Views 64 | 65 | In addition to the core tables that map to the WARC record types there are also helper *views* that make it a bit easier to query data: 66 | 67 | #### v_request_http_header 68 | 69 | A view of HTTP headers in WARC request records: 70 | 71 | | Column Name | Column Type | Description | 72 | | -------------- | ----------- | ---------------------------------------------------------------------- | 73 | | warc_record_id | text | The WARC-Record-ID for the *request* record that it was extracted from. | 74 | | name | text | The lowercased HTTP header name (e.g. content-type) | 75 | | value | text | The HTTP header value (e.g. text/html) | 76 | 77 | #### v_response_http_header 78 | 79 | A view of HTTP headers in WARC response records: 80 | 81 | | Column Name | Column Type | Description | 82 | | -------------- | ----------- | ---------------------------------------------------------------------- | 83 | | warc_record_id | text | The WARC-Record-ID for the *response* record that it was extracted from. | 84 | | name | text | The lowercased HTTP header name (e.g. content-type) | 85 | | value | text | The HTTP header value (e.g. text/html) | 86 | 87 | ## Motivation 88 | 89 | From the `WARC` [formal specification](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/): 90 | 91 | > The WARC (Web ARChive) file format offers a convention for concatenating multiple resource records (data objects), 92 | > each consisting of a set of simple text headers and an arbitrary data block into one long file. 93 | 94 | Many organizations such as Commoncrawl, WebRecorder, Archive.org and libraries around the world, use the `warc` format 95 | to archive and store web data. 96 | 97 | The full datasets of these services are in the range of a few pebibytes (PiB), 98 | making them impractical to query using non-distributed systems. 
99 | 100 | This project aims to make **subsets** of such data easier to access and query using SQL. 101 | 102 | Currently, this is implemented on top of SQLite and is a wrapper around the 103 | excellent [SQLite-Utils](https://sqlite-utils.datasette.io/en/stable/) utility. 104 | 105 | `"wrapper"` means that all 106 | existing `sqlite-utils` [CLI commands](https://sqlite-utils.datasette.io/en/stable/cli-reference.html) 107 | can be called as expected like 108 | 109 | ```shell 110 | sqlite-utils archive.warcdb 111 | ``` 112 | or 113 | ```shell 114 | warcdb example.warcdb 115 | ``` 116 | 117 | ## Examples 118 | 119 | ### Populate with `wget` 120 | 121 | ```shell 122 | wget --warc-file tselai "https://tselai.com" 123 | 124 | warcdb import archive.warcdb tselai.warc.gz 125 | ``` 126 | 127 | ### Get all response headers 128 | 129 | ```shell 130 | sqlite3 archive.warcdb <=2.0 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "warcdb" 3 | version = "0.2.2" 4 | description = "WarcDB: Web crawl data as SQLite databases" 5 | authors = ["Florents Tselai "] 6 | readme = "README.md" 7 | license = "Apache License, Version 2.0" 8 | repository = "https://github.com/Florents-Tselai/warcdb" 9 | classifiers = [ 10 | "Intended Audience :: Developers", 11 | "Intended Audience :: Science/Research", 12 | ] 13 | 14 | [tool.poetry.scripts] 15 | warcdb = "warcdb:warcdb_cli" 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.9" 19 | sqlite-utils = "^3.26" 20 | warcio = "^1.7" 21 | click = "^8.1" 22 | more-itertools = "^10.1" 23 | tqdm = "^4.66" 24 | requests = "^2.31" 25 | sqlite-migrate = "0.1a2" 26 | 27 | [tool.poetry.group.test.dependencies] 28 | pytest = "^7.4" 29 | black = "^23.10" 30 | 31 | [tool.pytest.ini_options] 32 | testpaths = [ 33 | "tests" 34 | ] 35 | 36 | [build-system] 37 | requires = ["poetry-core"] 38 | build-backend 
= "poetry.core.masonry.api" 39 | -------------------------------------------------------------------------------- /schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Florents-Tselai/WarcDB/93df323255927f4fdf5d56edfb4fa788d50f2c0e/schema.png -------------------------------------------------------------------------------- /tests/Makefile: -------------------------------------------------------------------------------- 1 | frontpages: 2 | wget --warc-file frontpages "https://nytimes.com" "wsj.com" -------------------------------------------------------------------------------- /tests/frontpages.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Florents-Tselai/WarcDB/93df323255927f4fdf5d56edfb4fa788d50f2c0e/tests/frontpages.warc.gz -------------------------------------------------------------------------------- /tests/google.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | Content-Type: application/warc-fields 4 | WARC-Date: 2022-06-18T17:36:17Z 5 | WARC-Record-ID: 6 | WARC-Filename: google.warc.warc 7 | WARC-Block-Digest: sha1:LE3JTBJBI2JB55NHYJKVHUZXS2VVQXZB 8 | Content-Length: 257 9 | 10 | software: Wget/1.21.3 (darwin21.3.0) 11 | format: WARC File Format 1.0 12 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 13 | robots: classic 14 | wget-arguments: "--warc-file" "google.warc" "https://google.com" "--no-warc-compression" 15 | 16 | 17 | 18 | WARC/1.0 19 | WARC-Type: request 20 | WARC-Target-URI: 21 | Content-Type: application/http;msgtype=request 22 | WARC-Date: 2022-06-18T17:36:18Z 23 | WARC-Record-ID: 24 | WARC-IP-Address: 142.250.187.142 25 | WARC-Warcinfo-ID: 26 | WARC-Block-Digest: sha1:6QD37LJCQAUCIGVLHWBTO4UCYNS4UNNO 27 | Content-Length: 125 28 | 29 | GET / HTTP/1.1 30 | Host: google.com 31 | 
User-Agent: Wget/1.21.3 32 | Accept: */* 33 | Accept-Encoding: identity 34 | Connection: Keep-Alive 35 | 36 | 37 | 38 | WARC/1.0 39 | WARC-Type: response 40 | WARC-Record-ID: 41 | WARC-Warcinfo-ID: 42 | WARC-Concurrent-To: 43 | WARC-Target-URI: 44 | WARC-Date: 2022-06-18T17:36:18Z 45 | WARC-IP-Address: 142.250.187.142 46 | WARC-Block-Digest: sha1:BDI5FUGI5N7RDNELBXU3VJ7XI5IZBKNS 47 | WARC-Payload-Digest: sha1:WUUFJUPXTXS6X3V7AFQEI6QJY6UMFTPE 48 | Content-Type: application/http;msgtype=response 49 | Content-Length: 881 50 | 51 | HTTP/1.1 301 Moved Permanently 52 | Location: https://www.google.com/ 53 | Content-Type: text/html; charset=UTF-8 54 | Date: Sat, 18 Jun 2022 17:36:18 GMT 55 | Expires: Sat, 18 Jun 2022 17:36:18 GMT 56 | Cache-Control: private, max-age=2592000 57 | Server: gws 58 | Content-Length: 220 59 | X-XSS-Protection: 0 60 | X-Frame-Options: SAMEORIGIN 61 | Set-Cookie: CONSENT=PENDING+975; expires=Mon, 17-Jun-2024 17:36:18 GMT; path=/; domain=.google.com; Secure 62 | P3P: CP="This is not a P3P policy! See g.co/p3phelp for more info." 63 | Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43" 64 | 65 | 66 | 301 Moved 67 |

301 Moved

68 | The document has moved 69 | here. 70 | 71 | 72 | 73 | WARC/1.0 74 | WARC-Type: request 75 | WARC-Target-URI: 76 | Content-Type: application/http;msgtype=request 77 | WARC-Date: 2022-06-18T17:36:18Z 78 | WARC-Record-ID: 79 | WARC-IP-Address: 172.217.20.68 80 | WARC-Warcinfo-ID: 81 | WARC-Block-Digest: sha1:TDTBDPCZVGKAAVS36S6ZATZQCS2SMYPL 82 | Content-Length: 158 83 | 84 | GET / HTTP/1.1 85 | Host: www.google.com 86 | User-Agent: Wget/1.21.3 87 | Accept: */* 88 | Accept-Encoding: identity 89 | Connection: Keep-Alive 90 | Cookie: CONSENT=PENDING+975 91 | 92 | 93 | 94 | WARC/1.0 95 | WARC-Type: response 96 | WARC-Record-ID: 97 | WARC-Warcinfo-ID: 98 | WARC-Concurrent-To: 99 | WARC-Target-URI: 100 | WARC-Date: 2022-06-18T17:36:18Z 101 | WARC-IP-Address: 172.217.20.68 102 | WARC-Block-Digest: sha1:F3KE4IPCCIODCSTLV4ASBPAV4V5DSBBF 103 | WARC-Payload-Digest: sha1:Z7VI6NJ5HY5T5X43SLR3KX7BFRSOFZEU 104 | Content-Type: application/http;msgtype=response 105 | Content-Length: 1366 106 | 107 | HTTP/1.1 302 Found 108 | Location: https://consent.google.com/ml?continue=https://www.google.com/&gl=GR&m=0&pc=shp&uxe=eomcs4e&hl=el&src=1 109 | Cache-Control: private 110 | Content-Type: text/html; charset=UTF-8 111 | P3P: CP="This is not a P3P policy! See g.co/p3phelp for more info." 
112 | Date: Sat, 18 Jun 2022 17:36:18 GMT 113 | Server: gws 114 | Content-Length: 324 115 | X-XSS-Protection: 0 116 | X-Frame-Options: SAMEORIGIN 117 | Set-Cookie: AEC=AakniGN2LXYNvtskvUaFRazU7A4OSsrbGUxW9jfg_h7umtDr8vFSUWzUFA; expires=Thu, 15-Dec-2022 17:36:18 GMT; path=/; domain=.google.com; Secure; HttpOnly; SameSite=lax 118 | Set-Cookie: __Secure-ENID=5.SE=lBM1gn4kCsJJYZBWCv6D2XUTZyNVOT2ajQIxT-_-YNkNC49-MBDQdlC9W3P-2FTCYvPVEs3s6melWLKKKDkrDRXMnpP-J5ZZyddN1eEeW-VmkPWgFvo1nhEts_WubiwVEVRzLnw7RXXDf9SJkcTtAwaqSSzLgXwp-DmQta5Y0iw; expires=Wed, 19-Jul-2023 09:54:36 GMT; path=/; domain=.google.com; Secure; HttpOnly; SameSite=lax 119 | Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43" 120 | 121 | 122 | 302 Moved 123 |

302 Moved

124 | The document has moved 125 | here. 126 | 127 | 128 | 129 | WARC/1.0 130 | WARC-Type: request 131 | WARC-Target-URI: 132 | Content-Type: application/http;msgtype=request 133 | WARC-Date: 2022-06-18T17:36:19Z 134 | WARC-Record-ID: 135 | WARC-IP-Address: 142.250.187.142 136 | WARC-Warcinfo-ID: 137 | WARC-Block-Digest: sha1:FSOV5Q3EF45TCKYZKV3U5HKGZHXQ6QGU 138 | Content-Length: 494 139 | 140 | GET /ml?continue=https://www.google.com/&gl=GR&m=0&pc=shp&uxe=eomcs4e&hl=el&src=1 HTTP/1.1 141 | Host: consent.google.com 142 | User-Agent: Wget/1.21.3 143 | Accept: */* 144 | Accept-Encoding: identity 145 | Connection: Keep-Alive 146 | Cookie: AEC=AakniGN2LXYNvtskvUaFRazU7A4OSsrbGUxW9jfg_h7umtDr8vFSUWzUFA; CONSENT=PENDING+975; __Secure-ENID=5.SE=lBM1gn4kCsJJYZBWCv6D2XUTZyNVOT2ajQIxT-_-YNkNC49-MBDQdlC9W3P-2FTCYvPVEs3s6melWLKKKDkrDRXMnpP-J5ZZyddN1eEeW-VmkPWgFvo1nhEts_WubiwVEVRzLnw7RXXDf9SJkcTtAwaqSSzLgXwp-DmQta5Y0iw 147 | 148 | 149 | 150 | WARC/1.0 151 | WARC-Type: response 152 | WARC-Record-ID: 153 | WARC-Warcinfo-ID: 154 | WARC-Concurrent-To: 155 | WARC-Target-URI: 156 | WARC-Date: 2022-06-18T17:36:19Z 157 | WARC-IP-Address: 142.250.187.142 158 | WARC-Block-Digest: sha1:3TA46FB2BAPBGWFP7GXSF5O54NYP5TBZ 159 | WARC-Payload-Digest: sha1:J6CP2JM67GL3R5CFX2YB7DV3Q2K67U5E 160 | Content-Type: application/http;msgtype=response 161 | Content-Length: 13825 162 | 163 | HTTP/1.1 200 OK 164 | Content-Type: text/html; charset=utf-8 165 | Cache-Control: no-cache, no-store, max-age=0, must-revalidate 166 | Pragma: no-cache 167 | Expires: Mon, 01 Jan 1990 00:00:00 GMT 168 | Date: Sat, 18 Jun 2022 17:36:19 GMT 169 | Cross-Origin-Resource-Policy: same-site 170 | Accept-CH: Sec-CH-UA-Arch, Sec-CH-UA-Bitness, Sec-CH-UA-Full-Version, Sec-CH-UA-Full-Version-List, Sec-CH-UA-Model, Sec-CH-UA-Platform, Sec-CH-UA-Platform-Version 171 | Content-Security-Policy: require-trusted-types-for 'script';report-uri /_/ConsentHttp/cspreport 172 | Content-Security-Policy: script-src 
'nonce-Zmfu_SkIxG5aQqcfe4I6Tg' 'unsafe-inline';object-src 'none';base-uri 'self';report-uri /_/ConsentHttp/cspreport;worker-src 'self' 173 | Permissions-Policy: ch-ua-arch=*, ch-ua-bitness=*, ch-ua-full-version=*, ch-ua-full-version-list=*, ch-ua-model=*, ch-ua-platform=*, ch-ua-platform-version=* 174 | Cross-Origin-Opener-Policy: unsafe-none 175 | Server: ESF 176 | X-XSS-Protection: 0 177 | X-Frame-Options: SAMEORIGIN 178 | X-Content-Type-Options: nosniff 179 | Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43" 180 | Accept-Ranges: none 181 | Vary: Sec-Fetch-Dest, Sec-Fetch-Mode, Sec-Fetch-Site,Accept-Encoding 182 | Transfer-Encoding: chunked 183 | 184 | 211a 185 | Πριν συνεχίσετε στην Αναζήτηση Google
Google

Πριν συνεχίσετε στην Αναζήτηση Google

Η Google χρησιμοποιεί cookie και δεδομένα για:
  • Την παροχή και τη διατήρηση υπηρεσιών, όπως την παρακολούθηση διακοπών λειτουργίας και την προστασία από ανεπιθύμητο περιεχόμενο, απάτη και κατάχρηση
  • Τη μέτρηση της αφοσίωσης του κοινού και την ανάλυση των στατιστικών στοιχείων ιστοτόπων για την κατανόηση του τρόπου με τον οποίο χρησιμοποιούνται οι υπηρεσίες μας
Εάν συμφωνείτε, θα χρησιμοποιούμε επίσης cookie και δεδομένα για:
  • Τη βελτίωση της ποιότητας των υπηρεσιών μας και την ανάπτυξη νέων
  • Την προβολή διαφημίσεων και τη μέτρηση της αποτελεσματικότητάς τους
  • Την εμφάνιση εξατομικευμένου περιεχομένου, ανάλογα με τις ρυθμίσεις σας
  • Την προβολή εξατομικευμένων ή γενικών διαφημίσεων, ανάλογα με τις ρυθμίσεις σας, στο Google και στον ιστό
Για μη εξατομικευμένο περιεχόμενο και διαφημίσεις, αυτό που βλέπετε μπορεί να επηρεάζεται από παράγοντες όπως το περιεχόμενο που προβάλετε τη συγκεκριμένη στιγμή και η τοποθεσία σας (η προβολή διαφημίσεων βασίζεται μόνο στη γενική τοποθεσία σας). Το εξατομικευμένο περιεχόμενο και οι διαφημίσεις ενδέχεται να βασίζονται σε αυτούς τους παράγοντες και στη δραστηριότητά σας, όπως τις αναζητήσεις σας στο Google και τα βίντεο που παρακολουθείτε στο YouTube. Στο εξατομικευμένο περιεχόμενο και τις διαφημίσεις περιλαμβάνονται διάφορα στοιχεία, όπως πιο συναφή αποτελέσματα και προτάσεις, προσαρμογή της αρχικής σελίδας YouTube και διαφημίσεις προσαρμοσμένες στα ενδιαφέροντά σας.

Κάντε κλικ στην Προσαρμογή για να δείτε επιλογές, όπως στοιχεία ελέγχου με τα οποία μπορείτε να απορρίψετε τη χρήση των cookie για εξατομίκευση, καθώς και πληροφορίες σχετικά με στοιχεία ελέγχου σε επίπεδο προγράμματος περιήγησης για να απορρίψετε ορισμένα ή όλα τα cookie για άλλες χρήσεις. Επίσης, μπορείτε να επισκεφτείτε τη διεύθυνση g.co/privacytools ανά πάσα στιγμή.

363 | 0 364 | 365 | 366 | 367 | WARC/1.0 368 | WARC-Type: metadata 369 | WARC-Record-ID: 370 | WARC-Warcinfo-ID: 371 | WARC-Target-URI: 372 | WARC-Date: 2022-06-18T17:36:19Z 373 | WARC-Block-Digest: sha1:3BEL2GZLALJKCBILAG7NY4T7GBWHCSHE 374 | Content-Type: text/plain 375 | Content-Length: 48 376 | 377 | 378 | 379 | 380 | WARC/1.0 381 | WARC-Type: resource 382 | WARC-Record-ID: 383 | WARC-Warcinfo-ID: 384 | WARC-Concurrent-To: 385 | WARC-Target-URI: 386 | WARC-Date: 2022-06-18T17:36:19Z 387 | WARC-Block-Digest: sha1:MFBWQUHB26BXHAD2RDFLKQ53U3EOECOZ 388 | Content-Type: text/plain 389 | Content-Length: 74 390 | 391 | "--warc-file" "google.warc" "https://google.com" "--no-warc-compression" 392 | 393 | 394 | WARC/1.0 395 | WARC-Type: resource 396 | WARC-Record-ID: 397 | WARC-Warcinfo-ID: 398 | WARC-Concurrent-To: 399 | WARC-Target-URI: 400 | WARC-Date: 2022-06-18T17:36:19Z 401 | WARC-Block-Digest: sha1:C5YKOUFCANRKDLAWTUWYGSST5OWJFTSB 402 | Content-Type: text/plain 403 | Content-Length: 1381 404 | 405 | Opening WARC file ‘google.warc.warc’. 406 | 407 | --2022-06-18 20:36:17-- https://google.com/ 408 | Resolving google.com (google.com)... 142.250.187.142 409 | Connecting to google.com (google.com)|142.250.187.142|:443... connected. 410 | HTTP request sent, awaiting response... 301 Moved Permanently 411 | Location: https://www.google.com/ [following] 412 | 413 | 0K 100% 69,9M=0s 414 | 415 | --2022-06-18 20:36:18-- https://www.google.com/ 416 | Resolving www.google.com (www.google.com)... 172.217.20.68 417 | Connecting to www.google.com (www.google.com)|172.217.20.68|:443... connected. 418 | HTTP request sent, awaiting response... 
302 Found 419 | Location: https://consent.google.com/ml?continue=https://www.google.com/&gl=GR&m=0&pc=shp&uxe=eomcs4e&hl=el&src=1 [following] 420 | 421 | 0K 100% 77,2M=0s 422 | 423 | --2022-06-18 20:36:19-- https://consent.google.com/ml?continue=https://www.google.com/&gl=GR&m=0&pc=shp&uxe=eomcs4e&hl=el&src=1 424 | Resolving consent.google.com (consent.google.com)... 142.250.187.142 425 | Connecting to consent.google.com (consent.google.com)|142.250.187.142|:443... connected. 426 | HTTP request sent, awaiting response... 200 OK 427 | Length: unspecified [text/html] 428 | Saving to: ‘index.html.4’ 429 | 430 | 0K .......... .. 859K=0,01s 431 | 432 | 2022-06-18 20:36:19 (859 KB/s) - ‘index.html.4’ saved [12526] 433 | 434 | 435 | 436 | -------------------------------------------------------------------------------- /tests/google.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Florents-Tselai/WarcDB/93df323255927f4fdf5d56edfb4fa788d50f2c0e/tests/google.warc.gz -------------------------------------------------------------------------------- /tests/no-warc-info.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: request 3 | WARC-Target-URI: 4 | Content-Type: application/http;msgtype=request 5 | WARC-Date: 2022-06-18T17:36:18Z 6 | WARC-Record-ID: 7 | WARC-IP-Address: 142.250.187.142 8 | WARC-Warcinfo-ID: 9 | WARC-Block-Digest: sha1:6QD37LJCQAUCIGVLHWBTO4UCYNS4UNNO 10 | Content-Length: 125 11 | 12 | GET / HTTP/1.1 13 | Host: google.com 14 | User-Agent: Wget/1.21.3 15 | Accept: */* 16 | Accept-Encoding: identity 17 | Connection: Keep-Alive 18 | 19 | 20 | 21 | WARC/1.0 22 | WARC-Type: response 23 | WARC-Record-ID: 24 | WARC-Warcinfo-ID: 25 | WARC-Concurrent-To: 26 | WARC-Target-URI: 27 | WARC-Date: 2022-06-18T17:36:18Z 28 | WARC-IP-Address: 142.250.187.142 29 | WARC-Block-Digest: sha1:BDI5FUGI5N7RDNELBXU3VJ7XI5IZBKNS 30 | 
WARC-Payload-Digest: sha1:WUUFJUPXTXS6X3V7AFQEI6QJY6UMFTPE 31 | Content-Type: application/http;msgtype=response 32 | Content-Length: 881 33 | 34 | HTTP/1.1 301 Moved Permanently 35 | Location: https://www.google.com/ 36 | Content-Type: text/html; charset=UTF-8 37 | Date: Sat, 18 Jun 2022 17:36:18 GMT 38 | Expires: Sat, 18 Jun 2022 17:36:18 GMT 39 | Cache-Control: private, max-age=2592000 40 | Server: gws 41 | Content-Length: 220 42 | X-XSS-Protection: 0 43 | X-Frame-Options: SAMEORIGIN 44 | Set-Cookie: CONSENT=PENDING+975; expires=Mon, 17-Jun-2024 17:36:18 GMT; path=/; domain=.google.com; Secure 45 | P3P: CP="This is not a P3P policy! See g.co/p3phelp for more info." 46 | Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43" 47 | 48 | 49 | 301 Moved 50 |

301 Moved

51 | The document has moved 52 | here. 53 | 54 | 55 | 56 | WARC/1.0 57 | WARC-Type: request 58 | WARC-Target-URI: 59 | Content-Type: application/http;msgtype=request 60 | WARC-Date: 2022-06-18T17:36:18Z 61 | WARC-Record-ID: 62 | WARC-IP-Address: 172.217.20.68 63 | WARC-Warcinfo-ID: 64 | WARC-Block-Digest: sha1:TDTBDPCZVGKAAVS36S6ZATZQCS2SMYPL 65 | Content-Length: 158 66 | 67 | GET / HTTP/1.1 68 | Host: www.google.com 69 | User-Agent: Wget/1.21.3 70 | Accept: */* 71 | Accept-Encoding: identity 72 | Connection: Keep-Alive 73 | Cookie: CONSENT=PENDING+975 74 | 75 | 76 | 77 | WARC/1.0 78 | WARC-Type: response 79 | WARC-Record-ID: 80 | WARC-Warcinfo-ID: 81 | WARC-Concurrent-To: 82 | WARC-Target-URI: 83 | WARC-Date: 2022-06-18T17:36:18Z 84 | WARC-IP-Address: 172.217.20.68 85 | WARC-Block-Digest: sha1:F3KE4IPCCIODCSTLV4ASBPAV4V5DSBBF 86 | WARC-Payload-Digest: sha1:Z7VI6NJ5HY5T5X43SLR3KX7BFRSOFZEU 87 | Content-Type: application/http;msgtype=response 88 | Content-Length: 1366 89 | 90 | HTTP/1.1 302 Found 91 | Location: https://consent.google.com/ml?continue=https://www.google.com/&gl=GR&m=0&pc=shp&uxe=eomcs4e&hl=el&src=1 92 | Cache-Control: private 93 | Content-Type: text/html; charset=UTF-8 94 | P3P: CP="This is not a P3P policy! See g.co/p3phelp for more info." 
95 | Date: Sat, 18 Jun 2022 17:36:18 GMT 96 | Server: gws 97 | Content-Length: 324 98 | X-XSS-Protection: 0 99 | X-Frame-Options: SAMEORIGIN 100 | Set-Cookie: AEC=AakniGN2LXYNvtskvUaFRazU7A4OSsrbGUxW9jfg_h7umtDr8vFSUWzUFA; expires=Thu, 15-Dec-2022 17:36:18 GMT; path=/; domain=.google.com; Secure; HttpOnly; SameSite=lax 101 | Set-Cookie: __Secure-ENID=5.SE=lBM1gn4kCsJJYZBWCv6D2XUTZyNVOT2ajQIxT-_-YNkNC49-MBDQdlC9W3P-2FTCYvPVEs3s6melWLKKKDkrDRXMnpP-J5ZZyddN1eEeW-VmkPWgFvo1nhEts_WubiwVEVRzLnw7RXXDf9SJkcTtAwaqSSzLgXwp-DmQta5Y0iw; expires=Wed, 19-Jul-2023 09:54:36 GMT; path=/; domain=.google.com; Secure; HttpOnly; SameSite=lax 102 | Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43" 103 | 104 | 105 | 302 Moved 106 |

302 Moved

107 | The document has moved 108 | here. 109 | 110 | 111 | 112 | WARC/1.0 113 | WARC-Type: request 114 | WARC-Target-URI: 115 | Content-Type: application/http;msgtype=request 116 | WARC-Date: 2022-06-18T17:36:19Z 117 | WARC-Record-ID: 118 | WARC-IP-Address: 142.250.187.142 119 | WARC-Warcinfo-ID: 120 | WARC-Block-Digest: sha1:FSOV5Q3EF45TCKYZKV3U5HKGZHXQ6QGU 121 | Content-Length: 494 122 | 123 | GET /ml?continue=https://www.google.com/&gl=GR&m=0&pc=shp&uxe=eomcs4e&hl=el&src=1 HTTP/1.1 124 | Host: consent.google.com 125 | User-Agent: Wget/1.21.3 126 | Accept: */* 127 | Accept-Encoding: identity 128 | Connection: Keep-Alive 129 | Cookie: AEC=AakniGN2LXYNvtskvUaFRazU7A4OSsrbGUxW9jfg_h7umtDr8vFSUWzUFA; CONSENT=PENDING+975; __Secure-ENID=5.SE=lBM1gn4kCsJJYZBWCv6D2XUTZyNVOT2ajQIxT-_-YNkNC49-MBDQdlC9W3P-2FTCYvPVEs3s6melWLKKKDkrDRXMnpP-J5ZZyddN1eEeW-VmkPWgFvo1nhEts_WubiwVEVRzLnw7RXXDf9SJkcTtAwaqSSzLgXwp-DmQta5Y0iw 130 | 131 | 132 | 133 | WARC/1.0 134 | WARC-Type: response 135 | WARC-Record-ID: 136 | WARC-Warcinfo-ID: 137 | WARC-Concurrent-To: 138 | WARC-Target-URI: 139 | WARC-Date: 2022-06-18T17:36:19Z 140 | WARC-IP-Address: 142.250.187.142 141 | WARC-Block-Digest: sha1:3TA46FB2BAPBGWFP7GXSF5O54NYP5TBZ 142 | WARC-Payload-Digest: sha1:J6CP2JM67GL3R5CFX2YB7DV3Q2K67U5E 143 | Content-Type: application/http;msgtype=response 144 | Content-Length: 13825 145 | 146 | HTTP/1.1 200 OK 147 | Content-Type: text/html; charset=utf-8 148 | Cache-Control: no-cache, no-store, max-age=0, must-revalidate 149 | Pragma: no-cache 150 | Expires: Mon, 01 Jan 1990 00:00:00 GMT 151 | Date: Sat, 18 Jun 2022 17:36:19 GMT 152 | Cross-Origin-Resource-Policy: same-site 153 | Accept-CH: Sec-CH-UA-Arch, Sec-CH-UA-Bitness, Sec-CH-UA-Full-Version, Sec-CH-UA-Full-Version-List, Sec-CH-UA-Model, Sec-CH-UA-Platform, Sec-CH-UA-Platform-Version 154 | Content-Security-Policy: require-trusted-types-for 'script';report-uri /_/ConsentHttp/cspreport 155 | Content-Security-Policy: script-src 
'nonce-Zmfu_SkIxG5aQqcfe4I6Tg' 'unsafe-inline';object-src 'none';base-uri 'self';report-uri /_/ConsentHttp/cspreport;worker-src 'self' 156 | Permissions-Policy: ch-ua-arch=*, ch-ua-bitness=*, ch-ua-full-version=*, ch-ua-full-version-list=*, ch-ua-model=*, ch-ua-platform=*, ch-ua-platform-version=* 157 | Cross-Origin-Opener-Policy: unsafe-none 158 | Server: ESF 159 | X-XSS-Protection: 0 160 | X-Frame-Options: SAMEORIGIN 161 | X-Content-Type-Options: nosniff 162 | Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43" 163 | Accept-Ranges: none 164 | Vary: Sec-Fetch-Dest, Sec-Fetch-Mode, Sec-Fetch-Site,Accept-Encoding 165 | Transfer-Encoding: chunked 166 | 167 | 211a 168 | Πριν συνεχίσετε στην Αναζήτηση Google
Google

Πριν συνεχίσετε στην Αναζήτηση Google

Η Google χρησιμοποιεί cookie και δεδομένα για:
  • Την παροχή και τη διατήρηση υπηρεσιών, όπως την παρακολούθηση διακοπών λειτουργίας και την προστασία από ανεπιθύμητο περιεχόμενο, απάτη και κατάχρηση
  • Τη μέτρηση της αφοσίωσης του κοινού και την ανάλυση των στατιστικών στοιχείων ιστοτόπων για την κατανόηση του τρόπου με τον οποίο χρησιμοποιούνται οι υπηρεσίες μας
Εάν συμφωνείτε, θα χρησιμοποιούμε επίσης cookie και δεδομένα για:
  • Τη βελτίωση της ποιότητας των υπηρεσιών μας και την ανάπτυξη νέων
  • Την προβολή διαφημίσεων και τη μέτρηση της αποτελεσματικότητάς τους
  • Την εμφάνιση εξατομικευμένου περιεχομένου, ανάλογα με τις ρυθμίσεις σας
  • Την προβολή εξατομικευμένων ή γενικών διαφημίσεων, ανάλογα με τις ρυθμίσεις σας, στο Google και στον ιστό
Για μη εξατομικευμένο περιεχόμενο και διαφημίσεις, αυτό που βλέπετε μπορεί να επηρεάζεται από παράγοντες όπως το περιεχόμενο που προβάλετε τη συγκεκριμένη στιγμή και η τοποθεσία σας (η προβολή διαφημίσεων βασίζεται μόνο στη γενική τοποθεσία σας). Το εξατομικευμένο περιεχόμενο και οι διαφημίσεις ενδέχεται να βασίζονται σε αυτούς τους παράγοντες και στη δραστηριότητά σας, όπως τις αναζητήσεις σας στο Google και τα βίντεο που παρακολουθείτε στο YouTube. Στο εξατομικευμένο περιεχόμενο και τις διαφημίσεις περιλαμβάνονται διάφορα στοιχεία, όπως πιο συναφή αποτελέσματα και προτάσεις, προσαρμογή της αρχικής σελίδας YouTube και διαφημίσεις προσαρμοσμένες στα ενδιαφέροντά σας.

Κάντε κλικ στην Προσαρμογή για να δείτε επιλογές, όπως στοιχεία ελέγχου με τα οποία μπορείτε να απορρίψετε τη χρήση των cookie για εξατομίκευση, καθώς και πληροφορίες σχετικά με στοιχεία ελέγχου σε επίπεδο προγράμματος περιήγησης για να απορρίψετε ορισμένα ή όλα τα cookie για άλλες χρήσεις. Επίσης, μπορείτε να επισκεφτείτε τη διεύθυνση g.co/privacytools ανά πάσα στιγμή.

346 | 0 347 | 348 | 349 | 350 | WARC/1.0 351 | WARC-Type: metadata 352 | WARC-Record-ID: 353 | WARC-Warcinfo-ID: 354 | WARC-Target-URI: 355 | WARC-Date: 2022-06-18T17:36:19Z 356 | WARC-Block-Digest: sha1:3BEL2GZLALJKCBILAG7NY4T7GBWHCSHE 357 | Content-Type: text/plain 358 | Content-Length: 48 359 | 360 | 361 | 362 | 363 | WARC/1.0 364 | WARC-Type: resource 365 | WARC-Record-ID: 366 | WARC-Warcinfo-ID: 367 | WARC-Concurrent-To: 368 | WARC-Target-URI: 369 | WARC-Date: 2022-06-18T17:36:19Z 370 | WARC-Block-Digest: sha1:MFBWQUHB26BXHAD2RDFLKQ53U3EOECOZ 371 | Content-Type: text/plain 372 | Content-Length: 74 373 | 374 | "--warc-file" "google.warc" "https://google.com" "--no-warc-compression" 375 | 376 | 377 | WARC/1.0 378 | WARC-Type: resource 379 | WARC-Record-ID: 380 | WARC-Warcinfo-ID: 381 | WARC-Concurrent-To: 382 | WARC-Target-URI: 383 | WARC-Date: 2022-06-18T17:36:19Z 384 | WARC-Block-Digest: sha1:C5YKOUFCANRKDLAWTUWYGSST5OWJFTSB 385 | Content-Type: text/plain 386 | Content-Length: 1381 387 | 388 | Opening WARC file ‘google.warc.warc’. 389 | 390 | --2022-06-18 20:36:17-- https://google.com/ 391 | Resolving google.com (google.com)... 142.250.187.142 392 | Connecting to google.com (google.com)|142.250.187.142|:443... connected. 393 | HTTP request sent, awaiting response... 301 Moved Permanently 394 | Location: https://www.google.com/ [following] 395 | 396 | 0K 100% 69,9M=0s 397 | 398 | --2022-06-18 20:36:18-- https://www.google.com/ 399 | Resolving www.google.com (www.google.com)... 172.217.20.68 400 | Connecting to www.google.com (www.google.com)|172.217.20.68|:443... connected. 401 | HTTP request sent, awaiting response... 
302 Found 402 | Location: https://consent.google.com/ml?continue=https://www.google.com/&gl=GR&m=0&pc=shp&uxe=eomcs4e&hl=el&src=1 [following] 403 | 404 | 0K 100% 77,2M=0s 405 | 406 | --2022-06-18 20:36:19-- https://consent.google.com/ml?continue=https://www.google.com/&gl=GR&m=0&pc=shp&uxe=eomcs4e&hl=el&src=1 407 | Resolving consent.google.com (consent.google.com)... 142.250.187.142 408 | Connecting to consent.google.com (consent.google.com)|142.250.187.142|:443... connected. 409 | HTTP request sent, awaiting response... 200 OK 410 | Length: unspecified [text/html] 411 | Saving to: ‘index.html.4’ 412 | 413 | 0K .......... .. 859K=0,01s 414 | 415 | 2022-06-18 20:36:19 (859 KB/s) - ‘index.html.4’ saved [12526] 416 | 417 | 418 | 419 | -------------------------------------------------------------------------------- /tests/scoop.wacz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Florents-Tselai/WarcDB/93df323255927f4fdf5d56edfb4fa788d50f2c0e/tests/scoop.wacz -------------------------------------------------------------------------------- /tests/test_warcdb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import re 4 | 5 | import pytest 6 | import sqlite_utils 7 | from click.testing import CliRunner 8 | from warcdb import warcdb_cli 9 | 10 | db_file = "test_warc.db" 11 | tests_dir = pathlib.Path(__file__).parent 12 | 13 | # all these WARC files were created with wget except for apod.warc.gz which was 14 | # created with browsertrix-crawler 15 | 16 | 17 | @pytest.mark.parametrize( 18 | "warc_path", 19 | [ 20 | str(tests_dir / "google.warc"), 21 | str(tests_dir / "google.warc.gz"), 22 | str(tests_dir / "no-warc-info.warc"), 23 | str(tests_dir / "scoop.wacz"), 24 | "https://tselai.com/data/google.warc", 25 | "https://tselai.com/data/google.warc.gz", 26 | ], 27 | ) 28 | def test_import(warc_path): 29 | runner = CliRunner() 30 | args = 
["import", db_file, warc_path] 31 | result = runner.invoke(warcdb_cli, args) 32 | assert result.exit_code == 0 33 | db = sqlite_utils.Database(db_file) 34 | assert set(db.table_names()) == { 35 | "metadata", 36 | "request", 37 | "resource", 38 | "response", 39 | "warcinfo", 40 | "_sqlite_migrations", 41 | } 42 | 43 | if warc_path == str(tests_dir / "google.warc"): 44 | assert db.table("warcinfo").get( 45 | "" 46 | ) 47 | assert db.table("request").get( 48 | "" 49 | ) 50 | 51 | os.remove(db_file) 52 | 53 | 54 | def test_column_names(): 55 | runner = CliRunner() 56 | runner.invoke( 57 | warcdb_cli, ["import", db_file, str(pathlib.Path("tests/google.warc"))] 58 | ) 59 | 60 | # make sure that the columns are named correctly (lowercase with underscores) 61 | db = sqlite_utils.Database(db_file) 62 | for table in db.tables: 63 | for col in table.columns: 64 | assert re.match(r"^[a-z_]+", col.name), f"column {col.name} named correctly" 65 | 66 | os.remove(db_file) 67 | 68 | 69 | def test_http_header(): 70 | runner = CliRunner() 71 | runner.invoke( 72 | warcdb_cli, ["import", db_file, str(pathlib.Path("tests/google.warc"))] 73 | ) 74 | 75 | db = sqlite_utils.Database(db_file) 76 | 77 | resp_headers = list(db["v_response_http_header"].rows) 78 | assert len(resp_headers) == 43 79 | assert { 80 | "name": "content-type", 81 | "value": "text/html; charset=UTF-8", 82 | "warc_record_id": "", 83 | } in resp_headers 84 | 85 | req_headers = list(db["v_request_http_header"].rows) 86 | assert len(req_headers) == 17 87 | assert { 88 | "name": "user-agent", 89 | "value": "Wget/1.21.3", 90 | "warc_record_id": "", 91 | } in req_headers 92 | 93 | 94 | def test_http_header(): 95 | runner = CliRunner() 96 | runner.invoke( 97 | warcdb_cli, ["import", db_file, str(pathlib.Path("tests/google.warc"))] 98 | ) 99 | db = sqlite_utils.Database(db_file) 100 | responses = db["response"].rows 101 | assert next(responses)["http_status"] == 301 102 | assert next(responses)["http_status"] == 302 103 | 
assert next(responses)["http_status"] == 200 104 | -------------------------------------------------------------------------------- /warcdb/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import zipfile 3 | from collections.abc import MutableMapping 4 | from functools import cache 5 | from itertools import chain 6 | from json import dumps 7 | 8 | import click 9 | import requests as req 10 | import sqlite_utils 11 | from more_itertools import always_iterable 12 | from tqdm import tqdm 13 | from warcio import ArchiveIterator, StatusAndHeaders 14 | from warcio.recordloader import ArcWarcRecord 15 | 16 | from warcdb.migrations import migration 17 | 18 | 19 | def dict_union(*args): 20 | """Utility function to union multiple dicts""" 21 | # https://stackoverflow.com/a/15936211/1333954 22 | return dict(chain.from_iterable(d.iteritems() for d in args)) 23 | 24 | 25 | """ Monkeypatch warcio.StatusAndHeaders.to_json() """ 26 | 27 | 28 | def headers_to_json(self): 29 | return dumps([{"header": h, "value": v} for h, v in self.headers]) 30 | 31 | 32 | setattr(StatusAndHeaders, "to_json", headers_to_json) 33 | 34 | """ Monkeypatch warcio.ArcWarcRecord.payload """ 35 | 36 | 37 | @cache # It's important that we cache this, as the content_stream() can only be consumed once. 
38 | def record_payload(self: ArcWarcRecord): 39 | return self.content_stream().read() 40 | 41 | 42 | setattr(ArcWarcRecord, "payload", record_payload) 43 | 44 | """ Monkeypatch warcio.ArcWarcRecord.as_dict() """ 45 | 46 | 47 | @cache 48 | def record_as_dict(self: ArcWarcRecord): 49 | """Method to easily represent a record as a dict, to be fed into db_utils.Database.insert()""" 50 | return {k.lower().replace("-", "_"): v for k, v in self.rec_headers.headers} 51 | 52 | 53 | setattr(ArcWarcRecord, "as_dict", record_as_dict) 54 | 55 | """ Monkeypatch warcio.ArcWarcRecord.to_json() """ 56 | 57 | 58 | # def record_to_json(self): 59 | # return dumps(self.as_dict()) 60 | # 61 | # 62 | # setattr(ArcWarcRecord, 'to_json', record_to_json) 63 | 64 | 65 | class WarcDB(MutableMapping): 66 | """ 67 | Wrapper around sqlite_utils.Database 68 | 69 | WarcDB acts as a Mapping (id: str -> r: ArcWarcRecord). 70 | 71 | 72 | The schema defined is table storing warcio.ArcWarcRecord objects 73 | 74 | (self.format, self.rec_type, self.rec_headers, self.raw_stream, 75 | self.http_headers, self.content_type, self.length) = args 76 | self.payload_length = kwargs.get('payload_length', -1) 77 | """ 78 | 79 | def __init__(self, *args, **kwargs): 80 | # First pop warcdb - specific params 81 | self._batch_size = kwargs.pop("batch_size", 1000) 82 | self._records_table = kwargs.get("records_table", "records") 83 | 84 | # Pass the rest to sqlite_utils 85 | self._db = sqlite_utils.Database(*args, **kwargs) 86 | 87 | @property 88 | def db(self) -> sqlite_utils.Database: 89 | return self._db 90 | 91 | def table(self, table_name, **kwargs): 92 | """Convenience method to fetch table by name""" 93 | return self.db.table(table_name, **kwargs) 94 | 95 | @property 96 | def records(self): 97 | """Returns the db table the records are stored""" 98 | return self.table(self._records_table) 99 | 100 | @property 101 | def http_headers(self): 102 | return self.table("http_headers") 103 | 104 | @property 105 | def 
payloads(self): 106 | return self.table("payloads") 107 | 108 | """MutableMapping abstract methods""" 109 | 110 | def __setitem__(self, key, value: ArcWarcRecord): 111 | """This is the only client-facing way to mutate the file. 112 | Any normalization should happen here. 113 | """ 114 | # Any normalizations happens here 115 | raise NotImplemented 116 | 117 | def __getitem__(self, item) -> ArcWarcRecord: 118 | # Any denormalization happens here 119 | raise NotImplemented 120 | 121 | def __delitem__(self, key): 122 | raise NotImplemented 123 | 124 | def __iter__(self): 125 | raise NotImplemented 126 | 127 | def __len__(self): 128 | return self.records.count 129 | 130 | """ API Methods """ 131 | 132 | def __iadd__(self, r: ArcWarcRecord): 133 | """ 134 | TODO 135 | ==== 136 | 137 | * For all rec_types: also store WARC/1.0 field (warc and version?) 138 | * Todo pass conversions: {'Content-Length': int, warc-date: datet 139 | * All 'response', 'resource', 'request', 'revisit', 'conversion' and 'continuation' records may have a payload. 140 | All 'warcinfo' and 'metadata' records shall not have a payload. 
141 | """ 142 | col_type_conversions = { 143 | "content_length": int, 144 | "payload": str, 145 | "warc_date": datetime.datetime, 146 | } 147 | record_dict = r.as_dict() 148 | 149 | # Certain rec_types have payload 150 | has_payload = r.rec_type in [ 151 | "warcinfo", 152 | "request", 153 | "response", 154 | "metadata", 155 | "resource", 156 | ] 157 | if has_payload: 158 | record_dict["payload"] = r.payload() 159 | 160 | # Certain rec_types have http_headers 161 | has_http_headers = r.http_headers is not None 162 | if has_http_headers: 163 | record_dict["http_headers"] = r.http_headers.to_json() 164 | 165 | """Depending on the record type we insert to appropriate record""" 166 | if r.rec_type == "warcinfo": 167 | self.db.table("warcinfo").insert( 168 | record_dict, 169 | pk="warc_record_id", 170 | alter=True, 171 | ignore=True, 172 | columns=col_type_conversions, 173 | ) 174 | elif r.rec_type == "request": 175 | self.db.table("request").insert( 176 | record_dict, 177 | pk="warc_record_id", 178 | foreign_keys=[("warc_warcinfo_id", "warcinfo", "warc-record-id")], 179 | alter=True, 180 | ignore=True, 181 | columns=col_type_conversions, 182 | ) 183 | 184 | elif r.rec_type == "response": 185 | if r.http_headers: 186 | record_dict["http_status"] = r.http_headers.get_statuscode() 187 | self.db.table("response").insert( 188 | record_dict, 189 | pk="warc_record_id", 190 | foreign_keys=[ 191 | ("warc_warcinfo_id", "warcinfo", "warc_record_id"), 192 | ("warc_concurrent_to", "request", "warc_record_id"), 193 | ], 194 | alter=True, 195 | ignore=True, 196 | columns=col_type_conversions, 197 | ) 198 | 199 | elif r.rec_type == "metadata": 200 | self.db.table("metadata").insert( 201 | record_dict, 202 | pk="warc_record_id", 203 | foreign_keys=[ 204 | ("warc-warcinfo-id", "warcinfo", "warc_record_id"), 205 | ("warc_concurrent_to", "response", "warc_record_id"), 206 | ], 207 | alter=True, 208 | ignore=True, 209 | columns=col_type_conversions, 210 | ) 211 | 212 | elif r.rec_type == 
"resource": 213 | self.db.table("resource").insert( 214 | record_dict, 215 | pk="warc_record_id", 216 | foreign_keys=[ 217 | ("warc-warcinfo-id", "warcinfo", "warc_record_id"), 218 | ("warc_concurrent_to", "metadata", "warc_record_id"), 219 | ], 220 | alter=True, 221 | ignore=True, 222 | columns=col_type_conversions, 223 | ) 224 | 225 | else: 226 | raise ValueError( 227 | f"Record type <{r.rec_type}> is not supported" 228 | f"Only [warcinfo, request, response, metadata, resource] are." 229 | ) 230 | return self 231 | 232 | 233 | from sqlite_utils import cli as sqlite_utils_cli 234 | 235 | warcdb_cli = sqlite_utils_cli.cli 236 | warcdb_cli.help = "Commands for interacting with .warcdb files\n\nBased on SQLite-Utils" 237 | 238 | 239 | @warcdb_cli.command("import") 240 | @click.argument( 241 | "db_path", 242 | type=click.Path(file_okay=True, dir_okay=False, allow_dash=False), 243 | ) 244 | @click.argument("warc_path", type=click.STRING, nargs=-1) 245 | @click.option( 246 | "--batch-size", 247 | type=click.INT, 248 | default=1000, 249 | help="Batch size for chunked INSERTs [Note: ignored for now]", 250 | ) 251 | def import_(db_path, warc_path, batch_size): 252 | """ 253 | Import a WARC file into the database 254 | """ 255 | db = WarcDB(db_path, batch_size=batch_size) 256 | 257 | # ensure the schema is there and up to date 258 | migration.apply(db.db) 259 | 260 | # if batch_size: 261 | # warnings.warn("--batch-size has been temporarily disabled") 262 | 263 | def to_import(): 264 | for f in always_iterable(warc_path): 265 | if f.startswith("http"): 266 | yield from tqdm( 267 | ArchiveIterator(req.get(f, stream=True).raw, arc2warc=True), desc=f 268 | ) 269 | elif f.endswith(".wacz"): 270 | # TODO: can we support loading WACZ files by URL? 
271 | wacz = zipfile.ZipFile(f) 272 | warcs = filter( 273 | lambda f: f.filename.endswith("warc.gz"), wacz.infolist() 274 | ) 275 | for warc in warcs: 276 | yield from tqdm( 277 | ArchiveIterator(wacz.open(warc.filename, "r"), arc2warc=True), 278 | desc=warc.filename, 279 | ) 280 | else: 281 | yield from tqdm(ArchiveIterator(open(f, "rb"), arc2warc=True), desc=f) 282 | 283 | for r in to_import(): 284 | db += r 285 | -------------------------------------------------------------------------------- /warcdb/migrations.py: -------------------------------------------------------------------------------- 1 | from sqlite_migrate import Migrations 2 | 3 | migration = Migrations("warcdb") 4 | 5 | 6 | @migration() 7 | def m001_initial(db): 8 | db["warcinfo"].create( 9 | { 10 | "warc_type": str, 11 | "content_type": str, 12 | "warc_date": str, 13 | "warc_record_id": str, 14 | "warc_filename": str, 15 | "warc_block_digest": str, 16 | "content_length": int, 17 | "payload": str, 18 | }, 19 | pk="warc_record_id", 20 | ) 21 | 22 | db["request"].create( 23 | { 24 | "warc_type": str, 25 | "warc_target_uri": str, 26 | "content_type": str, 27 | "warc_date": str, 28 | "warc_record_id": str, 29 | "warc_ip_address": str, 30 | "warc_warcinfo_id": str, 31 | "warc_block_digest": str, 32 | "content_length": int, 33 | "payload": str, 34 | "http_headers": str, 35 | }, 36 | pk="warc_record_id", 37 | foreign_keys=[("warc_warcinfo_id", "warcinfo", "warc_record_id")], 38 | ) 39 | 40 | db["response"].create( 41 | { 42 | "warc_type": str, 43 | "warc_record_id": str, 44 | "warc_warcinfo_id": str, 45 | "warc_concurrent_to": str, 46 | "warc_target_uri": str, 47 | "warc_date": str, 48 | "warc_ip_address": str, 49 | "warc_block_digest": str, 50 | "warc_payload_digest": str, 51 | "content_type": str, 52 | "content_length": int, 53 | "payload": str, 54 | "http_headers": str, 55 | }, 56 | pk="warc_record_id", 57 | foreign_keys=[ 58 | ("warc_warcinfo_id", "warcinfo", "warc_record_id"), 59 | 
("warc_concurrent_to", "request", "warc_record_id"), 60 | ], 61 | ) 62 | 63 | db["metadata"].create( 64 | { 65 | "warc_type": str, 66 | "warc_record_id": str, 67 | "warc_warcinfo_id": str, 68 | "warc_target_uri": str, 69 | "warc_date": str, 70 | "warc_block_digest": str, 71 | "content_type": str, 72 | "content_length": int, 73 | "payload": str, 74 | }, 75 | pk="warc_record_id", 76 | foreign_keys=[("warc_warcinfo_id", "warcinfo", "warc_record_id")], 77 | ) 78 | 79 | db["resource"].create( 80 | { 81 | "warc_type": str, 82 | "warc_record_id": str, 83 | "warc_warcinfo_id": str, 84 | "warc_concurrent_to": str, 85 | "warc_target_uri": str, 86 | "warc_date": str, 87 | "warc_block_digest": str, 88 | "content_type": str, 89 | "content_length": int, 90 | "payload": str, 91 | }, 92 | pk="warc_record_id", 93 | foreign_keys=[ 94 | ("warc_warcinfo_id", "warcinfo", "warc_record_id"), 95 | ("warc_concurrent_to", "metadata", "warc_record_id"), 96 | ], 97 | ) 98 | 99 | 100 | @migration() 101 | def m002_headers(db): 102 | db.create_view( 103 | "v_request_http_header", 104 | """ 105 | SELECT 106 | request.warc_record_id AS warc_record_id, 107 | LOWER(JSON_EXTRACT(header.VALUE, '$.header')) AS name, 108 | JSON_EXTRACT(header.VALUE, '$.value') AS value 109 | FROM request, JSON_EACH(request.http_headers) AS header 110 | """, 111 | ) 112 | db.create_view( 113 | "v_response_http_header", 114 | """ 115 | SELECT 116 | response.warc_record_id AS warc_record_id, 117 | LOWER(JSON_EXTRACT(header.VALUE, '$.header')) AS name, 118 | JSON_EXTRACT(header.VALUE, '$.value') AS value 119 | FROM response, JSON_EACH(response.http_headers) AS header 120 | """, 121 | ) 122 | 123 | 124 | @migration() 125 | def m003_status(db): 126 | db["response"].add_column("http_status", int) 127 | --------------------------------------------------------------------------------