├── .gitignore ├── docs ├── requirements.txt ├── parser.rst ├── errors.rst ├── utils.rst ├── conf.py ├── index.rst ├── changelog.rst └── directives.rst ├── .readthedocs.yaml ├── .pre-commit-config.yaml ├── .github ├── dependabot.yml └── workflows │ └── test.yml ├── test ├── data │ └── vhost_combined.log ├── test_bad_directives.py ├── test_decoding.py ├── test_parse_apache_timestamp.py ├── test_general.py ├── test_assemble_datetime.py ├── test_parse.py └── test_parse_custom_time.py ├── LICENSE ├── tox.ini ├── pyproject.toml ├── src └── apachelogs │ ├── errors.py │ ├── __init__.py │ ├── util.py │ ├── strftime.py │ ├── parser.py │ ├── directives.py │ └── timeutil.py ├── CHANGELOG.md └── README.rst /.gitignore: -------------------------------------------------------------------------------- 1 | .coverage* 2 | .tox/ 3 | __pycache__/ 4 | dist/ 5 | docs/_build/ 6 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx~=8.0 2 | sphinx-copybutton~=0.5.0 3 | sphinx_rtd_theme~=3.0 4 | -------------------------------------------------------------------------------- /docs/parser.rst: -------------------------------------------------------------------------------- 1 | .. currentmodule:: apachelogs 2 | 3 | Parsing 4 | ======= 5 | 6 | .. autoclass:: LogParser 7 | .. autoclass:: LogEntry() 8 | .. autofunction:: parse 9 | .. autofunction:: parse_lines 10 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | python: 4 | install: 5 | - requirements: docs/requirements.txt 6 | - method: pip 7 | path: . 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3" 12 | sphinx: 13 | configuration: docs/conf.py 14 | fail_on_warning: true 15 | -------------------------------------------------------------------------------- /docs/errors.rst: -------------------------------------------------------------------------------- 1 | .. currentmodule:: apachelogs 2 | 3 | Exceptions 4 | ========== 5 | .. autoexception:: Error() 6 | :show-inheritance: 7 | 8 | .. autoexception:: InvalidDirectiveError() 9 | :show-inheritance: 10 | 11 | .. autoexception:: InvalidEntryError() 12 | :show-inheritance: 13 | 14 | .. autoexception:: UnknownDirectiveError() 15 | :show-inheritance: 16 | -------------------------------------------------------------------------------- /docs/utils.rst: -------------------------------------------------------------------------------- 1 | .. currentmodule:: apachelogs 2 | 3 | Utilities 4 | ========= 5 | .. autofunction:: parse_apache_timestamp 6 | 7 | Log Format Constants 8 | -------------------- 9 | The following standard log formats are available as string constants in this 10 | package so that you don't have to keep typing out the full log format strings: 11 | 12 | .. autodata:: COMBINED 13 | .. autodata:: COMBINED_DEBIAN 14 | .. autodata:: COMMON 15 | .. autodata:: COMMON_DEBIAN 16 | .. autodata:: VHOST_COMBINED 17 | .. 
autodata:: VHOST_COMMON 18 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v6.0.0 4 | hooks: 5 | - id: check-added-large-files 6 | - id: check-json 7 | - id: check-toml 8 | - id: check-yaml 9 | - id: end-of-file-fixer 10 | - id: trailing-whitespace 11 | 12 | - repo: https://github.com/psf/black 13 | rev: 25.9.0 14 | hooks: 15 | - id: black 16 | 17 | - repo: https://github.com/PyCQA/isort 18 | rev: 7.0.0 19 | hooks: 20 | - id: isort 21 | 22 | - repo: https://github.com/PyCQA/flake8 23 | rev: 7.3.0 24 | hooks: 25 | - id: flake8 26 | additional_dependencies: 27 | - flake8-bugbear 28 | - flake8-builtins 29 | - flake8-unused-arguments 30 | exclude: ^test/data 31 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: / 5 | exclude-paths: 6 | - docs/** 7 | schedule: 8 | interval: weekly 9 | commit-message: 10 | prefix: "[python]" 11 | labels: 12 | - dependencies 13 | - d:python 14 | 15 | - package-ecosystem: pip 16 | directory: /docs 17 | versioning-strategy: increase-if-necessary 18 | schedule: 19 | interval: weekly 20 | commit-message: 21 | prefix: "[python+docs]" 22 | labels: 23 | - dependencies 24 | - d:python 25 | 26 | - package-ecosystem: github-actions 27 | directory: / 28 | schedule: 29 | interval: weekly 30 | commit-message: 31 | prefix: "[gh-actions]" 32 | include: scope 33 | labels: 34 | - dependencies 35 | - d:github-actions 36 | -------------------------------------------------------------------------------- /test/data/vhost_combined.log: -------------------------------------------------------------------------------- 1 | www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:20 +0000] "GET / HTTP/1.1" 301 577 "-" "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0" 2 | www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:20 +0000] "GET /robots.txt HTTP/1.1" 301 596 "-" "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0" 3 | www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:21 +0000] "POST /App6079ec68.php HTTP/1.1" 301 606 "-" "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0" 4 | www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:21 +0000] "GET /webdav/ HTTP/1.1" 301 554 "-" "Mozilla/5.0" 5 | Bad line 6 | www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:21 +0000] "GET /help.php HTTP/1.1" 301 592 "-" "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0" 7 | www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:22 +0000] "GET /java.php HTTP/1.1" 301 592 "-" "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0" 8 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | from apachelogs import __version__ 2 | 3 | project = "apachelogs" 4 | author = "John Thorvald Wodder II" 5 | copyright = "2017-2024 John Thorvald Wodder II" # noqa: A001 6 | 7 | extensions = [ 8 | "sphinx.ext.autodoc", 9 | 
"sphinx.ext.intersphinx", 10 | "sphinx.ext.viewcode", 11 | "sphinx_copybutton", 12 | ] 13 | 14 | autodoc_default_options = { 15 | "members": True, 16 | "undoc-members": True, 17 | } 18 | 19 | intersphinx_mapping = { 20 | "python": ("https://docs.python.org/3", None), 21 | } 22 | 23 | exclude_patterns = ["_build"] 24 | source_suffix = ".rst" 25 | source_encoding = "utf-8" 26 | master_doc = "index" 27 | version = __version__ 28 | release = __version__ 29 | today_fmt = "%Y %b %d" 30 | default_role = "py:obj" 31 | pygments_style = "sphinx" 32 | 33 | html_theme = "sphinx_rtd_theme" 34 | html_theme_options = { 35 | "collapse_navigation": False, 36 | "prev_next_buttons_location": "both", 37 | } 38 | html_last_updated_fmt = "%Y %b %d" 39 | html_show_sourcelink = True 40 | html_show_sphinx = True 41 | html_show_copyright = True 42 | 43 | copybutton_prompt_text = r">>> |\.\.\. |\$ " 44 | copybutton_prompt_is_regexp = True 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017-2024 John Thorvald Wodder II and contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - master 8 | schedule: 9 | - cron: '0 6 * * *' 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref_name }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | test: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: 22 | - '3.10' 23 | - '3.11' 24 | - '3.12' 25 | - '3.13' 26 | - '3.14' 27 | - 'pypy-3.10' 28 | - 'pypy-3.11' 29 | toxenv: [py] 30 | include: 31 | - python-version: '3.10' 32 | toxenv: lint 33 | steps: 34 | - name: Check out repository 35 | uses: actions/checkout@v6 36 | 37 | - name: Set up Python 38 | uses: actions/setup-python@v6 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | 42 | - name: Install dependencies 43 | run: | 44 | python -m pip install --upgrade pip wheel 45 | python -m pip install --upgrade --upgrade-strategy=eager coverage tox 46 | 47 | - name: Run tests 48 | run: tox -e ${{ matrix.toxenv }} 49 | 50 | - name: Generate XML coverage report 51 | if: matrix.toxenv == 'py' 52 | run: coverage xml 53 | 54 | - name: Upload coverage to Codecov 55 | if: matrix.toxenv == 'py' 56 | uses: codecov/codecov-action@v5 57 | with: 58 | fail_ci_if_error: false 59 | token: ${{ secrets.CODECOV_TOKEN }} 60 | name: ${{ matrix.python-version }} 61 | 62 | # vim:set et sts=2: 63 | -------------------------------------------------------------------------------- /test/test_bad_directives.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from apachelogs import InvalidDirectiveError, LogParser, UnknownDirectiveError 3 | 4 | 5 | @pytest.mark.parametrize( 6 | "fmt", 7 | [ 8 | "%", 9 | "% ", 10 | "%^x", 11 | "%^", 12 | "%{param", 13 | ], 14 | ) 15 | def test_malformed_directive(fmt): 16 | with pytest.raises(InvalidDirectiveError) as excinfo: 17 | LogParser(fmt) 18 | assert str(excinfo.value) == f"Invalid log format directive at index 0 of {fmt!r}" 19 | assert excinfo.value.pos == 0 20 | assert excinfo.value.format == fmt 21 | 22 | 23 | @pytest.mark.parametrize( 24 | "fmt", 25 | [ 26 | "%x", 27 | "%^xx", 28 | "%{param}z", 29 | "%{x}a", 30 | "%{x}b", 31 | "%{%{x}a", 32 | "%C", 33 | ], 34 | ) 35 | def test_unknown_directive(fmt): 36 | with pytest.raises(UnknownDirectiveError) as excinfo: 37 | LogParser(fmt) 38 | assert str(excinfo.value) == f"Unknown log format directive: {fmt!r}" 39 | assert excinfo.value.directive == fmt 40 | 41 | 42 | @pytest.mark.parametrize( 43 | "fmt", 44 | [ 45 | "%", 46 | "% ", 47 | "%^x", 48 | "%^", 49 | "%{param", 50 | # '%{x}a', # actually parsed as an unknown directive 51 | "%= 17.1", 42 | "pydicti ~= 1.1", 43 | ] 44 | 45 | [project.urls] 46 | "Source Code" = "https://github.com/jwodder/apachelogs" 47 | "Bug Tracker" = "https://github.com/jwodder/apachelogs/issues" 48 | "Documentation" = "https://apachelogs.readthedocs.io" 49 | 50 | [tool.hatch.version] 51 | path = "src/apachelogs/__init__.py" 52 | 53 | [tool.hatch.build.targets.sdist] 54 | include = [ 55 | "/docs", 56 | "/src", 57 | "/test", 58 | "CHANGELOG.*", 59 | "CONTRIBUTORS.*", 60 | "tox.ini", 61 | ] 62 | 63 | [tool.hatch.envs.default] 64 | python = "3" 65 | -------------------------------------------------------------------------------- /src/apachelogs/errors.py: 
-------------------------------------------------------------------------------- 1 | class Error(Exception): 2 | """The base class for all custom exceptions raised by `apachelogs`""" 3 | 4 | pass 5 | 6 | 7 | class InvalidEntryError(Error, ValueError): 8 | """ 9 | Raised when attempting to parse a log entry that does not match the given 10 | log format 11 | """ 12 | 13 | def __init__(self, entry, format): # noqa: A002 14 | #: The invalid log entry 15 | self.entry = entry 16 | #: The log format string the entry failed to match against 17 | self.format = format 18 | super().__init__(entry, format) 19 | 20 | def __str__(self): 21 | return ( 22 | f"Could not match log entry {self.entry!r}" 23 | f" against log format {self.format!r}" 24 | ) 25 | 26 | 27 | class InvalidDirectiveError(Error, ValueError): 28 | """ 29 | Raised by the `LogParser` constructor when given a log format containing an 30 | invalid or malformed directive 31 | """ 32 | 33 | def __init__(self, format, pos): # noqa: A002 34 | #: The log format string containing the invalid directive 35 | self.format = format 36 | #: The position in the log format string at which the invalid directive 37 | #: occurs 38 | self.pos = pos 39 | super().__init__(format, pos) 40 | 41 | def __str__(self): 42 | return f"Invalid log format directive at index {self.pos} of {self.format!r}" 43 | 44 | 45 | class UnknownDirectiveError(Error, ValueError): 46 | """ 47 | Raised by the `LogParser` constructor when given a log format containing an 48 | unknown or unsupported directive 49 | """ 50 | 51 | def __init__(self, directive): 52 | #: The unknown or unsupported directive 53 | self.directive = directive 54 | super().__init__(directive) 55 | 56 | def __str__(self): 57 | return f"Unknown log format directive: {self.directive!r}" 58 | -------------------------------------------------------------------------------- /test/test_decoding.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | import pytest 3 | from apachelogs import COMBINED, LogParser 4 | 5 | ENTRY = '66.240.205.34 - - [18/Nov/2017:12:30:55 +0000] "Gh0st\\xad" 400 0 "-" "-"' 6 | 7 | NON_STR_FIELDS = { 8 | "remote_logname": None, 9 | "remote_user": None, 10 | "request_time": datetime(2017, 11, 18, 12, 30, 55, tzinfo=timezone.utc), 11 | "final_status": 400, 12 | "bytes_sent": 0, 13 | "headers_in": { 14 | "Referer": None, 15 | "User-Agent": None, 16 | }, 17 | } 18 | 19 | 20 | def test_bytes_parse(): 21 | log_entry = LogParser(COMBINED, encoding="bytes").parse(ENTRY) 22 | for k, v in NON_STR_FIELDS.items(): 23 | assert getattr(log_entry, k) == v 24 | assert log_entry.request_line == log_entry.directives["%r"] == b"Gh0st\xad" 25 | assert log_entry.remote_host == log_entry.directives["%h"] == b"66.240.205.34" 26 | 27 | 28 | def test_parse_latin1(): 29 | log_entry = LogParser(COMBINED).parse(ENTRY) 30 | for k, v in NON_STR_FIELDS.items(): 31 | assert getattr(log_entry, k) == v 32 | assert log_entry.request_line == log_entry.directives["%r"] == "Gh0st\xad" 33 | assert log_entry.remote_host == log_entry.directives["%h"] == "66.240.205.34" 34 | 35 | 36 | def test_parse_bad_utf8(): 37 | with pytest.raises(UnicodeDecodeError): 38 | LogParser(COMBINED, encoding="utf-8").parse(ENTRY) 39 | 40 | 41 | def test_parse_utf8_surrogateescape(): 42 | log_entry = LogParser(COMBINED, encoding="utf-8", errors="surrogateescape").parse( 43 | ENTRY 44 | ) 45 | for k, v in NON_STR_FIELDS.items(): 46 | assert getattr(log_entry, k) == v 47 | assert 
log_entry.request_line == log_entry.directives["%r"] == "Gh0st\udcad" 48 | assert log_entry.remote_host == log_entry.directives["%h"] == "66.240.205.34" 49 | 50 | 51 | @pytest.mark.parametrize("encoding", [None, "iso-8859-1", "utf-8"]) 52 | def test_parse_ip_address(encoding): 53 | assert ( 54 | LogParser("%a", encoding=encoding).parse("127.0.0.1").remote_address 55 | == "127.0.0.1" 56 | ) 57 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. module:: apachelogs 2 | 3 | ===================================== 4 | apachelogs — Parse Apache access logs 5 | ===================================== 6 | 7 | `GitHub `_ 8 | | `PyPI `_ 9 | | `Documentation `_ 10 | | `Issues `_ 11 | | :doc:`Changelog ` 12 | 13 | .. toctree:: 14 | :hidden: 15 | 16 | parser 17 | utils 18 | errors 19 | directives 20 | changelog 21 | 22 | `apachelogs` parses Apache access log files. Pass it a `log format string 23 | `_ and get back a 24 | parser for logfile entries in that format. `apachelogs` even takes care of 25 | decoding escape sequences and converting things like timestamps, integers, and 26 | bare hyphens to `~datetime.datetime` values, `int`\s, and `None`\s. 27 | 28 | 29 | Installation 30 | ============ 31 | `apachelogs` requires Python 3.10 or higher. Just use `pip 32 | `_ for Python 3 (You have pip, right?) to install 33 | `apachelogs` and its dependencies:: 34 | 35 | python3 -m pip install apachelogs 36 | 37 | 38 | Examples 39 | ======== 40 | 41 | Parse a single log entry: 42 | 43 | >>> from apachelogs import LogParser 44 | >>> parser = LogParser("%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"") 45 | >>> # The above log format is also available as the constant `apachelogs.COMBINED`. 46 | >>> entry = parser.parse('209.126.136.4 - - [01/Nov/2017:07:28:29 +0000] "GET / HTTP/1.1" 301 521 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"\n') 47 | >>> entry.remote_host 48 | '209.126.136.4' 49 | >>> entry.request_time 50 | datetime.datetime(2017, 11, 1, 7, 28, 29, tzinfo=datetime.timezone.utc) 51 | >>> entry.request_line 52 | 'GET / HTTP/1.1' 53 | >>> entry.final_status 54 | 301 55 | >>> entry.bytes_sent 56 | 521 57 | >>> entry.headers_in["Referer"] is None 58 | True 59 | >>> entry.headers_in["User-Agent"] 60 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36' 61 | >>> # Log entry components can also be looked up by directive: 62 | >>> entry.directives["%r"] 63 | 'GET / HTTP/1.1' 64 | >>> entry.directives["%>s"] 65 | 301 66 | >>> entry.directives["%t"] 67 | datetime.datetime(2017, 11, 1, 7, 28, 29, tzinfo=datetime.timezone.utc) 68 | 69 | Parse a file full of log entries: 70 | 71 | >>> with open('/var/log/apache2/access.log') as fp: # doctest: +SKIP 72 | ... for entry in parser.parse_lines(fp): 73 | ... print(str(entry.request_time), entry.request_line) 74 | ... 75 | 2019-01-01 12:34:56-05:00 GET / HTTP/1.1 76 | 2019-01-01 12:34:57-05:00 GET /favicon.ico HTTP/1.1 77 | 2019-01-01 12:34:57-05:00 GET /styles.css HTTP/1.1 78 | # etc. 
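Lines that do not match the log format cause `LogParser.parse()` to raise an `InvalidEntryError`; when reading a whole file, you can either catch that exception or pass ``ignore_invalid=True`` to `LogParser.parse_lines()` to skip such lines silently. The following is a small sketch of both approaches (the unparseable sample line and the log file path are only illustrative):

>>> from apachelogs import InvalidEntryError
>>> try:
...     parser.parse("this is not a log entry")
... except InvalidEntryError as e:
...     print("Unparseable line:", e.entry)
...
Unparseable line: this is not a log entry
>>> with open('/var/log/apache2/access.log') as fp:  # doctest: +SKIP
...     entries = list(parser.parse_lines(fp, ignore_invalid=True))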
79 | 80 | 81 | Indices and tables 82 | ================== 83 | * :ref:`genindex` 84 | * :ref:`search` 85 | -------------------------------------------------------------------------------- /src/apachelogs/__init__.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Parse Apache access logs 3 | 4 | ``apachelogs`` parses Apache access log files. Pass it a `log format string 5 | `_ and get back a 6 | parser for logfile entries in that format. ``apachelogs`` even takes care of 7 | decoding escape sequences and converting things like timestamps, integers, and 8 | bare hyphens to ``datetime`` values, ``int``\s, and ``None``\s. 9 | 10 | Visit or 11 | for more information. 12 | """ 13 | 14 | __version__ = "0.7.0.dev1" 15 | __author__ = "John Thorvald Wodder II" 16 | __author_email__ = "apachelogs@varonathe.org" 17 | __license__ = "MIT" 18 | __url__ = "https://github.com/jwodder/apachelogs" 19 | 20 | from .errors import ( 21 | Error, 22 | InvalidDirectiveError, 23 | InvalidEntryError, 24 | UnknownDirectiveError, 25 | ) 26 | from .parser import LogEntry, LogParser 27 | from .timeutil import parse_apache_timestamp 28 | 29 | __all__ = [ 30 | "COMBINED", 31 | "COMBINED_DEBIAN", 32 | "COMMON", 33 | "COMMON_DEBIAN", 34 | "Error", 35 | "InvalidDirectiveError", 36 | "InvalidEntryError", 37 | "LogEntry", 38 | "LogParser", 39 | "UnknownDirectiveError", 40 | "VHOST_COMBINED", 41 | "VHOST_COMMON", 42 | "parse", 43 | "parse_apache_timestamp", 44 | "parse_lines", 45 | ] 46 | 47 | #: Common log format (CLF) 48 | COMMON = '%h %l %u %t "%r" %>s %b' 49 | 50 | #: `COMMON` with virtual host prepended 51 | VHOST_COMMON = '%v %h %l %u %t "%r" %>s %b' 52 | 53 | #: NCSA extended/combined log format 54 | COMBINED = '%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i"' 55 | 56 | #: Like `COMMON`, but with ``%O`` (total bytes sent including headers) in place 57 | #: of ``%b`` (size of response excluding headers) 58 | COMMON_DEBIAN = '%h %l %u %t "%r" %>s %O' 59 | 60 | #: Like `COMBINED`, but with ``%O`` (total bytes sent including headers) in 61 | #: place of ``%b`` (size of response excluding headers) 62 | COMBINED_DEBIAN = '%h %l %u %t "%r" %>s %O "%{Referer}i" "%{User-Agent}i"' 63 | 64 | #: `COMBINED_DEBIAN` with virtual host & port prepended 65 | VHOST_COMBINED = '%v:%p %h %l %u %t "%r" %>s %O "%{Referer}i" "%{User-Agent}i"' 66 | 67 | 68 | def parse(format, entry, encoding="iso-8859-1", errors=None): # noqa: A002 69 | """ 70 | A convenience function for parsing a single logfile entry without having 71 | to directly create a `LogParser` object. 72 | 73 | ``encoding`` and ``errors`` have the same meaning as for `LogParser`. 74 | """ 75 | return LogParser(format, encoding=encoding, errors=errors).parse(entry) 76 | 77 | 78 | def parse_lines( 79 | format, # noqa: A002 80 | entries, 81 | encoding="iso-8859-1", 82 | errors=None, 83 | ignore_invalid=False, 84 | ): 85 | """ 86 | A convenience function for parsing an iterable of logfile entries without 87 | having to directly create a `LogParser` object. 88 | 89 | ``encoding`` and ``errors`` have the same meaning as for `LogParser`. 90 | ``ignore_invalid`` has the same meaning as for `LogParser.parse_lines()`. 
91 | """ 92 | return LogParser(format, encoding=encoding, errors=errors).parse_lines( 93 | entries, ignore_invalid 94 | ) 95 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | v0.7.0 (in development) 2 | ----------------------- 3 | - Support Python 3.14 4 | - Drop support for Python 3.8 and 3.9 5 | 6 | v0.6.1 (2024-12-01) 7 | ------------------- 8 | - Support Python 3.9, 3.10, 3.11, 3.12, and 3.13 9 | - Drop support for Python 3.5, 3.6, and 3.7 10 | - `LogEntry`'s `__eq__` method now returns `NotImplemented` instead of `False` 11 | when comparing against non-`LogEntry` values 12 | - Migrated from setuptools to hatch 13 | 14 | v0.6.0 (2020-10-13) 15 | ------------------- 16 | - Support Python 3.8 17 | - `%s` now matches any sequence of exactly three digits. Previously, it 18 | matched either '0' or any sequence of digits not beginning with '0'. Thanks 19 | to [@chosak](https://github.com/chosak) for the patch. 20 | 21 | v0.5.0 (2019-05-21) 22 | ------------------- 23 | - Improved the routine for assembling `request_time` from 24 | `request_time_fields`: 25 | - If the month is only available as a full or abbreviated name and the name 26 | is not in English, try looking it up in the current locale 27 | - If the year is only available in abbreviated form (the `%y` directive) 28 | without a century (`%C`), treat years less than 69 as part of the 29 | twenty-first century and other years as part of the twentieth 30 | - When necessary, use the values of the `%G`, `%g`, `%u`, `%V`, `%U`, `%W`, 31 | and `%w` time directives to derive the date 32 | - If `%Z` equals `"GMT"`, `"UTC"`, or one of the names in `time.tzname`, 33 | produce an aware `datetime` 34 | - `%{%n}t` and `%{%t}t` now match any amount of any whitespace, in order to 35 | match `strptime(3)`'s behavior 36 | - **Breaking**: Renamed the `request_time_fields` keys for `%{%G}t` and 37 | `%{%g}t` from `"week_year"` and `"abbrev_week_year"` to `"iso_year"` and 38 | `"abbrev_iso_year"`, respectively 39 | - `%{%p}t` can now match the empty string (its value in certain locales) 40 | - `%{%Z}t` can now match the empty string 41 | 42 | v0.4.0 (2019-05-19) 43 | ------------------- 44 | - Support the `%{c}h` log directive 45 | - `%f` and `%R` can now be `None` 46 | - **Bugfix**: `%u` can now match the string `""` (two double quotes) 47 | - Support `mod_ssl`'s `%{*}c` and `%{*}x` directives 48 | - Support the `%{hextid}P` directive (as a hexadecimal integer) 49 | - Support the `%L` and `%{c}L` directives 50 | - Parameters to `%{*}p`, `%{*}P`, and `%{*}T` are now treated 51 | case-insensitively in order to mirror Apache's behavior 52 | - Refined some directives to better match only the values emitted by Apache: 53 | - `%l` and `%m` no longer accept whitespace 54 | - `%s` and `%{tid}P` now only match unsigned integers 55 | - `%{*}C` no longer accepts semicolons or leading or trailing spaces 56 | - `%q` no longer accepts whitespace or pound/hash signs 57 | 58 | v0.3.0 (2019-05-12) 59 | ------------------- 60 | - Gave `LogEntry` a `directives` attribute for looking up directive values by 61 | the corresponding log format directives 62 | 63 | v0.2.0 (2019-05-09) 64 | ------------------- 65 | - Changed the capitalization of "User-agent" in the log format string constants 66 | to "User-Agent" 67 | - The `cookies`, `env_vars`, `headers_in`, `headers_out`, `notes`, 68 | `trailers_in`, and `trailers_out` attributes of `LogEntry` 
are now all 69 | case-insensitive `dict`s. 70 | 71 | v0.1.0 (2019-05-06) 72 | ------------------- 73 | Initial release 74 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. currentmodule:: apachelogs 2 | 3 | Changelog 4 | ========= 5 | 6 | v0.7.0 (in development) 7 | ----------------------- 8 | - Support Python 3.14 9 | - Drop support for Python 3.8 and 3.9 10 | 11 | 12 | v0.6.1 (2024-12-01) 13 | ------------------- 14 | - Support Python 3.9, 3.10, 3.11, 3.12, and 3.13 15 | - Drop support for Python 3.5, 3.6, and 3.7 16 | - `LogEntry`'s ``__eq__`` method now returns `NotImplemented` instead of 17 | `False` when comparing against non-`LogEntry` values 18 | - Migrated from setuptools to hatch 19 | 20 | 21 | v0.6.0 (2020-10-13) 22 | ------------------- 23 | - Support Python 3.8 24 | - ``%s`` now matches any sequence of exactly three digits. Previously, it 25 | matched either '0' or any sequence of digits not beginning with '0'. Thanks 26 | to `@chosak `_ for the patch. 27 | 28 | 29 | v0.5.0 (2019-05-21) 30 | ------------------- 31 | - Improved the routine for assembling ``request_time`` from 32 | ``request_time_fields``: 33 | 34 | - If the month is only available as a full or abbreviated name and the name 35 | is not in English, try looking it up in the current locale 36 | - If the year is only available in abbreviated form (the ``%y`` directive) 37 | without a century (``%C``), treat years less than 69 as part of the 38 | twenty-first century and other years as part of the twentieth 39 | - When necessary, use the values of the ``%G``, ``%g``, ``%u``, ``%V``, 40 | ``%U``, ``%W``, and ``%w`` time directives to derive the date 41 | - If ``%Z`` equals ``"GMT"``, ``"UTC"``, or one of the names in `time.tzname`, 42 | produce an aware `~datetime.datetime` 43 | 44 | - ``%{%n}t`` and ``%{%t}t`` now match any amount of any whitespace, in order to 45 | match :manpage:`strptime(3)`'s behavior 46 | - **Breaking**: Renamed the ``request_time_fields`` keys for ``%{%G}t`` and 47 | ``%{%g}t`` from ``"week_year"`` and ``"abbrev_week_year"`` to ``"iso_year"`` 48 | and ``"abbrev_iso_year"``, respectively 49 | - ``%{%p}t`` can now match the empty string (its value in certain locales) 50 | - ``%{%Z}t`` can now match the empty string 51 | 52 | 53 | v0.4.0 (2019-05-19) 54 | ------------------- 55 | - Support the ``%{c}h`` log directive 56 | - ``%f`` and ``%R`` can now be `None` 57 | - **Bugfix**: ``%u`` can now match the string ``""`` (two double quotes) 58 | - Support ``mod_ssl``'s ``%{*}c`` and ``%{*}x`` directives 59 | - Support the ``%{hextid}P`` directive (as a hexadecimal integer) 60 | - Support the ``%L`` and ``%{c}L`` directives 61 | - Parameters to ``%{*}p``, ``%{*}P``, and ``%{*}T`` are now treated 62 | case-insensitively in order to mirror Apache's behavior 63 | - Refined some directives to better match only the values emitted by Apache: 64 | 65 | - ``%l`` and ``%m`` no longer accept whitespace 66 | - ``%s`` and ``%{tid}P`` now only match unsigned integers 67 | - ``%{*}C`` no longer accepts semicolons or leading or trailing spaces 68 | - ``%q`` no longer accepts whitespace or pound/hash signs 69 | 70 | 71 | v0.3.0 (2019-05-12) 72 | ------------------- 73 | - Gave `LogEntry` a `~LogEntry.directives` attribute for looking up directive 74 | values by the corresponding log format directives 75 | 76 | 77 | v0.2.0 (2019-05-09) 78 | ------------------- 79 | - Changed the 
capitalization of "User-agent" in the log format string constants 80 | to "User-Agent" 81 | - The ``cookies``, ``env_vars``, ``headers_in``, ``headers_out``, ``notes``, 82 | ``trailers_in``, and ``trailers_out`` attributes of `LogEntry` are now all 83 | case-insensitive `dict`\s. 84 | 85 | 86 | v0.1.0 (2019-05-06) 87 | ------------------- 88 | Initial release 89 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |repostatus| |ci-status| |coverage| |pyversions| |license| 2 | 3 | .. |repostatus| image:: https://www.repostatus.org/badges/latest/active.svg 4 | :target: https://www.repostatus.org/#active 5 | :alt: Project Status: Active — The project has reached a stable, usable 6 | state and is being actively developed. 7 | 8 | .. |ci-status| image:: https://github.com/jwodder/apachelogs/actions/workflows/test.yml/badge.svg 9 | :target: https://github.com/jwodder/apachelogs/actions/workflows/test.yml 10 | :alt: CI Status 11 | 12 | .. |coverage| image:: https://codecov.io/gh/jwodder/apachelogs/branch/master/graph/badge.svg 13 | :target: https://codecov.io/gh/jwodder/apachelogs 14 | 15 | .. |pyversions| image:: https://img.shields.io/pypi/pyversions/apachelogs.svg 16 | :target: https://pypi.org/project/apachelogs/ 17 | 18 | .. |license| image:: https://img.shields.io/github/license/jwodder/apachelogs.svg 19 | :target: https://opensource.org/licenses/MIT 20 | :alt: MIT License 21 | 22 | `GitHub `_ 23 | | `PyPI `_ 24 | | `Documentation `_ 25 | | `Issues `_ 26 | | `Changelog `_ 27 | 28 | ``apachelogs`` parses Apache access log files. Pass it a `log format string 29 | `_ and get back a 30 | parser for logfile entries in that format. ``apachelogs`` even takes care of 31 | decoding escape sequences and converting things like timestamps, integers, and 32 | bare hyphens to ``datetime`` values, ``int``\s, and ``None``\s. 33 | 34 | 35 | Installation 36 | ============ 37 | ``apachelogs`` requires Python 3.10 or higher. Just use `pip 38 | `_ for Python 3 (You have pip, right?) to install 39 | ``apachelogs`` and its dependencies:: 40 | 41 | python3 -m pip install apachelogs 42 | 43 | 44 | Examples 45 | ======== 46 | 47 | Parse a single log entry: 48 | 49 | >>> from apachelogs import LogParser 50 | >>> parser = LogParser("%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"") 51 | >>> # The above log format is also available as the constant `apachelogs.COMBINED`. 
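>>> # For instance, an equivalent parser can be built directly from that constant:
>>> from apachelogs import COMBINED
>>> parser = LogParser(COMBINED)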
52 | >>> entry = parser.parse('209.126.136.4 - - [01/Nov/2017:07:28:29 +0000] "GET / HTTP/1.1" 301 521 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"\n') 53 | >>> entry.remote_host 54 | '209.126.136.4' 55 | >>> entry.request_time 56 | datetime.datetime(2017, 11, 1, 7, 28, 29, tzinfo=datetime.timezone.utc) 57 | >>> entry.request_line 58 | 'GET / HTTP/1.1' 59 | >>> entry.final_status 60 | 301 61 | >>> entry.bytes_sent 62 | 521 63 | >>> entry.headers_in["Referer"] is None 64 | True 65 | >>> entry.headers_in["User-Agent"] 66 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36' 67 | >>> # Log entry components can also be looked up by directive: 68 | >>> entry.directives["%r"] 69 | 'GET / HTTP/1.1' 70 | >>> entry.directives["%>s"] 71 | 301 72 | >>> entry.directives["%t"] 73 | datetime.datetime(2017, 11, 1, 7, 28, 29, tzinfo=datetime.timezone.utc) 74 | 75 | Parse a file full of log entries: 76 | 77 | >>> with open('/var/log/apache2/access.log') as fp: # doctest: +SKIP 78 | ... for entry in parser.parse_lines(fp): 79 | ... print(str(entry.request_time), entry.request_line) 80 | ... 81 | 2019-01-01 12:34:56-05:00 GET / HTTP/1.1 82 | 2019-01-01 12:34:57-05:00 GET /favicon.ico HTTP/1.1 83 | 2019-01-01 12:34:57-05:00 GET /styles.css HTTP/1.1 84 | # etc. 85 | -------------------------------------------------------------------------------- /src/apachelogs/util.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import re 3 | 4 | #: `collections.namedtuple` class for describing how to match various types and 5 | #: how to convert them from strings. The two attributes are ``regex``, a regex 6 | #: string, and ``converter``, a function that takes a `str` and returns a 7 | #: value of the appropriate type. 8 | FieldType = namedtuple("FieldType", "regex converter") 9 | 10 | 11 | def clf(ftype): 12 | """ 13 | Convert a `FieldType` instance to one whose ``regex`` accepts the string 14 | ``-`` and whose ``converter`` converts the string ``-`` to `None` 15 | 16 | :param FieldType ftype: 17 | :rtype: FieldType 18 | """ 19 | return FieldType( 20 | regex=rf"(?:{ftype.regex}|-)", 21 | converter=lambda s: None if s == "-" else ftype.converter(s), 22 | ) 23 | 24 | 25 | def unescape(s): 26 | """ 27 | Unescape the escape sequences in the string ``s``, returning a `bytes` 28 | string 29 | """ 30 | # Escape sequences used by Apache: \b \n \r \t \v \\ \" \xHH 31 | # cf. 
ap_escape_logitem() in server/util.c 32 | return re.sub(r"\\(x[0-9A-Fa-f]{2}|.)", _unesc, s).encode("iso-8859-1") 33 | 34 | 35 | _unescapes = { 36 | "t": "\t", 37 | "n": "\n", 38 | "r": "\r", 39 | "b": "\b", 40 | "v": "\v", 41 | # Not emitted by Apache (as of v2.4), but other servers might use it: 42 | "f": "\f", 43 | } 44 | 45 | 46 | def _unesc(m): 47 | esc = m.group(1) 48 | if esc[0] == "x": 49 | return chr(int(esc[1:], 16)) 50 | else: 51 | return _unescapes.get(esc, esc) 52 | 53 | 54 | #: Regex matching a base-10 integer from 0 to 255 55 | BYTE = r"(?:[1-9]?[0-9]|[1-9][0-9][0-9]|2[0-4][0-9]|25[0-5])" 56 | 57 | #: Regex matching one to four hexadecimal digits 58 | HEXTET = r"[0-9A-Fa-f]{1,4}" 59 | 60 | #: Regex for an IPv4 address 61 | IPv4 = rf"{BYTE}(?:\.{BYTE}){{3}}" 62 | 63 | #: Regex for an IP address, either IPv4 or IPv6 64 | IP_ADDRESS_RGX = ( 65 | f"{IPv4}" 66 | # Adapted from : 68 | f"|(?:{HEXTET}:){{7}}(?:{HEXTET}|:)" 69 | f"|(?:{HEXTET}:){{6}}(?::{HEXTET}|{IPv4}|:)" 70 | f"|(?:{HEXTET}:){{5}}(?:(?::{HEXTET}){{1,2}}|:{IPv4}|:)" 71 | f"|(?:{HEXTET}:){{4}}(?:(?::{HEXTET}){{1,3}}|(?::{HEXTET})?:{IPv4}|:)" 72 | f"|(?:{HEXTET}:){{3}}(?:(?::{HEXTET}){{1,4}}|(?::{HEXTET}){{0,2}}:{IPv4}|:)" 73 | f"|(?:{HEXTET}:){{2}}(?:(?::{HEXTET}){{1,5}}|(?::{HEXTET}){{0,3}}:{IPv4}|:)" 74 | f"|(?:{HEXTET}:){{1}}(?:(?::{HEXTET}){{1,6}}|(?::{HEXTET}){{0,4}}:{IPv4}|:)" 75 | f"|:(?:(?::{HEXTET}){{1,7}}|(?::{HEXTET}){{0,5}}:{IPv4}|:)" 76 | ) 77 | 78 | #: `FieldType` instance for an IP address, either IPv4 or IPv6 79 | ip_address = FieldType(IP_ADDRESS_RGX, str) 80 | 81 | #: `FieldType` instance for a base-10 integer 82 | integer = FieldType(r"(?:0|-?[1-9][0-9]*)", int) 83 | 84 | #: `FieldType` instance for an unsigned base-10 integer 85 | uinteger = FieldType(r"(?:0|[1-9][0-9]*)", int) 86 | 87 | #: `FieldType` instance for a 3-digit integer HTTP status code 88 | status_code = FieldType(r"[0-9]{3}", int) 89 | 90 | #: `FieldType` instance for a string containing escape sequences that is 91 | #: converted to `bytes` 92 | esc_string = FieldType( 93 | # The following characters are escaped in logfiles: ", \, control 94 | # characters (everything accepted by `apr_iscntrl` = `iscntrl`, i.e., 95 | # everything less than 0x20 and also 0x7F), non-printable characters 96 | # (everything rejected by `apr_isprint` = `isprint`, i.e., control 97 | # characters plus everything over 0x7F). 98 | # cf. server/gen_test_char.c 99 | r"(?:[ !\x23-\x5B\x5D-\x7E]|\\x[0-9A-Fa-f]{2}|\\.)*?", 100 | unescape, 101 | ) 102 | 103 | #: Like `esc_string`, but without any whitespace. (Whitespace escape sequences 104 | #: are still allowed just because it's easier.) 105 | esc_word = FieldType( 106 | r"(?:[!\x23-\x5B\x5D-\x7E]|\\x[0-9A-Fa-f]{2}|\\.)*?", 107 | unescape, 108 | ) 109 | 110 | #: `FieldType` instance for a "Common Log Format" string: either a string with 111 | #: escape sequences or else a single hyphen, representing `None` 112 | clf_string = clf(esc_string) 113 | 114 | #: Like `clf_string`, but without any whitespace. (Whitespace escape sequences 115 | #: are still allowed just because it's easier.) 116 | clf_word = clf(esc_word) 117 | 118 | #: `FieldType` instance for a remote user (directive ``%u``). This is the same 119 | #: as `clf_string`, but ``""`` (two double-quotes) is accepted and converted to 120 | #: an empty string, as that is how ``%u`` represents empty names. 
121 | remote_user = FieldType( 122 | rf'(?:{clf_string.regex}|"")', 123 | lambda s: clf_string.converter("" if s == '""' else s), 124 | ) 125 | 126 | #: Regex for a single non-space atom in a cookie value; this is the same as an 127 | #: `esc_word` atom, except semicolons are not matched 128 | CRUMB = r"(?:[!\x23-\x3A\x3C-\x5B\x5D-\x7E]|\\x[0-9A-Fa-f]{2}|\\.)" 129 | 130 | #: `FieldType` instance for a cookie value; like `clf_string`, but with no 131 | #: leading or trailing spaces and no semicolons 132 | cookie_value = clf( 133 | FieldType( 134 | rf"{CRUMB}(?:(?:{CRUMB}|[ ])*{CRUMB})?", 135 | unescape, 136 | ) 137 | ) 138 | -------------------------------------------------------------------------------- /src/apachelogs/strftime.py: -------------------------------------------------------------------------------- 1 | # cf. 2 | 3 | # Apache implements `%{*}t` via `apr_strftime()`, which just calls the native 4 | # platform's `strftime()`. 5 | 6 | from datetime import datetime 7 | import re 8 | from .timeutil import parse_apache_timestamp 9 | from .util import FieldType, integer 10 | 11 | YEAR = r"[0-9]{4,}" 12 | MONTH = r"(?:0[1-9]|1[012])" 13 | MDAY = r"(?:[ 0][1-9]|[12][0-9]|3[01])" 14 | HOUR = r"(?:[ 01][0-9]|2[0-3])" 15 | HOUR12 = r"(?:[ 0][1-9]|1[0-2])" 16 | MINUTE = r"[ 0-5][0-9]" 17 | SECOND = r"(?:[0-5][0-9]|60)" 18 | 19 | WEEKNUM = r"(?:[0-4][0-9]|5[0-3])" # 00-53 20 | ISO_WEEKNUM = r"(?:0[1-9]|[1-4][0-9]|5[0-3])" # 01-53 21 | 22 | # All strftime converters must pass `None` through unmodified in order to 23 | # handle directives like `%200{%Y-%m-%d}t` matching "-". 24 | 25 | word = FieldType(r"\w+", lambda s: s) 26 | word0 = FieldType(r"\w*", lambda s: s) 27 | 28 | 29 | def none_int(s): 30 | return None if s is None else int(s) 31 | 32 | 33 | none_integer = integer._replace(converter=none_int) 34 | 35 | STRFTIME_DIRECTIVES = { 36 | "%": (None, FieldType("%", None)), 37 | "a": ("abbrev_wday", word), 38 | "A": ("full_wday", word), 39 | "b": ("abbrev_mon", word), 40 | "B": ("full_mon", word), 41 | "C": ("century", FieldType(r"[0-9]{2,}", none_int)), 42 | "d": ("mday", FieldType(MDAY, none_int)), 43 | "D": ( 44 | "date", 45 | FieldType( 46 | f"{MONTH}/{MDAY}/[0-9][0-9]", 47 | lambda s: s and datetime.strptime(s, "%m/%d/%y").date(), 48 | ), 49 | ), 50 | "e": ("mday", FieldType(MDAY, none_int)), 51 | "F": ( 52 | "date", 53 | FieldType( 54 | f"{YEAR}-{MONTH}-{MDAY}", 55 | lambda s: s and datetime.strptime(s, "%Y-%m-%d").date(), 56 | ), 57 | ), 58 | "g": ("abbrev_iso_year", FieldType(r"[0-9][0-9]", none_int)), 59 | "G": ("iso_year", FieldType(YEAR, none_int)), 60 | "h": ("abbrev_mon", word), 61 | "H": ("hour", FieldType(HOUR, none_int)), 62 | "I": ("hour12", FieldType(HOUR12, none_int)), 63 | "j": ( 64 | "yday", 65 | FieldType( 66 | # 001−366: 67 | "0(?:0[1-9]|[1-9][0-9])|[12][0-9][0-9]|3(?:[0-5][0-9]|6[0-6])", 68 | none_int, 69 | ), 70 | ), 71 | "m": ("mon", FieldType(MONTH, none_int)), 72 | "M": ("min", FieldType(MINUTE, none_int)), 73 | "n": (None, FieldType(r"\s*", None)), 74 | # `%p` is the empty string in certain locales (e.g., de_DE.utf8 on Ubuntu 75 | # Bionic) 76 | "p": ("am_pm", word0), 77 | "R": ( 78 | "hour_min", 79 | FieldType( 80 | f"{HOUR}:{MINUTE}", 81 | lambda s: s and datetime.strptime(s, "%H:%M").time(), 82 | ), 83 | ), 84 | "s": ("epoch", none_integer), 85 | "S": ("sec", FieldType(SECOND, none_int)), 86 | "t": (None, FieldType(r"\s*", None)), 87 | "T": ( 88 | "time", 89 | FieldType( 90 | f"{HOUR}:{MINUTE}:{SECOND}", 91 | lambda s: s and datetime.strptime(s, 
"%H:%M:%S").time(), 92 | ), 93 | ), 94 | "u": ("iso_wday", FieldType(r"[1-7]", none_int)), 95 | "U": ("sunday_weeknum", FieldType(WEEKNUM, none_int)), 96 | "V": ("iso_weeknum", FieldType(ISO_WEEKNUM, none_int)), 97 | "w": ("wday", FieldType(r"[0-6]", none_int)), 98 | "W": ("monday_weeknum", FieldType(WEEKNUM, none_int)), 99 | "y": ("abbrev_year", FieldType(r"[0-9][0-9]", none_int)), 100 | "Y": ("year", FieldType(YEAR, none_int)), 101 | "z": ( 102 | "timezone", 103 | FieldType( 104 | r"(?:[-+](?:[01][0-9]|2[0-3])[0-5][0-9])?", 105 | lambda s: datetime.strptime(s, "%z").tzinfo if s else None, 106 | ), 107 | ), 108 | "Z": ("tzname", word0), 109 | # 'c': # C locale: %a %b %e %T %Y 110 | # 'r': # C locale: %I:%M:%S %p 111 | # 'x': # C locale: %m/%d/%y 112 | # 'X': # C locale: %T 113 | # 'E*', 'O*': No. 114 | } 115 | 116 | SPECIAL_PARAMETERS = { 117 | "": ("timestamp", FieldType(r"\[[^]]+\]", parse_apache_timestamp)), 118 | "sec": ("epoch", none_integer), 119 | "msec": ("milliepoch", none_integer), 120 | "usec": ("microepoch", none_integer), 121 | "msec_frac": ("msec_frac", FieldType(r"[0-9]{3}", none_int)), 122 | "usec_frac": ("usec_frac", FieldType(r"[0-9]{6}", none_int)), 123 | } 124 | 125 | 126 | def strftime2regex(param): 127 | m = re.match(r"^(begin|end)(?::|\Z)", param) 128 | if m: 129 | param = param[m.end() :] 130 | prefix = m.group(1) + "_" 131 | modifier = m.group(0) 132 | else: 133 | prefix = "" 134 | modifier = "" 135 | if param in SPECIAL_PARAMETERS: 136 | name, dtype = SPECIAL_PARAMETERS[param] 137 | return ( 138 | [ 139 | ( 140 | (prefix + "request_time_fields", name), 141 | modifier + param, 142 | dtype.converter, 143 | ) 144 | ], 145 | rf"({dtype.regex})", 146 | ) 147 | else: 148 | from .directives import format2regex 149 | 150 | groups, rgx = format2regex(param, STRFTIME_DIRECTIVES, {}, simple=True) 151 | groups = [ 152 | ( 153 | (prefix + "request_time_fields", name), 154 | modifier + directive, 155 | converter, 156 | ) 157 | for (name, directive, converter) in groups 158 | ] 159 | return (groups, rgx) 160 | -------------------------------------------------------------------------------- /test/test_parse_apache_timestamp.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | import pytest 3 | from apachelogs import parse_apache_timestamp 4 | 5 | 6 | def mktz(hours, mins=0): 7 | return timezone(timedelta(hours=hours, minutes=mins)) 8 | 9 | 10 | e8 = mktz(8) 11 | e7 = mktz(7) 12 | e5 = mktz(5) 13 | utc = timezone.utc 14 | w4 = mktz(-4) 15 | w5 = mktz(-5) 16 | w7 = mktz(-7) 17 | w8 = mktz(-8) 18 | 19 | 20 | @pytest.mark.parametrize( 21 | "ts,dt", 22 | [ 23 | ("31/Dec/1969:19:00:00 +0500", datetime(1969, 12, 31, 19, 0, 0, tzinfo=e5)), 24 | ("[31/Dec/1969:19:00:00 +0500]", datetime(1969, 12, 31, 19, 0, 0, tzinfo=e5)), 25 | ("[31/Dec/1969:19:00:00 +0500", datetime(1969, 12, 31, 19, 0, 0, tzinfo=e5)), 26 | ("31/Dec/1969:19:00:00 +0500]", datetime(1969, 12, 31, 19, 0, 0, tzinfo=e5)), 27 | ("31/Dec/1969:19:00:00 -0500", datetime(1969, 12, 31, 19, 0, 0, tzinfo=w5)), 28 | ( 29 | "31/Dec/1969:19:00:00 +0130", 30 | datetime(1969, 12, 31, 19, 0, 0, tzinfo=mktz(1, 30)), 31 | ), 32 | ( 33 | "31/Dec/1969:19:00:00 +0030", 34 | datetime(1969, 12, 31, 19, 0, 0, tzinfo=mktz(0, 30)), 35 | ), 36 | ( 37 | "31/Dec/1969:19:00:00 -0030", 38 | datetime(1969, 12, 31, 19, 0, 0, tzinfo=mktz(-0, -30)), 39 | ), 40 | ( 41 | "31/Dec/1969:19:00:00 -0130", 42 | datetime(1969, 12, 31, 19, 0, 0, tzinfo=mktz(-1, -30)), 43 | ), 
44 | ("02/Apr/2006:01:59:59 +0800", datetime(2006, 4, 2, 1, 59, 59, tzinfo=e8)), 45 | ("02/Apr/2006:01:59:59 -0800", datetime(2006, 4, 2, 1, 59, 59, tzinfo=w8)), 46 | ("02/Apr/2006:02:30:00 +0700", datetime(2006, 4, 2, 2, 30, 0, tzinfo=e7)), 47 | ("02/Apr/2006:02:30:00 -0700", datetime(2006, 4, 2, 2, 30, 0, tzinfo=w7)), 48 | ("02/Apr/2006:03:00:01 +0700", datetime(2006, 4, 2, 3, 0, 1, tzinfo=e7)), 49 | ("02/Apr/2006:03:00:01 -0700", datetime(2006, 4, 2, 3, 0, 1, tzinfo=w7)), 50 | ("29/Oct/2006:00:59:59 +0700", datetime(2006, 10, 29, 0, 59, 59, tzinfo=e7)), 51 | ("29/Oct/2006:00:59:59 -0700", datetime(2006, 10, 29, 0, 59, 59, tzinfo=w7)), 52 | ("29/Oct/2006:01:30:00 +0700", datetime(2006, 10, 29, 1, 30, 0, tzinfo=e7)), 53 | ("29/Oct/2006:01:30:00 -0700", datetime(2006, 10, 29, 1, 30, 0, tzinfo=w7)), 54 | ("29/Oct/2006:01:30:00 +0800", datetime(2006, 10, 29, 1, 30, 0, tzinfo=e8)), 55 | ("29/Oct/2006:01:30:00 -0800", datetime(2006, 10, 29, 1, 30, 0, tzinfo=w8)), 56 | ("29/Oct/2006:02:00:01 +0800", datetime(2006, 10, 29, 2, 0, 1, tzinfo=e8)), 57 | ("29/Oct/2006:02:00:01 -0800", datetime(2006, 10, 29, 2, 0, 1, tzinfo=w8)), 58 | ("13/Feb/2009:18:31:30 +0500", datetime(2009, 2, 13, 18, 31, 30, tzinfo=e5)), 59 | ("13/Feb/2009:18:31:30 -0500", datetime(2009, 2, 13, 18, 31, 30, tzinfo=w5)), 60 | ("01/Jan/2016:00:00:00 -0500", datetime(2016, 1, 1, 0, 0, 0, tzinfo=w5)), 61 | ("03/Jan/2016:06:00:00 -0500", datetime(2016, 1, 3, 6, 0, 0, tzinfo=w5)), 62 | ("04/Jan/2016:00:00:00 -0500", datetime(2016, 1, 4, 0, 0, 0, tzinfo=w5)), 63 | ("05/Jan/2016:01:00:00 -0500", datetime(2016, 1, 5, 1, 0, 0, tzinfo=w5)), 64 | ("06/Jan/2016:02:00:00 -0500", datetime(2016, 1, 6, 2, 0, 0, tzinfo=w5)), 65 | ("07/Jan/2016:03:00:00 -0500", datetime(2016, 1, 7, 3, 0, 0, tzinfo=w5)), 66 | ("08/Jan/2016:04:00:00 -0500", datetime(2016, 1, 8, 4, 0, 0, tzinfo=w5)), 67 | ("09/Jan/2016:05:00:00 -0500", datetime(2016, 1, 9, 5, 0, 0, tzinfo=w5)), 68 | ("02/Feb/2016:02:02:02 -0500", datetime(2016, 2, 2, 2, 2, 2, tzinfo=w5)), 69 | ("29/Feb/2016:03:14:15 -0500", datetime(2016, 2, 29, 3, 14, 15, tzinfo=w5)), 70 | ("03/Mar/2016:03:03:03 -0500", datetime(2016, 3, 3, 3, 3, 3, tzinfo=w5)), 71 | ("13/Mar/2016:01:59:59 -0500", datetime(2016, 3, 13, 1, 59, 59, tzinfo=w5)), 72 | ("13/Mar/2016:03:00:01 -0400", datetime(2016, 3, 13, 3, 0, 1, tzinfo=w4)), 73 | ("13/Mar/2016:03:30:00 -0400", datetime(2016, 3, 13, 3, 30, 0, tzinfo=w4)), 74 | ("04/Apr/2016:04:04:04 -0400", datetime(2016, 4, 4, 4, 4, 4, tzinfo=w4)), 75 | ("05/May/2016:05:05:05 -0400", datetime(2016, 5, 5, 5, 5, 5, tzinfo=w4)), 76 | ("13/May/2016:13:13:13 -0400", datetime(2016, 5, 13, 13, 13, 13, tzinfo=w4)), 77 | ("06/Jun/2016:06:06:06 -0400", datetime(2016, 6, 6, 6, 6, 6, tzinfo=w4)), 78 | ("07/Jul/2016:07:07:07 -0400", datetime(2016, 7, 7, 7, 7, 7, tzinfo=w4)), 79 | ("08/Aug/2016:08:08:08 -0400", datetime(2016, 8, 8, 8, 8, 8, tzinfo=w4)), 80 | ("09/Sep/2016:09:09:09 -0400", datetime(2016, 9, 9, 9, 9, 9, tzinfo=w4)), 81 | ("10/Oct/2016:10:10:10 -0400", datetime(2016, 10, 10, 10, 10, 10, tzinfo=w4)), 82 | ("06/Nov/2016:00:59:59 -0400", datetime(2016, 11, 6, 0, 59, 59, tzinfo=w4)), 83 | ("06/Nov/2016:01:30:00 -0400", datetime(2016, 11, 6, 1, 30, 0, tzinfo=w4)), 84 | ("06/Nov/2016:01:59:59 -0400", datetime(2016, 11, 6, 1, 59, 59, tzinfo=w4)), 85 | ("06/Nov/2016:01:00:01 -0500", datetime(2016, 11, 6, 1, 0, 1, tzinfo=w5)), 86 | ("06/Nov/2016:01:30:00 -0500", datetime(2016, 11, 6, 1, 30, 0, tzinfo=w5)), 87 | ("06/Nov/2016:02:00:01 -0500", datetime(2016, 11, 6, 2, 0, 1, tzinfo=w5)), 88 | 
("07/Nov/2016:15:29:40 -0500", datetime(2016, 11, 7, 15, 29, 40, tzinfo=w5)), 89 | ("11/Nov/2016:11:11:11 -0500", datetime(2016, 11, 11, 11, 11, 11, tzinfo=w5)), 90 | ("12/Dec/2016:12:12:12 -0500", datetime(2016, 12, 12, 12, 12, 12, tzinfo=w5)), 91 | ("01/Nov/2017:07:28:29 +0000", datetime(2017, 11, 1, 7, 28, 29, tzinfo=utc)), 92 | ("01/Nov/2017:07:28:29 -0400", datetime(2017, 11, 1, 7, 28, 29, tzinfo=w4)), 93 | ("05/Nov/2017:01:01:01 -0400", datetime(2017, 11, 5, 1, 1, 1, tzinfo=w4)), 94 | ("05/Nov/2017:01:59:59 -0400", datetime(2017, 11, 5, 1, 59, 59, tzinfo=w4)), 95 | ("05/Nov/2017:02:01:01 -0400", datetime(2017, 11, 5, 2, 1, 1, tzinfo=w4)), 96 | ("05/Nov/2017:01:01:01 -0500", datetime(2017, 11, 5, 1, 1, 1, tzinfo=w5)), 97 | ("05/Nov/2017:01:59:59 -0500", datetime(2017, 11, 5, 1, 59, 59, tzinfo=w5)), 98 | ("05/Nov/2017:02:01:01 -0500", datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5)), 99 | ], 100 | ) 101 | def test_parse_apache_timestamp(ts, dt): 102 | apts = parse_apache_timestamp(ts) 103 | assert apts == dt 104 | assert apts.replace(tzinfo=None) == dt.replace(tzinfo=None) 105 | assert apts.tzinfo == dt.tzinfo 106 | 107 | 108 | @pytest.mark.parametrize( 109 | "ts", 110 | [ 111 | "13/Mar/2016:01:59:59 -05:00", 112 | "13/Mar/2016:01:59:59", 113 | "13/Mar/2016:01:59:59 -05", 114 | "13/03/2016:01:59:59 -0500", 115 | "13/Mar/2016 01:59:59 -0500", 116 | "13/Mar/2016T01:59:59 -0500", 117 | "13/Sma/2016:01:59:59 -0500", 118 | ], 119 | ) 120 | def test_parse_bad_apache_timestamp(ts): 121 | with pytest.raises(ValueError) as excinfo: 122 | parse_apache_timestamp(ts) 123 | assert str(excinfo.value) == ts 124 | -------------------------------------------------------------------------------- /src/apachelogs/parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | import attr 3 | from pydicti import dicti 4 | from .directives import format2regex 5 | from .errors import InvalidEntryError 6 | from .timeutil import assemble_datetime 7 | 8 | # The parameterized directives corresponding to the following `dict` attributes 9 | # all look up their parameters case-insensitively (either because Apache stores 10 | # the corresponding data in a case-insensitive dictionary structure or (in the 11 | # case of 'cookies') because Apache parses the relevant header 12 | # case-insensitively). As a result, we want to store the directive values in 13 | # case-insensitive `dict`s. 14 | NOCASEDICTS = { 15 | "cookies", 16 | "cryptography", 17 | "env_vars", 18 | "headers_in", 19 | "headers_out", 20 | "notes", 21 | "trailers_in", 22 | "trailers_out", 23 | "variables", 24 | } 25 | 26 | 27 | @attr.s 28 | class LogParser: 29 | """ 30 | A class for parsing Apache access log entries in a given log format. 31 | Instantiate with a log format string, and then use the `~LogParser.parse()` 32 | and/or `~LogParser.parse_lines()` methods to parse log entries in that 33 | format. 34 | 35 | :param str format: an Apache log format 36 | :param str encoding: The encoding to use for decoding certain strings in 37 | log entries (see :ref:`directives`); defaults to ``'iso-8859-1'``. Set 38 | to ``'bytes'`` to cause the strings to be returned as `bytes` values 39 | instead of `str`. 
40 | :param str errors: the error handling scheme to use when decoding; defaults 41 | to ``'strict'`` 42 | :raises InvalidDirectiveError: if an invalid directive occurs in ``format`` 43 | :raises UnknownDirectiveError: if an unknown directive occurs in ``format`` 44 | """ 45 | 46 | format = attr.ib() # noqa: A003 47 | encoding = attr.ib(default="iso-8859-1") 48 | errors = attr.ib(default=None) 49 | 50 | def __attrs_post_init__(self): 51 | self._group_defs, self._rgx = format2regex(self.format) 52 | self._rgx = re.compile(self._rgx) 53 | 54 | def parse(self, entry): 55 | """ 56 | Parse an access log entry according to the log format and return a 57 | `LogEntry` object. 58 | 59 | :param str entry: an access log entry to parse 60 | :rtype: LogEntry 61 | :raises InvalidEntryError: if ``entry`` does not match the log format 62 | """ 63 | entry = entry.rstrip("\r\n") 64 | m = self._rgx.fullmatch(entry) 65 | if not m: 66 | raise InvalidEntryError(entry, self.format) 67 | groups = [conv(gr) for (_, _, conv), gr in zip(self._group_defs, m.groups())] 68 | if self.encoding != "bytes": 69 | groups = [ 70 | ( 71 | gr.decode(self.encoding, self.errors or "strict") 72 | if isinstance(gr, bytes) 73 | else gr 74 | ) 75 | for gr in groups 76 | ] 77 | return LogEntry( 78 | entry, 79 | self.format, 80 | [gdef[:2] for gdef in self._group_defs], 81 | groups, 82 | ) 83 | 84 | def parse_lines(self, entries, ignore_invalid=False): 85 | r""" 86 | Parse the elements in an iterable of access log entries (e.g., an open 87 | text file handle) and return a generator of `LogEntry`\s. If 88 | ``ignore_invalid`` is `True`, any entries that do not match the log 89 | format will be silently discarded; otherwise, such an entry will cause 90 | an `InvalidEntryError` to be raised. 91 | 92 | :param entries: an iterable of `str` 93 | :param bool ignore_invalid: whether to silently discard entries that do 94 | not match the log format 95 | :rtype: `LogEntry` generator 96 | :raises InvalidEntryError: if an element of ``entries`` does not match 97 | the log format and ``ignore_invalid`` is `False` 98 | """ 99 | for e in entries: 100 | try: 101 | yield self.parse(e) 102 | except InvalidEntryError: 103 | if not ignore_invalid: 104 | raise 105 | 106 | 107 | class LogEntry: 108 | """ 109 | A parsed Apache access log entry. The value associated with each directive 110 | in the log format is stored as an attribute on the `LogEntry` object; for 111 | example, if the log format contains a ``%s`` directive, the `LogEntry` for 112 | a parsed entry will have a ``status`` attribute containing the status value 113 | from the entry as an `int`. See :ref:`directives` for the attribute names 114 | & types of each directive supported by this library. 115 | 116 | If the log format contains two or more directives that are stored in the 117 | same attribute (e.g., ``%D`` and ``%{us}T``), the given attribute will 118 | contain the first non-`None` directive value. 119 | 120 | The values of date & time directives are stored in a ``request_time_fields: 121 | dict`` attribute. If this `dict` contains enough information to assemble a 122 | complete (possibly naïve) `datetime.datetime`, then the `LogEntry` will 123 | have a ``request_time`` attribute equal to that `datetime.datetime`. 124 | """ 125 | 126 | def __init__(self, entry, format, group_names, groups): # noqa: A002 127 | #: The original logfile entry with trailing newlines removed 128 | self.entry = entry 129 | #: The entry's log format string 130 | self.format = format 131 | #: .. 
versionadded:: 0.3.0 132 | #: 133 | #: A `dict` mapping individual log format directives (e.g., ``"%h"`` or 134 | #: ``"%`` 15 | modifier will be stored at :samp:`entry.final_{attribute_name}` 16 | 17 | A type of `str` marked with an asterisk (\*) means that the directive's values 18 | are decoded according to the ``encoding`` option to `LogParser`. 19 | 20 | Any directive may evaluate to `None` when it is modified by a set of status 21 | codes (e.g., ``%400,501T`` or ``%!200T``). 22 | 23 | See `the Apache documentation 24 | `_ for 25 | information on the meaning of each directive. 26 | 27 | 28 | .. list-table:: 29 | :header-rows: 1 30 | 31 | * - Directive 32 | - `LogEntry` Attribute 33 | - Type 34 | * - ``%%`` 35 | - N/A 36 | - N/A 37 | * - ``%a`` 38 | - ``entry.remote_address`` 39 | - `str` 40 | * - ``%{c}a`` 41 | - ``entry.remote_client_address`` 42 | - `str` 43 | * - ``%A`` 44 | - ``entry.local_address`` 45 | - `str` 46 | * - ``%b`` 47 | - ``entry.bytes_sent`` 48 | - `int` or `None` 49 | * - ``%B`` 50 | - ``entry.bytes_sent`` 51 | - `int` 52 | * - :samp:`%\\{{name}\\}c` 53 | - :samp:`entry.cryptography[{name}]` [#f1]_ 54 | - `str` or `None` 55 | * - :samp:`%\\{{name}\\}C` 56 | - :samp:`entry.cookies[{name}]` [#f1]_ 57 | - `str`\* or `None` 58 | * - ``%D`` 59 | - ``entry.request_duration_microseconds`` 60 | - `int` 61 | * - :samp:`%\\{{name}\\}e` 62 | - :samp:`entry.env_vars[{name}]` [#f1]_ 63 | - `str`\* or `None` 64 | * - ``%f`` 65 | - ``entry.request_file`` 66 | - `str`\* or `None` 67 | * - ``%h`` 68 | - ``entry.remote_host`` 69 | - `str`\* 70 | * - ``%{c}h`` 71 | - ``entry.remote_underlying_host`` 72 | - `str`\* 73 | * - ``%H`` 74 | - ``entry.request_protocol`` 75 | - `str`\* or `None` 76 | * - :samp:`%\\{{name}\\}i` 77 | - :samp:`entry.headers_in[{name}]` [#f1]_ 78 | - `str`\* or `None` 79 | * - ``%I`` 80 | - ``entry.bytes_in`` 81 | - `int` 82 | * - ``%k`` 83 | - ``entry.requests_on_connection`` 84 | - `int` 85 | * - ``%l`` 86 | - ``entry.remote_logname`` 87 | - `str`\* or `None` 88 | * - ``%L`` 89 | - ``entry.request_log_id`` 90 | - `str` or `None` 91 | * - ``%{c}L`` 92 | - ``entry.connection_log_id`` 93 | - `str` or `None` 94 | * - ``%m`` 95 | - ``entry.request_method`` 96 | - `str`\* or `None` 97 | * - :samp:`%\\{{name}\\}n` 98 | - :samp:`entry.notes[{name}]` [#f1]_ 99 | - `str`\* or `None` 100 | * - :samp:`%\\{{name}\\}o` 101 | - :samp:`entry.headers_out[{name}]` [#f1]_ 102 | - `str`\* or `None` 103 | * - ``%O`` 104 | - ``entry.bytes_out`` 105 | - `int` 106 | * - ``%p`` 107 | - ``entry.server_port`` 108 | - `int` 109 | * - ``%{canonical}p`` 110 | - ``entry.server_port`` 111 | - `int` 112 | * - ``%{local}p`` 113 | - ``entry.local_port`` 114 | - `int` 115 | * - ``%{remote}p`` 116 | - ``entry.remote_port`` 117 | - `int` 118 | * - ``%P`` 119 | - ``entry.pid`` 120 | - `int` 121 | * - ``%{hextid}P`` [#f2]_ 122 | - ``entry.tid`` 123 | - `int` 124 | * - ``%{pid}P`` 125 | - ``entry.pid`` 126 | - `int` 127 | * - ``%{tid}P`` 128 | - ``entry.tid`` 129 | - `int` 130 | * - ``%q`` 131 | - ``entry.request_query`` 132 | - `str`\* 133 | * - ``%r`` 134 | - ``entry.request_line`` 135 | - `str`\* or `None` 136 | * - ``%R`` 137 | - ``entry.handler`` 138 | - `str`\* or `None` 139 | * - ``%s`` 140 | - ``entry.status`` 141 | - `int` or `None` 142 | * - ``%S`` 143 | - ``entry.bytes_combined`` 144 | - `int` 145 | * - ``%t`` 146 | - ``entry.request_time_fields["timestamp"]`` 147 | - aware `datetime.datetime` 148 | * - ``%{sec}t`` 149 | - ``entry.request_time_fields["epoch"]`` 150 | - `int` 151 | * - 
``%{msec}t`` 152 | - ``entry.request_time_fields["milliepoch"]`` 153 | - `int` 154 | * - ``%{usec}t`` 155 | - ``entry.request_time_fields["microepoch"]`` 156 | - `int` 157 | * - ``%{msec_frac}t`` 158 | - ``entry.request_time_fields["msec_frac"]`` 159 | - `int` 160 | * - ``%{usec_frac}t`` 161 | - ``entry.request_time_fields["usec_frac"]`` 162 | - `int` 163 | * - :samp:`%\\{{strftime_format}\\}t` 164 | - ``entry.request_time_fields`` (See below) 165 | - (See below) 166 | * - ``%T`` 167 | - ``entry.request_duration_seconds`` 168 | - `int` 169 | * - ``%{ms}T`` 170 | - ``entry.request_duration_milliseconds`` 171 | - `int` 172 | * - ``%{us}T`` 173 | - ``entry.request_duration_microseconds`` 174 | - `int` 175 | * - ``%{s}T`` 176 | - ``entry.request_duration_seconds`` 177 | - `int` 178 | * - ``%u`` 179 | - ``entry.remote_user`` 180 | - `str`\* or `None` 181 | * - ``%U`` 182 | - ``entry.request_uri`` 183 | - `str`\* or `None` 184 | * - ``%v`` 185 | - ``entry.virtual_host`` 186 | - `str`\* 187 | * - ``%V`` 188 | - ``entry.server_name`` 189 | - `str`\* 190 | * - :samp:`%\\{{name}\\}x` 191 | - :samp:`entry.variables[{name}]` [#f1]_ 192 | - `str` or `None` 193 | * - ``%X`` 194 | - ``entry.connection_status`` 195 | - `str` 196 | * - ``%^FB`` 197 | - ``entry.ttfb`` 198 | - `int` or `None` 199 | * - :samp:`%\\{{name}\\}^ti` 200 | - :samp:`entry.trailers_in[{name}]` [#f1]_ 201 | - `str`\* or `None` 202 | * - :samp:`%\\{{name}\\}^to` 203 | - :samp:`entry.trailers_out[{name}]` [#f1]_ 204 | - `str`\* or `None` 205 | 206 | 207 | Supported ``strftime`` Directives 208 | --------------------------------- 209 | 210 | The following table lists the ``strftime`` directives supported for use in the 211 | parameter of a ``%{*}t`` directive along with the keys & types at which they 212 | are stored in the `dict` ``entry.request_time_fields``. See any C 213 | documentation for information on the meaning of each directive. 214 | 215 | A ``%{*}t`` directive with the ``begin:`` modifier (e.g., 216 | ``%{begin:%Y-%m-%d}t``) will have its subdirectives stored in 217 | ``entry.begin_request_time_fields`` (in turn used to set 218 | ``entry.begin_request_time``), and likewise for the ``end:`` modifier. 219 | 220 | .. 
list-table:: 221 | :header-rows: 1 222 | 223 | * - Directive 224 | - ``request_time_fields`` key 225 | - Type 226 | * - ``%%`` 227 | - N/A 228 | - N/A 229 | * - ``%a`` 230 | - ``"abbrev_wday"`` 231 | - `str` 232 | * - ``%A`` 233 | - ``"full_wday"`` 234 | - `str` 235 | * - ``%b`` 236 | - ``"abbrev_mon"`` 237 | - `str` 238 | * - ``%B`` 239 | - ``"full_mon"`` 240 | - `str` 241 | * - ``%C`` 242 | - ``"century"`` 243 | - `int` 244 | * - ``%d`` 245 | - ``"mday"`` 246 | - `int` 247 | * - ``%D`` 248 | - ``"date"`` 249 | - `datetime.date` 250 | * - ``%e`` 251 | - ``"mday"`` 252 | - `int` 253 | * - ``%F`` 254 | - ``"date"`` 255 | - `datetime.date` 256 | * - ``%g`` 257 | - ``"abbrev_iso_year"`` 258 | - `int` 259 | * - ``%G`` 260 | - ``"iso_year"`` 261 | - `int` 262 | * - ``%h`` 263 | - ``"abbrev_mon"`` 264 | - `str` 265 | * - ``%H`` 266 | - ``"hour"`` 267 | - `int` 268 | * - ``%I`` 269 | - ``"hour12"`` 270 | - `int` 271 | * - ``%j`` 272 | - ``"yday"`` 273 | - `int` 274 | * - ``%m`` 275 | - ``"mon"`` 276 | - `int` 277 | * - ``%M`` 278 | - ``"min"`` 279 | - `int` 280 | * - ``%n`` 281 | - N/A 282 | - N/A 283 | * - ``%p`` 284 | - ``"am_pm"`` 285 | - `str` 286 | * - ``%R`` 287 | - ``"hour_min"`` 288 | - `datetime.time` 289 | * - ``%s`` 290 | - ``"epoch"`` 291 | - `int` 292 | * - ``%S`` 293 | - ``"sec"`` 294 | - `int` 295 | * - ``%t`` 296 | - N/A 297 | - N/A 298 | * - ``%T`` 299 | - ``"time"`` 300 | - `datetime.time` 301 | * - ``%u`` 302 | - ``"iso_wday"`` 303 | - `int` 304 | * - ``%U`` 305 | - ``"sunday_weeknum"`` 306 | - `int` 307 | * - ``%V`` 308 | - ``"iso_weeknum"`` 309 | - `int` 310 | * - ``%w`` 311 | - ``"wday"`` 312 | - `int` 313 | * - ``%W`` 314 | - ``"monday_weeknum"`` 315 | - `int` 316 | * - ``%y`` 317 | - ``"abbrev_year"`` 318 | - `int` 319 | * - ``%Y`` 320 | - ``"year"`` 321 | - `int` 322 | * - ``%z`` 323 | - ``"timezone"`` 324 | - `datetime.timezone` or `None` 325 | * - ``%Z`` 326 | - ``"tzname"`` 327 | - `str` 328 | 329 | 330 | .. rubric:: Footnotes 331 | 332 | .. [#f1] The ``cookies``, ``cryptography``, ``env_vars``, ``headers_in``, 333 | ``headers_out``, ``notes``, ``trailers_in``, ``trailers_out``, and 334 | ``variables`` attributes are case-insensitive `dict`\s. 335 | 336 | .. [#f2] Apache renders ``%{hextid}P`` as either a decimal integer or a 337 | hexadecimal integer depending on the APR version available. 338 | `apachelogs` expects ``%{hextid}P`` to always be in hexadecimal; if 339 | your Apache produces decimal integers instead, you must instead use 340 | ``%{tid}P`` in the log format passed to `apachelogs`. 341 | -------------------------------------------------------------------------------- /src/apachelogs/directives.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pydicti import dicti 3 | from .errors import InvalidDirectiveError, UnknownDirectiveError 4 | from .strftime import strftime2regex 5 | from .timeutil import parse_apache_timestamp 6 | from .util import ( 7 | FieldType, 8 | clf, 9 | clf_string, 10 | clf_word, 11 | cookie_value, 12 | esc_string, 13 | integer, 14 | ip_address, 15 | remote_user, 16 | status_code, 17 | uinteger, 18 | unescape, 19 | ) 20 | 21 | PLAIN_DIRECTIVES = { 22 | "%": (None, FieldType("%", None)), 23 | "a": ("remote_address", ip_address), 24 | "A": ("local_address", ip_address), 25 | "b": ("bytes_sent", clf(integer)), 26 | "B": ("bytes_sent", integer), 27 | "D": ("request_duration_microseconds", integer), 28 | # `%f` is '-' for malformed requests. 
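    # (The clf()/clf_string/clf_word wrappers imported from .util additionally
    # match a bare "-", CLF's marker for "no value", and convert it to None;
    # that is why the corresponding directives are documented in the table
    # above as "str or None" / "int or None".)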
29 | "f": ("request_file", clf_string), 30 | "h": ("remote_host", esc_string), 31 | # In some versions of Apache (I think this includes 2.4.18, the version 32 | # available to Xenial), `%H` is everything in the request line from the 33 | # third word onward, and thus it can be anything. In some other versions 34 | # (including 2.4.7, Trusty's version?), `%H` can even be '-' for certain 35 | # very malformed request lines. 36 | "H": ("request_protocol", clf_string), 37 | "k": ("requests_on_connection", integer), 38 | # As of v2.4.39, Apache uses the `%s` sscanf() format to extract the value 39 | # that becomes `remote_logname`, and so it does not contain any whitespace. 40 | "l": ("remote_logname", clf_word), 41 | # `%L` is the base64-encoding of a byte sequence with trailing '=' removed. 42 | # Depending on whether mod_unique_id is loaded, the encoding will use 43 | # either '+' and '/' or '@' and '-'. 44 | "L": ("request_log_id", clf(FieldType(r"[-@/+A-Za-z0-9]+", str))), 45 | # `%m` is the first word of the request line, i.e., it does not contain any 46 | # whitespace. 47 | # `%m` is '-' when the request line is malformed. 48 | "m": ("request_method", clf_word), 49 | "p": ("server_port", integer), 50 | "P": ("pid", integer), 51 | # As of httpd v2.4.29, `%q` is formatted as either an empty string or `?` 52 | # followed by a (possibly empty, in the case where the requested URI ends 53 | # with '?') escaped string. Moreover, due to the way the query string is 54 | # parsed from the request URI, it will never contain a '#', and due to the 55 | # way the request URI is parsed from the request line, `%q` will never 56 | # contain whitespace. 57 | "q": ( 58 | "request_query", 59 | FieldType( 60 | r"(?:\?(?:[!\x24-\x5B\x5D-\x7E]|\\x[0-9A-Fa-f]{2}|\\.)*?)?", 61 | unescape, 62 | ), 63 | ), 64 | # `%r` is just the first line sent by the client, and so it can be 65 | # anything. I've even seen it as '-' in logs, though I can't quite figure 66 | # out how to reproduce that. 67 | "r": ("request_line", clf_string), 68 | "R": ("handler", clf_string), 69 | # httpd v2.4.29 has a provision in its code for converting statuses less 70 | # than or equal to zero to "-". I'm not sure when that can happen, but 71 | # apparently it can. 72 | "s": ("status", clf(status_code)), 73 | "t": ( 74 | ("request_time_fields", "timestamp"), 75 | FieldType(r"\[[^]]+\]", parse_apache_timestamp), 76 | ), 77 | "T": ("request_duration_seconds", integer), 78 | "u": ("remote_user", remote_user), 79 | # Starting somewhere between versions 2.4.18 and 2.4.29 of Apache (or maybe 80 | # earlier?), `%U` has (some?) percent-escapes decoded and thus may contain 81 | # whitespace and '?' (and just about any other ASCII character?). 82 | # `%U` is '-' when the request line is malformed. 83 | "U": ("request_uri", clf_string), 84 | "v": ("virtual_host", esc_string), 85 | "V": ("server_name", esc_string), 86 | "X": ("connection_status", FieldType("[-+X]", str)), 87 | # Defined by mod_logio: 88 | "I": ("bytes_in", integer), 89 | "O": ("bytes_out", integer), 90 | "S": ("bytes_combined", integer), 91 | "^FB": ("ttfb", clf(integer)), 92 | } 93 | 94 | PARAMETERIZED_DIRECTIVES = { 95 | "a": { 96 | "c": ("remote_client_address", ip_address), 97 | }, 98 | "C": ("cookies", cookie_value), 99 | "e": ("env_vars", clf_string), 100 | "h": { 101 | "c": ("remote_underlying_host", esc_string), 102 | }, 103 | "i": ("headers_in", clf_string), 104 | # `%{c}L` is derived the same way as `%L`; see above. 
105 | "L": { 106 | "c": ("connection_log_id", clf(FieldType(r"[-@/+A-Za-z0-9]+", str))), 107 | }, 108 | "n": ("notes", clf_string), 109 | "o": ("headers_out", clf_string), 110 | "p": dicti( 111 | { 112 | "canonical": ("server_port", integer), 113 | "local": ("local_port", integer), 114 | "remote": ("remote_port", integer), 115 | } 116 | ), 117 | "P": dicti( 118 | { 119 | "pid": ("pid", integer), 120 | # `%{tid}P` is formatted as an unsigned integer. 121 | "tid": ("tid", uinteger), 122 | "hextid": ("tid", FieldType(r"[0-9A-Fa-f]+", lambda s: int(s, 16))), 123 | } 124 | ), 125 | "t": strftime2regex, 126 | "T": dicti( 127 | { 128 | "ms": ("request_duration_milliseconds", integer), 129 | "us": ("request_duration_microseconds", integer), 130 | "s": ("request_duration_seconds", integer), 131 | } 132 | ), 133 | "^ti": ("trailers_in", clf_string), 134 | "^to": ("trailers_out", clf_string), 135 | # Defined by mod_ssl: 136 | # As of Apache 2.4.39, no escaping is performed on the values of `%{*}x` 137 | # and `%{*}c` despite the fact that they can be just about anything. 138 | "c": ("cryptography", clf(FieldType(r".+?", str))), 139 | "x": ("variables", clf(FieldType(r".+?", str))), 140 | } 141 | 142 | DIRECTIVE_RGX = re.compile( 143 | r""" 144 | % (?P[0-9,!<>]*) 145 | (?:\{(?P[^}]*)\})? 146 | (?P[0-9,!<>]*) 147 | (?P\^[a-zA-Z%]{2}|[a-zA-Z%]) 148 | | (?P.) 149 | """, 150 | flags=re.X, 151 | ) 152 | 153 | 154 | def format2regex( 155 | fmt, plain_directives=None, parameterized_directives=None, simple=False 156 | ): 157 | """ 158 | Given a %-style format string ``fmt`` made up of a mixture of the "plain" 159 | directives (e.g., ``%q``) in ``plain_directives`` (default 160 | `PLAIN_DIRECTIVES`) and the parameterized directives (e.g., ``%{foo}q``) in 161 | ``parameterized_directives`` (default `PARAMETERIZED_DIRECTIVES`), return a 162 | pair ``(groups, rgx)`` where: 163 | 164 | - ``groups`` is a list of ``(name, directive, converter)`` triples, 165 | corresponding to the respective capturing groups in ``rgx``, where: 166 | 167 | - ``name`` is the name (if a `str`) or path (if a `tuple` of `str`) at 168 | which the converted captured value shall be saved in a `LogEntry` 169 | instance 170 | 171 | - ``directive`` is the complete directive in ``fmt`` that produced this 172 | triple and capturing group 173 | 174 | - ``converter`` is a function that takes a `str` (the captured value) and 175 | returns a value 176 | 177 | - ``rgx`` is a regex string that matches any string created from ``fmt``, 178 | with a capturing group around the substring corresponding to each 179 | non-``%%`` directive 180 | 181 | :param str fmt: 182 | :param dict plain_directives: A `dict` mapping plain directive names to 183 | ``(name, field_type)`` pairs. A ``name`` of `None` (as for the ``%%`` 184 | directive) indicates that input text matching the directive shall not 185 | be captured. 186 | :param dict parameterized_directives: A `dict` mapping parameterized 187 | directive names either to a ``(name, field_type)`` pair (where the 188 | ``name`` is the name of the `dict` attribute of `LogEntry` in which a 189 | key named after the parameter will store the converted captured value), 190 | or to a sub-`dict` mapping parameter values to ``(name, field_type)`` 191 | pairs, or to a callable that takes a parameter and returns the same 192 | return type as `format2regex()`. 
193 | :param bool simple: If `True`, an `InvalidDirectiveError` will be raised if 194 | a directive with modifiers or a parameter is encountered 195 | :raises InvalidDirectiveError: if an invalid directive occurs in ``fmt`` 196 | :raises UnknownDirectiveError: if an unknown directive occurs in ``fmt`` 197 | """ 198 | 199 | if plain_directives is None: 200 | plain_directives = PLAIN_DIRECTIVES 201 | if parameterized_directives is None: 202 | parameterized_directives = PARAMETERIZED_DIRECTIVES 203 | groups = [] 204 | rgx = "" 205 | for m in DIRECTIVE_RGX.finditer(fmt): 206 | if m.group("literal") is not None: 207 | if m.group("literal") == "%": 208 | raise InvalidDirectiveError(fmt, m.start()) 209 | rgx += re.escape(m.group("literal")) 210 | continue 211 | multiple = False 212 | modifiers = m.group("modifiers1") + m.group("modifiers2") 213 | conditioned = any(c.isdigit() for c in modifiers) 214 | redirects = re.findall(r"[<>]", modifiers) 215 | if simple and (modifiers or m.group("param") is not None): 216 | raise InvalidDirectiveError(fmt, m.start()) 217 | try: 218 | if m.group("param") is not None: 219 | spec = parameterized_directives[m.group("directive")] 220 | param = m.group("param") 221 | if isinstance(spec, dict): 222 | name, dtype = spec[param] 223 | elif callable(spec): 224 | subgroups, subrgx = spec(param) 225 | subgroups = [ 226 | ( 227 | name, 228 | "%" 229 | + m.group("modifiers1") 230 | + "{" 231 | + directive 232 | + "}" 233 | + m.group("modifiers2") 234 | + m.group("directive"), 235 | converter, 236 | ) 237 | for (name, directive, converter) in subgroups 238 | ] 239 | multiple = True 240 | else: 241 | name = (spec[0], param) 242 | dtype = spec[1] 243 | else: 244 | name, dtype = plain_directives[m.group("directive")] 245 | except KeyError: 246 | raise UnknownDirectiveError(m.group(0)) 247 | if multiple: 248 | if conditioned: 249 | subrgx = f"(?:{subrgx}|-)" 250 | rgx += subrgx 251 | else: 252 | if name is None: 253 | rgx += f"(?:{dtype.regex})" 254 | continue 255 | if conditioned: 256 | dtype = clf(dtype) 257 | rgx += rf"({dtype.regex})" 258 | subgroups = [(name, m.group(0), dtype.converter)] 259 | if redirects: 260 | prefix = "original_" if redirects[-1] == "<" else "final_" 261 | for i, (name, directive, converter) in enumerate(subgroups): 262 | if isinstance(name, tuple): 263 | name = (prefix + name[0],) + name[1:] 264 | else: 265 | name = prefix + name 266 | subgroups[i] = (name, directive, converter) 267 | groups.extend(subgroups) 268 | return (groups, rgx) 269 | -------------------------------------------------------------------------------- /src/apachelogs/timeutil.py: -------------------------------------------------------------------------------- 1 | import calendar 2 | from datetime import date, datetime, timedelta, timezone 3 | import re 4 | import time 5 | 6 | #: The names of the months in English 7 | MONTH_FULL_NAMES = { 8 | "January": 1, 9 | "February": 2, 10 | "March": 3, 11 | "April": 4, 12 | "May": 5, 13 | "June": 6, 14 | "July": 7, 15 | "August": 8, 16 | "September": 9, 17 | "October": 10, 18 | "November": 11, 19 | "December": 12, 20 | } 21 | 22 | #: The abbreviated names of the months in English 23 | MONTH_SNAMES = { 24 | "Jan": 1, 25 | "Feb": 2, 26 | "Mar": 3, 27 | "Apr": 4, 28 | "May": 5, 29 | "Jun": 6, 30 | "Jul": 7, 31 | "Aug": 8, 32 | "Sep": 9, 33 | "Oct": 10, 34 | "Nov": 11, 35 | "Dec": 12, 36 | } 37 | 38 | #: The names of the days of the week in English 39 | WDAY_FULL_NAMES = { 40 | "Monday": 1, 41 | "Tuesday": 2, 42 | "Wednesday": 3, 43 | 
"Thursday": 4, 44 | "Friday": 5, 45 | "Saturday": 6, 46 | "Sunday": 7, 47 | } 48 | 49 | #: The abbreviated names of the days of the week in English 50 | WDAY_SNAMES = { 51 | "Mon": 1, 52 | "Tue": 2, 53 | "Wed": 3, 54 | "Thu": 4, 55 | "Fri": 5, 56 | "Sat": 6, 57 | "Sun": 7, 58 | } 59 | 60 | #: Compiled regex for an Apache timestamp 61 | APACHE_TS_RGX = re.compile( 62 | r""" 63 | ^\[? 64 | (?P\d\d) / (?P\w\w\w) / (?P\d{4,}) 65 | :(?P\d\d) : (?P\d\d) : (?P\d\d) 66 | \s* (?P[-+]) (?P\d\d) (?P\d\d) 67 | \]?$ 68 | """, 69 | flags=re.X, 70 | ) 71 | 72 | 73 | def parse_apache_timestamp(s): 74 | """ 75 | Parse an Apache timestamp into a `datetime.datetime` object. The month 76 | name in the timestamp is expected to be an abbreviated English name 77 | regardless of the current locale. 78 | 79 | >>> parse_apache_timestamp('[01/Nov/2017:07:28:29 +0000]') 80 | datetime.datetime(2017, 11, 1, 7, 28, 29, tzinfo=datetime.timezone.utc) 81 | 82 | :param str s: a string of the form ``DD/Mon/YYYY:HH:MM:SS +HHMM`` 83 | (optionally enclosed in square brackets) 84 | :return: an aware `datetime.datetime` 85 | :raises ValueError: if ``s`` is not in the expected format 86 | """ 87 | # Apache timestamps always use English month abbreviations. Thus, parsing 88 | # with strptime like the below will fail when in a locale with different 89 | # month snames: 90 | # return datetime.strptime(s.strip('[]'), '%d/%b/%Y:%H:%M:%S %z') 91 | if s is None: 92 | return None 93 | m = APACHE_TS_RGX.match(s) 94 | if not m: 95 | raise ValueError(s) 96 | data = m.groupdict() 97 | for k in "year day hour minute second".split(): 98 | data[k] = int(data[k]) 99 | try: 100 | data["month"] = MONTH_SNAMES[data["month"]] 101 | except KeyError: 102 | raise ValueError(s) 103 | tzoffset = timedelta( 104 | hours=int(data.pop("tzoffset_hour")), 105 | minutes=int(data.pop("tzoffset_min")), 106 | ) 107 | if data.pop("tzoffset_sign") == "-": 108 | tzoffset *= -1 109 | data["tzinfo"] = timezone(tzoffset) 110 | return datetime(**data) 111 | 112 | 113 | def assemble_datetime(fields): 114 | """ 115 | Given a `dict` of time fields, return a `datetime.datetime` object if there 116 | is enough information to create one, `None` otherwise. 
117 | """ 118 | if fields.get("timezone") is not None: 119 | tz = fields["timezone"] 120 | elif fields.get("tzname") is not None: 121 | if fields["tzname"] in ("GMT", "UTC"): 122 | tz = timezone.utc 123 | elif fields["tzname"] == time.tzname[0]: 124 | tz = timezone(timedelta(seconds=-time.timezone)) 125 | elif time.daylight and fields["tzname"] == time.tzname[1]: 126 | tz = timezone(timedelta(seconds=-time.altzone)) 127 | else: 128 | tz = None 129 | else: 130 | tz = None 131 | 132 | if fields.get("timestamp") is not None: 133 | return fields["timestamp"] 134 | elif fields.get("microepoch") is not None: 135 | return datetime.fromtimestamp( 136 | fields["microepoch"] / 1000000, 137 | tz or timezone.utc, 138 | ) 139 | elif fields.get("milliepoch") is not None: 140 | return datetime.fromtimestamp( 141 | fields["milliepoch"] / 1000, 142 | tz or timezone.utc, 143 | ) 144 | elif fields.get("epoch") is not None: 145 | return datetime.fromtimestamp(fields["epoch"], tz or timezone.utc) 146 | else: 147 | locale_wday_names = {w: i for i, w in enumerate(calendar.day_name, start=1)} 148 | locale_wday_abbrevs = {w: i for i, w in enumerate(calendar.day_abbr, start=1)} 149 | 150 | if fields.get("iso_wday") is not None: 151 | iso_wday = fields["iso_wday"] 152 | elif fields.get("wday") is not None: 153 | iso_wday = fields["wday"] or 7 154 | elif ( 155 | fields.get("full_wday") is not None 156 | and fields["full_wday"] in WDAY_FULL_NAMES 157 | ): 158 | iso_wday = WDAY_FULL_NAMES[fields["full_wday"]] 159 | elif ( 160 | fields.get("full_wday") is not None 161 | and fields["full_wday"] in locale_wday_names 162 | ): 163 | iso_wday = locale_wday_names[fields["full_wday"]] 164 | elif ( 165 | fields.get("abbrev_wday") is not None 166 | and fields["abbrev_wday"] in WDAY_SNAMES 167 | ): 168 | iso_wday = WDAY_SNAMES[fields["abbrev_wday"]] 169 | elif ( 170 | fields.get("abbrev_wday") is not None 171 | and fields["abbrev_wday"] in locale_wday_abbrevs 172 | ): 173 | iso_wday = locale_wday_abbrevs[fields["abbrev_wday"]] 174 | else: 175 | iso_wday = None 176 | 177 | thedate = None 178 | 179 | if fields.get("year") is not None: 180 | year = fields["year"] 181 | elif fields.get("date") is not None: 182 | year = fields["date"].year 183 | elif fields.get("abbrev_year") is not None: 184 | if fields.get("century") is not None: 185 | year = fields["century"] * 100 + fields["abbrev_year"] 186 | elif fields["abbrev_year"] < 69: 187 | year = 2000 + fields["abbrev_year"] 188 | else: 189 | year = 1900 + fields["abbrev_year"] 190 | elif ( 191 | fields.get("iso_year") is not None 192 | and fields.get("iso_weeknum") is not None 193 | and iso_wday is not None 194 | ): 195 | thedate = fromisocalendar( 196 | fields["iso_year"], 197 | fields["iso_weeknum"], 198 | iso_wday, 199 | ) 200 | year = thedate.year 201 | elif ( 202 | fields.get("abbrev_iso_year") is not None 203 | and fields.get("iso_weeknum") is not None 204 | and iso_wday is not None 205 | ): 206 | iso_year = fields["abbrev_iso_year"] 207 | iso_year += 2000 if iso_year < 69 else 1900 208 | thedate = fromisocalendar(iso_year, fields["iso_weeknum"], iso_wday) 209 | year = thedate.year 210 | else: 211 | return None 212 | 213 | locale_month_names = {m: i for i, m in enumerate(calendar.month_name) if i != 0} 214 | locale_month_abbrevs = { 215 | m: i for i, m in enumerate(calendar.month_abbr) if i != 0 216 | } 217 | 218 | if thedate is None: 219 | if fields.get("date") is not None: 220 | thedate = fields["date"] 221 | elif fields.get("yday") is not None: 222 | thedate = date(year, 1, 1) + 
timedelta(days=fields["yday"] - 1) 223 | elif fields.get("sunday_weeknum") is not None and iso_wday is not None: 224 | thedate = datetime.strptime( 225 | f'{year} {fields["sunday_weeknum"]} {iso_wday % 7}', 226 | "%Y %U %w", 227 | ).date() 228 | elif fields.get("monday_weeknum") is not None and iso_wday is not None: 229 | thedate = datetime.strptime( 230 | f'{year} {fields["monday_weeknum"]} {iso_wday % 7}', 231 | "%Y %W %w", 232 | ).date() 233 | 234 | if fields.get("mon") is not None: 235 | month = fields["mon"] 236 | elif thedate is not None: 237 | month = thedate.month 238 | elif fields.get("full_mon") in MONTH_FULL_NAMES: 239 | month = MONTH_FULL_NAMES[fields["full_mon"]] 240 | elif fields.get("full_mon") in locale_month_names: 241 | month = locale_month_names[fields["full_mon"]] 242 | elif fields.get("abbrev_mon") in MONTH_SNAMES: 243 | month = MONTH_SNAMES[fields["abbrev_mon"]] 244 | elif fields.get("abbrev_mon") in locale_month_abbrevs: 245 | month = locale_month_abbrevs[fields["abbrev_mon"]] 246 | else: 247 | return None 248 | 249 | if fields.get("mday") is not None: 250 | day = fields["mday"] 251 | elif thedate is not None: 252 | day = thedate.day 253 | else: 254 | return None 255 | 256 | if fields.get("hour") is not None: 257 | hour = fields["hour"] 258 | elif fields.get("time") is not None: 259 | hour = fields["time"].hour 260 | elif fields.get("hour_min") is not None: 261 | hour = fields["hour_min"].hour 262 | elif ( 263 | fields.get("hour12") is not None 264 | and fields.get("am_pm") is not None 265 | and fields["am_pm"].upper() in ("AM", "PM") 266 | ): 267 | hour = fields["hour12"] % 12 268 | if fields["am_pm"].upper() == "PM": 269 | hour += 12 270 | else: 271 | return None 272 | 273 | if fields.get("min") is not None: 274 | minute = fields["min"] 275 | elif fields.get("time") is not None: 276 | minute = fields["time"].minute 277 | elif fields.get("hour_min") is not None: 278 | minute = fields["hour_min"].minute 279 | else: 280 | return None 281 | 282 | if fields.get("sec") is not None: 283 | second = fields["sec"] 284 | elif fields.get("time") is not None: 285 | second = fields["time"].second 286 | else: 287 | return None 288 | 289 | if fields.get("usec_frac") is not None: 290 | microsecond = fields["usec_frac"] 291 | elif fields.get("msec_frac") is not None: 292 | microsecond = fields["msec_frac"] * 1000 293 | else: 294 | microsecond = 0 295 | 296 | return datetime( 297 | year=year, 298 | month=month, 299 | day=day, 300 | hour=hour, 301 | minute=minute, 302 | second=second, 303 | microsecond=microsecond, 304 | tzinfo=tz, 305 | ) 306 | 307 | 308 | def fromisocalendar(iso_year, iso_weeknum, iso_wday): 309 | """ 310 | Convert an ISO year, ISO week number, and ISO weekday to a `datetime.date`. 311 | This is the inverse of `datetime.date.isocalendar()`. 
312 | 313 | >>> fromisocalendar(2004, 1, 1) 314 | datetime.date(2003, 12, 29) 315 | >>> fromisocalendar(2004, 1, 7) 316 | datetime.date(2004, 1, 4) 317 | """ 318 | 319 | # Python 3.8+: 320 | # date.fromisocalendar(iso_year, iso_weeknum, iso_wday) 321 | 322 | return datetime.strptime( 323 | f"{iso_year} {iso_weeknum} {iso_wday}", 324 | "%G %V %u", 325 | ).date() 326 | -------------------------------------------------------------------------------- /test/test_general.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | from pathlib import Path 3 | import pytest 4 | from apachelogs import ( 5 | COMBINED, 6 | VHOST_COMBINED, 7 | InvalidEntryError, 8 | LogEntry, 9 | LogParser, 10 | parse, 11 | parse_lines, 12 | ) 13 | 14 | 15 | def mkentry(entry, format, **attrs): # noqa: A002 16 | logentry = LogEntry(entry, format, [], []) 17 | logentry.__dict__.update(attrs) 18 | return logentry 19 | 20 | 21 | VHOST_COMBINED_LOG_ENTRIES = [ 22 | mkentry( 23 | 'www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:20 +0000] "GET / HTTP/1.1" 301 577 "-" "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0"', 24 | VHOST_COMBINED, 25 | virtual_host="www.varonathe.org", 26 | server_port=80, 27 | remote_host="203.62.1.80", 28 | remote_logname=None, 29 | remote_user=None, 30 | request_time=datetime(2019, 5, 6, 6, 28, 20, tzinfo=timezone.utc), 31 | request_time_fields={ 32 | "timestamp": datetime(2019, 5, 6, 6, 28, 20, tzinfo=timezone.utc), 33 | }, 34 | request_line="GET / HTTP/1.1", 35 | final_status=301, 36 | bytes_out=577, 37 | headers_in={ 38 | "Referer": None, 39 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0", 40 | }, 41 | directives={ 42 | "%v": "www.varonathe.org", 43 | "%p": 80, 44 | "%h": "203.62.1.80", 45 | "%l": None, 46 | "%u": None, 47 | "%t": datetime(2019, 5, 6, 6, 28, 20, tzinfo=timezone.utc), 48 | "%r": "GET / HTTP/1.1", 49 | "%>s": 301, 50 | "%O": 577, 51 | "%{Referer}i": None, 52 | "%{User-Agent}i": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0", 53 | }, 54 | ), 55 | mkentry( 56 | 'www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:20 +0000] "GET /robots.txt HTTP/1.1" 301 596 "-" "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0"', 57 | VHOST_COMBINED, 58 | virtual_host="www.varonathe.org", 59 | server_port=80, 60 | remote_host="203.62.1.80", 61 | remote_logname=None, 62 | remote_user=None, 63 | request_time=datetime(2019, 5, 6, 6, 28, 20, tzinfo=timezone.utc), 64 | request_time_fields={ 65 | "timestamp": datetime(2019, 5, 6, 6, 28, 20, tzinfo=timezone.utc), 66 | }, 67 | request_line="GET /robots.txt HTTP/1.1", 68 | final_status=301, 69 | bytes_out=596, 70 | headers_in={ 71 | "Referer": None, 72 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0", 73 | }, 74 | directives={ 75 | "%v": "www.varonathe.org", 76 | "%p": 80, 77 | "%h": "203.62.1.80", 78 | "%l": None, 79 | "%u": None, 80 | "%t": datetime(2019, 5, 6, 6, 28, 20, tzinfo=timezone.utc), 81 | "%r": "GET /robots.txt HTTP/1.1", 82 | "%>s": 301, 83 | "%O": 596, 84 | "%{Referer}i": None, 85 | "%{User-Agent}i": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0", 86 | }, 87 | ), 88 | mkentry( 89 | 'www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:21 +0000] "POST /App6079ec68.php HTTP/1.1" 301 606 "-" "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) 
Gecko/20100101 Firefox/58.0"', 90 | VHOST_COMBINED, 91 | virtual_host="www.varonathe.org", 92 | server_port=80, 93 | remote_host="203.62.1.80", 94 | remote_logname=None, 95 | remote_user=None, 96 | request_time=datetime(2019, 5, 6, 6, 28, 21, tzinfo=timezone.utc), 97 | request_time_fields={ 98 | "timestamp": datetime(2019, 5, 6, 6, 28, 21, tzinfo=timezone.utc), 99 | }, 100 | request_line="POST /App6079ec68.php HTTP/1.1", 101 | final_status=301, 102 | bytes_out=606, 103 | headers_in={ 104 | "Referer": None, 105 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0", 106 | }, 107 | directives={ 108 | "%v": "www.varonathe.org", 109 | "%p": 80, 110 | "%h": "203.62.1.80", 111 | "%l": None, 112 | "%u": None, 113 | "%t": datetime(2019, 5, 6, 6, 28, 21, tzinfo=timezone.utc), 114 | "%r": "POST /App6079ec68.php HTTP/1.1", 115 | "%>s": 301, 116 | "%O": 606, 117 | "%{Referer}i": None, 118 | "%{User-Agent}i": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0", 119 | }, 120 | ), 121 | mkentry( 122 | 'www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:21 +0000] "GET /webdav/ HTTP/1.1" 301 554 "-" "Mozilla/5.0"', 123 | VHOST_COMBINED, 124 | virtual_host="www.varonathe.org", 125 | server_port=80, 126 | remote_host="203.62.1.80", 127 | remote_logname=None, 128 | remote_user=None, 129 | request_time=datetime(2019, 5, 6, 6, 28, 21, tzinfo=timezone.utc), 130 | request_time_fields={ 131 | "timestamp": datetime(2019, 5, 6, 6, 28, 21, tzinfo=timezone.utc), 132 | }, 133 | request_line="GET /webdav/ HTTP/1.1", 134 | final_status=301, 135 | bytes_out=554, 136 | headers_in={ 137 | "Referer": None, 138 | "User-Agent": "Mozilla/5.0", 139 | }, 140 | directives={ 141 | "%v": "www.varonathe.org", 142 | "%p": 80, 143 | "%h": "203.62.1.80", 144 | "%l": None, 145 | "%u": None, 146 | "%t": datetime(2019, 5, 6, 6, 28, 21, tzinfo=timezone.utc), 147 | "%r": "GET /webdav/ HTTP/1.1", 148 | "%>s": 301, 149 | "%O": 554, 150 | "%{Referer}i": None, 151 | "%{User-Agent}i": "Mozilla/5.0", 152 | }, 153 | ), 154 | mkentry( 155 | 'www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:21 +0000] "GET /help.php HTTP/1.1" 301 592 "-" "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"', 156 | VHOST_COMBINED, 157 | virtual_host="www.varonathe.org", 158 | server_port=80, 159 | remote_host="203.62.1.80", 160 | remote_logname=None, 161 | remote_user=None, 162 | request_time=datetime(2019, 5, 6, 6, 28, 21, tzinfo=timezone.utc), 163 | request_time_fields={ 164 | "timestamp": datetime(2019, 5, 6, 6, 28, 21, tzinfo=timezone.utc), 165 | }, 166 | request_line="GET /help.php HTTP/1.1", 167 | final_status=301, 168 | bytes_out=592, 169 | headers_in={ 170 | "Referer": None, 171 | "User-Agent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0", 172 | }, 173 | directives={ 174 | "%v": "www.varonathe.org", 175 | "%p": 80, 176 | "%h": "203.62.1.80", 177 | "%l": None, 178 | "%u": None, 179 | "%t": datetime(2019, 5, 6, 6, 28, 21, tzinfo=timezone.utc), 180 | "%r": "GET /help.php HTTP/1.1", 181 | "%>s": 301, 182 | "%O": 592, 183 | "%{Referer}i": None, 184 | "%{User-Agent}i": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0", 185 | }, 186 | ), 187 | mkentry( 188 | 'www.varonathe.org:80 203.62.1.80 - - [06/May/2019:06:28:22 +0000] "GET /java.php HTTP/1.1" 301 592 "-" 
"Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"', 189 | VHOST_COMBINED, 190 | virtual_host="www.varonathe.org", 191 | server_port=80, 192 | remote_host="203.62.1.80", 193 | remote_logname=None, 194 | remote_user=None, 195 | request_time=datetime(2019, 5, 6, 6, 28, 22, tzinfo=timezone.utc), 196 | request_time_fields={ 197 | "timestamp": datetime(2019, 5, 6, 6, 28, 22, tzinfo=timezone.utc), 198 | }, 199 | request_line="GET /java.php HTTP/1.1", 200 | final_status=301, 201 | bytes_out=592, 202 | headers_in={ 203 | "Referer": None, 204 | "User-Agent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0", 205 | }, 206 | directives={ 207 | "%v": "www.varonathe.org", 208 | "%p": 80, 209 | "%h": "203.62.1.80", 210 | "%l": None, 211 | "%u": None, 212 | "%t": datetime(2019, 5, 6, 6, 28, 22, tzinfo=timezone.utc), 213 | "%r": "GET /java.php HTTP/1.1", 214 | "%>s": 301, 215 | "%O": 592, 216 | "%{Referer}i": None, 217 | "%{User-Agent}i": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0", 218 | }, 219 | ), 220 | ] 221 | 222 | 223 | @pytest.mark.parametrize("end", ["", "\n", "\r", "\r\n"]) 224 | def test_parse_general(end): 225 | ENTRY = '209.126.136.4 - - [01/Nov/2017:07:28:29 +0000] "GET / HTTP/1.1" 301 521 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"' 226 | parser = LogParser(COMBINED, encoding="utf-8") 227 | assert parser.format == COMBINED 228 | parsed = parser.parse(ENTRY + end) 229 | assert parsed.remote_host == "209.126.136.4" 230 | assert parsed.remote_logname is None 231 | assert parsed.remote_user is None 232 | assert parsed.request_time == datetime(2017, 11, 1, 7, 28, 29, tzinfo=timezone.utc) 233 | assert parsed.request_line == "GET / HTTP/1.1" 234 | assert parsed.final_status == 301 235 | assert parsed.bytes_sent == 521 236 | assert parsed.headers_in == { 237 | "Referer": None, 238 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36", 239 | } 240 | assert ( 241 | parsed.headers_in["User-Agent"] 242 | == parsed.headers_in["USER-AGENT"] 243 | == parsed.headers_in["user-agent"] 244 | == "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 245 | ) 246 | assert parsed.entry == ENTRY 247 | assert parsed.format == COMBINED 248 | assert parsed.request_time_fields == { 249 | "timestamp": datetime(2017, 11, 1, 7, 28, 29, tzinfo=timezone.utc) 250 | } 251 | assert parsed.directives == { 252 | "%h": "209.126.136.4", 253 | "%l": None, 254 | "%u": None, 255 | "%t": datetime(2017, 11, 1, 7, 28, 29, tzinfo=timezone.utc), 256 | "%r": "GET / HTTP/1.1", 257 | "%>s": 301, 258 | "%b": 521, 259 | "%{Referer}i": None, 260 | "%{User-Agent}i": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36", 261 | } 262 | 263 | 264 | def test_parse_lines_invalid(): 265 | with (Path(__file__).with_name("data") / "vhost_combined.log").open() as fp: 266 | entries = parse_lines(VHOST_COMBINED, fp) 267 | assert next(entries) == VHOST_COMBINED_LOG_ENTRIES[0] 268 | assert next(entries) == VHOST_COMBINED_LOG_ENTRIES[1] 269 | assert next(entries) == VHOST_COMBINED_LOG_ENTRIES[2] 270 | assert next(entries) == VHOST_COMBINED_LOG_ENTRIES[3] 271 | 
with pytest.raises(InvalidEntryError) as excinfo: 272 | next(entries) 273 | assert str(excinfo.value) == ( 274 | "Could not match log entry 'Bad line'" 275 | " against log format {!r}".format(VHOST_COMBINED) 276 | ) 277 | assert excinfo.value.entry == "Bad line" 278 | assert excinfo.value.format == VHOST_COMBINED 279 | 280 | 281 | def test_parse_lines_ignore_invalid(): 282 | with (Path(__file__).with_name("data") / "vhost_combined.log").open() as fp: 283 | entries = parse_lines(VHOST_COMBINED, fp, ignore_invalid=True) 284 | assert list(entries) == VHOST_COMBINED_LOG_ENTRIES 285 | 286 | 287 | def test_parse_default_enc(mocker): 288 | m = mocker.patch("apachelogs.LogParser", spec=LogParser) 289 | r = parse("%s", "200") 290 | m.assert_called_once_with("%s", encoding="iso-8859-1", errors=None) 291 | m.return_value.parse.assert_called_once_with("200") 292 | assert r is m.return_value.parse.return_value 293 | 294 | 295 | def test_parse_custom_enc(mocker): 296 | m = mocker.patch("apachelogs.LogParser", spec=LogParser) 297 | r = parse("%s", "200", encoding="utf-8", errors="surrogateescape") 298 | m.assert_called_once_with("%s", encoding="utf-8", errors="surrogateescape") 299 | m.return_value.parse.assert_called_once_with("200") 300 | assert r is m.return_value.parse.return_value 301 | 302 | 303 | def test_parse_lines_default_enc(mocker): 304 | m = mocker.patch("apachelogs.LogParser", spec=LogParser) 305 | r = parse_lines("%s", ["200"]) 306 | m.assert_called_once_with("%s", encoding="iso-8859-1", errors=None) 307 | m.return_value.parse_lines.assert_called_once_with(["200"], False) 308 | assert r is m.return_value.parse_lines.return_value 309 | 310 | 311 | def test_parse_lines_custom_enc(mocker): 312 | m = mocker.patch("apachelogs.LogParser", spec=LogParser) 313 | r = parse_lines("%s", ["200"], encoding="utf-8", errors="surrogateescape") 314 | m.assert_called_once_with("%s", encoding="utf-8", errors="surrogateescape") 315 | m.return_value.parse_lines.assert_called_once_with(["200"], False) 316 | assert r is m.return_value.parse_lines.return_value 317 | 318 | 319 | def test_case_insensitive_dicts(): 320 | entry = parse( 321 | "%{USER}e|%{Content-Type}i|%{flavor}C|%{ssl-secure-reneg}n" 322 | "|%{Content-Type}o|%{Foo}^ti|%{Baz}^to|%{HTTP_USER_AGENT}x|%{errcode}c", 323 | "www-data|application/x-www-form-urlencoded|chocolate|1|text/html" 324 | '|Bar|Quux|Web "Browsy" Browser|-', 325 | ) 326 | assert entry.env_vars == {"USER": "www-data"} 327 | assert ( 328 | entry.env_vars["USER"] 329 | == entry.env_vars["user"] 330 | == entry.env_vars["User"] 331 | == "www-data" 332 | ) 333 | assert entry.headers_in == {"Content-Type": "application/x-www-form-urlencoded"} 334 | assert ( 335 | entry.headers_in["Content-Type"] 336 | == entry.headers_in["CONTENT-TYPE"] 337 | == entry.headers_in["content-type"] 338 | == "application/x-www-form-urlencoded" 339 | ) 340 | assert entry.cookies == {"flavor": "chocolate"} 341 | assert ( 342 | entry.cookies["flavor"] 343 | == entry.cookies["FLAVOR"] 344 | == entry.cookies["Flavor"] 345 | == "chocolate" 346 | ) 347 | assert entry.notes == {"ssl-secure-reneg": "1"} 348 | assert ( 349 | entry.notes["ssl-secure-reneg"] 350 | == entry.notes["SSL-SECURE-RENEG"] 351 | == entry.notes["SSL-Secure-Reneg"] 352 | == "1" 353 | ) 354 | assert entry.headers_out == {"Content-Type": "text/html"} 355 | assert ( 356 | entry.headers_out["Content-Type"] 357 | == entry.headers_out["CONTENT-TYPE"] 358 | == entry.headers_out["content-type"] 359 | == "text/html" 360 | ) 361 | assert entry.trailers_in == 
{"Foo": "Bar"} 362 | assert ( 363 | entry.trailers_in["Foo"] 364 | == entry.trailers_in["FOO"] 365 | == entry.trailers_in["foo"] 366 | == "Bar" 367 | ) 368 | assert entry.trailers_out == {"Baz": "Quux"} 369 | assert ( 370 | entry.trailers_out["Baz"] 371 | == entry.trailers_out["BAZ"] 372 | == entry.trailers_out["baz"] 373 | == "Quux" 374 | ) 375 | assert entry.variables == {"HTTP_USER_AGENT": 'Web "Browsy" Browser'} 376 | assert ( 377 | entry.variables["HTTP_USER_AGENT"] 378 | == entry.variables["http_user_agent"] 379 | == entry.variables["Http_User_Agent"] 380 | == 'Web "Browsy" Browser' 381 | ) 382 | assert entry.cryptography == {"errcode": None} 383 | assert ( 384 | entry.cryptography["errcode"] 385 | is entry.cryptography["ERRCODE"] 386 | is entry.cryptography["Errcode"] 387 | is None 388 | ) 389 | -------------------------------------------------------------------------------- /test/test_assemble_datetime.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime, time, timedelta, timezone 2 | import pytest 3 | from apachelogs.timeutil import assemble_datetime 4 | 5 | w4 = timezone(timedelta(hours=-4)) 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "fields,dt", 10 | [ 11 | ( 12 | {"timestamp": datetime(2017, 11, 1, 7, 28, 29, tzinfo=timezone.utc)}, 13 | datetime(2017, 11, 1, 7, 28, 29, tzinfo=timezone.utc), 14 | ), 15 | ( 16 | { 17 | "timestamp": datetime(2017, 11, 1, 7, 28, 29, tzinfo=timezone.utc), 18 | "timezone": w4, 19 | }, 20 | datetime(2017, 11, 1, 7, 28, 29, tzinfo=timezone.utc), 21 | ), 22 | ( 23 | {"epoch": 1511642826}, 24 | datetime(2017, 11, 25, 20, 47, 6, tzinfo=timezone.utc), 25 | ), 26 | ( 27 | {"epoch": 1511642826, "timezone": w4}, 28 | datetime(2017, 11, 25, 16, 47, 6, tzinfo=w4), 29 | ), 30 | ( 31 | { 32 | "year": 2017, 33 | "mon": 11, 34 | "mday": 1, 35 | "hour": 7, 36 | "min": 28, 37 | "sec": 29, 38 | "timezone": w4, 39 | }, 40 | datetime(2017, 11, 1, 7, 28, 29, tzinfo=w4), 41 | ), 42 | ( 43 | {"year": 2017, "mon": 11, "mday": 1, "hour": 7, "min": 28, "sec": 29}, 44 | datetime(2017, 11, 1, 7, 28, 29), 45 | ), 46 | ( 47 | {"year": 2017, "mon": 11, "mday": 1, "hour": 7, "min": 28}, 48 | None, 49 | ), 50 | ( 51 | {"date": date(2017, 11, 1), "time": time(7, 28, 29)}, 52 | datetime(2017, 11, 1, 7, 28, 29), 53 | ), 54 | ( 55 | {"date": date(2017, 11, 1), "time": time(7, 28, 29), "timezone": w4}, 56 | datetime(2017, 11, 1, 7, 28, 29, tzinfo=w4), 57 | ), 58 | ( 59 | {"year": 2017, "mon": 11, "mday": 1, "time": time(7, 28, 29)}, 60 | datetime(2017, 11, 1, 7, 28, 29), 61 | ), 62 | ( 63 | {"date": date(2017, 11, 1), "hour": 7, "min": 28, "sec": 29}, 64 | datetime(2017, 11, 1, 7, 28, 29), 65 | ), 66 | ( 67 | {"year": 2017, "mon": 11, "mday": 1, "hour12": 7, "min": 28, "sec": 29}, 68 | None, 69 | ), 70 | ( 71 | { 72 | "year": 2017, 73 | "mon": 11, 74 | "mday": 1, 75 | "hour12": 7, 76 | "min": 28, 77 | "sec": 29, 78 | "am_pm": "AM", 79 | }, 80 | datetime(2017, 11, 1, 7, 28, 29), 81 | ), 82 | ( 83 | { 84 | "year": 2017, 85 | "mon": 11, 86 | "mday": 1, 87 | "hour12": 7, 88 | "min": 28, 89 | "sec": 29, 90 | "am_pm": "PM", 91 | }, 92 | datetime(2017, 11, 1, 19, 28, 29), 93 | ), 94 | ( 95 | { 96 | "year": 2017, 97 | "mon": 11, 98 | "mday": 1, 99 | "hour12": 12, 100 | "min": 28, 101 | "sec": 29, 102 | "am_pm": "AM", 103 | }, 104 | datetime(2017, 11, 1, 0, 28, 29), 105 | ), 106 | ( 107 | { 108 | "year": 2017, 109 | "mon": 11, 110 | "mday": 1, 111 | "hour12": 12, 112 | "min": 28, 113 | "sec": 29, 114 | "am_pm": "PM", 115 | }, 
116 | datetime(2017, 11, 1, 12, 28, 29), 117 | ), 118 | ( 119 | {"milliepoch": 1511642826123}, 120 | datetime(2017, 11, 25, 20, 47, 6, 123000, tzinfo=timezone.utc), 121 | ), 122 | ( 123 | {"milliepoch": 1511642826123, "timezone": w4}, 124 | datetime(2017, 11, 25, 16, 47, 6, 123000, tzinfo=w4), 125 | ), 126 | ( 127 | {"microepoch": 1511642826123456}, 128 | datetime(2017, 11, 25, 20, 47, 6, 123456, tzinfo=timezone.utc), 129 | ), 130 | ( 131 | {"microepoch": 1511642826123456, "timezone": w4}, 132 | datetime(2017, 11, 25, 16, 47, 6, 123456, tzinfo=w4), 133 | ), 134 | ( 135 | {"milliepoch": 1511642826123, "epoch": 1511642826}, 136 | datetime(2017, 11, 25, 20, 47, 6, 123000, tzinfo=timezone.utc), 137 | ), 138 | ( 139 | {"milliepoch": 1511642826123, "microepoch": 1511642826123456}, 140 | datetime(2017, 11, 25, 20, 47, 6, 123456, tzinfo=timezone.utc), 141 | ), 142 | ( 143 | { 144 | "epoch": 1511642826, 145 | "milliepoch": 1511642826123, 146 | "microepoch": 1511642826123456, 147 | }, 148 | datetime(2017, 11, 25, 20, 47, 6, 123456, tzinfo=timezone.utc), 149 | ), 150 | ( 151 | {"year": 2017, "mon": 11, "mday": 1, "hour_min": time(7, 28)}, 152 | None, 153 | ), 154 | ( 155 | { 156 | "year": 2017, 157 | "mon": 11, 158 | "mday": 1, 159 | "hour_min": time(7, 28), 160 | "sec": 29, 161 | }, 162 | datetime(2017, 11, 1, 7, 28, 29), 163 | ), 164 | ( 165 | {"year": 2017, "yday": 305, "hour": 7, "min": 28, "sec": 29}, 166 | datetime(2017, 11, 1, 7, 28, 29), 167 | ), 168 | ( 169 | { 170 | "century": 20, 171 | "abbrev_year": 17, 172 | "mon": 11, 173 | "mday": 1, 174 | "hour": 7, 175 | "min": 28, 176 | "sec": 29, 177 | }, 178 | datetime(2017, 11, 1, 7, 28, 29), 179 | ), 180 | ( 181 | { 182 | "century": 20, 183 | "mon": 11, 184 | "mday": 1, 185 | "hour": 7, 186 | "min": 28, 187 | "sec": 29, 188 | }, 189 | None, 190 | ), 191 | ( 192 | { 193 | "abbrev_year": 17, 194 | "mon": 11, 195 | "mday": 1, 196 | "hour": 7, 197 | "min": 28, 198 | "sec": 29, 199 | }, 200 | datetime(2017, 11, 1, 7, 28, 29), 201 | ), 202 | ( 203 | { 204 | "abbrev_year": 0, 205 | "mon": 11, 206 | "mday": 1, 207 | "hour": 7, 208 | "min": 28, 209 | "sec": 29, 210 | }, 211 | datetime(2000, 11, 1, 7, 28, 29), 212 | ), 213 | ( 214 | { 215 | "abbrev_year": 68, 216 | "mon": 11, 217 | "mday": 1, 218 | "hour": 7, 219 | "min": 28, 220 | "sec": 29, 221 | }, 222 | datetime(2068, 11, 1, 7, 28, 29), 223 | ), 224 | ( 225 | { 226 | "abbrev_year": 69, 227 | "mon": 11, 228 | "mday": 1, 229 | "hour": 7, 230 | "min": 28, 231 | "sec": 29, 232 | }, 233 | datetime(1969, 11, 1, 7, 28, 29), 234 | ), 235 | ( 236 | { 237 | "abbrev_year": 99, 238 | "mon": 11, 239 | "mday": 1, 240 | "hour": 7, 241 | "min": 28, 242 | "sec": 29, 243 | }, 244 | datetime(1999, 11, 1, 7, 28, 29), 245 | ), 246 | ( 247 | { 248 | "century": 19, 249 | "abbrev_year": 0, 250 | "mon": 11, 251 | "mday": 1, 252 | "hour": 7, 253 | "min": 28, 254 | "sec": 29, 255 | }, 256 | datetime(1900, 11, 1, 7, 28, 29), 257 | ), 258 | ( 259 | { 260 | "century": 19, 261 | "abbrev_year": 68, 262 | "mon": 11, 263 | "mday": 1, 264 | "hour": 7, 265 | "min": 28, 266 | "sec": 29, 267 | }, 268 | datetime(1968, 11, 1, 7, 28, 29), 269 | ), 270 | ( 271 | { 272 | "century": 20, 273 | "abbrev_year": 69, 274 | "mon": 11, 275 | "mday": 1, 276 | "hour": 7, 277 | "min": 28, 278 | "sec": 29, 279 | }, 280 | datetime(2069, 11, 1, 7, 28, 29), 281 | ), 282 | ( 283 | { 284 | "century": 20, 285 | "abbrev_year": 99, 286 | "mon": 11, 287 | "mday": 1, 288 | "hour": 7, 289 | "min": 28, 290 | "sec": 29, 291 | }, 292 | datetime(2099, 11, 1, 7, 28, 29), 
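        # (The cases above exercise the two-digit-year pivot: an abbrev_year of
        # 00-68 resolves to 2000-2068 and 69-99 to 1969-1999, unless an
        # explicit "century" field overrides the guess.)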
293 | ), 294 | ( 295 | { 296 | "year": 2017, 297 | "full_mon": "November", 298 | "mday": 1, 299 | "hour": 7, 300 | "min": 28, 301 | "sec": 29, 302 | }, 303 | datetime(2017, 11, 1, 7, 28, 29), 304 | ), 305 | ( 306 | { 307 | "year": 2017, 308 | "abbrev_mon": "Nov", 309 | "mday": 1, 310 | "hour": 7, 311 | "min": 28, 312 | "sec": 29, 313 | }, 314 | datetime(2017, 11, 1, 7, 28, 29), 315 | ), 316 | ( 317 | { 318 | "year": 2017, 319 | "mon": 11, 320 | "mday": 1, 321 | "hour": 7, 322 | "min": 28, 323 | "sec": 29, 324 | "msec_frac": 123, 325 | }, 326 | datetime(2017, 11, 1, 7, 28, 29, 123000), 327 | ), 328 | ( 329 | { 330 | "year": 2017, 331 | "mon": 11, 332 | "mday": 1, 333 | "hour": 7, 334 | "min": 28, 335 | "sec": 29, 336 | "usec_frac": 123456, 337 | }, 338 | datetime(2017, 11, 1, 7, 28, 29, 123456), 339 | ), 340 | ( 341 | {"year": 2017, "mday": 1, "hour": 7, "min": 28, "sec": 29}, 342 | None, 343 | ), 344 | ( 345 | {"year": 2017, "mon": 11, "hour": 7, "min": 28, "sec": 29}, 346 | None, 347 | ), 348 | ( 349 | {"year": 2017, "mon": 11, "mday": 1, "hour": 7, "sec": 29}, 350 | None, 351 | ), 352 | ( 353 | { 354 | "iso_year": 2019, 355 | "iso_weeknum": 20, 356 | "iso_wday": 7, 357 | "time": time(12, 34, 56), 358 | }, 359 | datetime(2019, 5, 19, 12, 34, 56), 360 | ), 361 | ( 362 | { 363 | "iso_year": 2019, 364 | "iso_weeknum": 20, 365 | "wday": 0, 366 | "time": time(12, 34, 56), 367 | }, 368 | datetime(2019, 5, 19, 12, 34, 56), 369 | ), 370 | ( 371 | { 372 | "iso_year": 2019, 373 | "iso_weeknum": 20, 374 | "abbrev_wday": "Sun", 375 | "time": time(12, 34, 56), 376 | }, 377 | datetime(2019, 5, 19, 12, 34, 56), 378 | ), 379 | ( 380 | { 381 | "iso_year": 2019, 382 | "iso_weeknum": 20, 383 | "full_wday": "Sunday", 384 | "time": time(12, 34, 56), 385 | }, 386 | datetime(2019, 5, 19, 12, 34, 56), 387 | ), 388 | ( 389 | { 390 | "iso_year": 2019, 391 | "iso_weeknum": 52, 392 | "iso_wday": 7, 393 | "time": time(12, 34, 56), 394 | }, 395 | datetime(2019, 12, 29, 12, 34, 56), 396 | ), 397 | ( 398 | { 399 | "iso_year": 2020, 400 | "iso_weeknum": 1, 401 | "iso_wday": 1, 402 | "time": time(12, 34, 56), 403 | }, 404 | datetime(2019, 12, 30, 12, 34, 56), 405 | ), 406 | ( 407 | { 408 | "abbrev_iso_year": 19, 409 | "iso_weeknum": 20, 410 | "iso_wday": 7, 411 | "time": time(12, 34, 56), 412 | }, 413 | datetime(2019, 5, 19, 12, 34, 56), 414 | ), 415 | ( 416 | { 417 | "abbrev_iso_year": 0, 418 | "iso_weeknum": 20, 419 | "iso_wday": 5, 420 | "time": time(12, 34, 56), 421 | }, 422 | datetime(2000, 5, 19, 12, 34, 56), 423 | ), 424 | ( 425 | { 426 | "abbrev_iso_year": 68, 427 | "iso_weeknum": 20, 428 | "iso_wday": 6, 429 | "time": time(12, 34, 56), 430 | }, 431 | datetime(2068, 5, 19, 12, 34, 56), 432 | ), 433 | ( 434 | { 435 | "abbrev_iso_year": 69, 436 | "iso_weeknum": 21, 437 | "iso_wday": 1, 438 | "time": time(12, 34, 56), 439 | }, 440 | datetime(1969, 5, 19, 12, 34, 56), 441 | ), 442 | ( 443 | { 444 | "abbrev_iso_year": 99, 445 | "iso_weeknum": 20, 446 | "iso_wday": 3, 447 | "time": time(12, 34, 56), 448 | }, 449 | datetime(1999, 5, 19, 12, 34, 56), 450 | ), 451 | ( 452 | { 453 | "year": 2019, 454 | "sunday_weeknum": 20, 455 | "wday": 0, 456 | "time": time(12, 34, 56), 457 | }, 458 | datetime(2019, 5, 19, 12, 34, 56), 459 | ), 460 | ( 461 | { 462 | "year": 2019, 463 | "sunday_weeknum": 20, 464 | "iso_wday": 7, 465 | "time": time(12, 34, 56), 466 | }, 467 | datetime(2019, 5, 19, 12, 34, 56), 468 | ), 469 | ( 470 | { 471 | "year": 2019, 472 | "sunday_weeknum": 20, 473 | "full_wday": "Sunday", 474 | "time": time(12, 34, 
56), 475 | }, 476 | datetime(2019, 5, 19, 12, 34, 56), 477 | ), 478 | ( 479 | { 480 | "year": 2019, 481 | "sunday_weeknum": 20, 482 | "abbrev_wday": "Sun", 483 | "time": time(12, 34, 56), 484 | }, 485 | datetime(2019, 5, 19, 12, 34, 56), 486 | ), 487 | ( 488 | { 489 | "year": 2019, 490 | "monday_weeknum": 19, 491 | "wday": 0, 492 | "time": time(12, 34, 56), 493 | }, 494 | datetime(2019, 5, 19, 12, 34, 56), 495 | ), 496 | ( 497 | { 498 | "year": 2019, 499 | "monday_weeknum": 19, 500 | "iso_wday": 7, 501 | "time": time(12, 34, 56), 502 | }, 503 | datetime(2019, 5, 19, 12, 34, 56), 504 | ), 505 | ( 506 | { 507 | "year": 2019, 508 | "monday_weeknum": 19, 509 | "full_wday": "Sunday", 510 | "time": time(12, 34, 56), 511 | }, 512 | datetime(2019, 5, 19, 12, 34, 56), 513 | ), 514 | ( 515 | { 516 | "year": 2019, 517 | "monday_weeknum": 19, 518 | "abbrev_wday": "Sun", 519 | "time": time(12, 34, 56), 520 | }, 521 | datetime(2019, 5, 19, 12, 34, 56), 522 | ), 523 | ( 524 | { 525 | "abbrev_year": 0, 526 | "sunday_weeknum": 20, 527 | "wday": 0, 528 | "time": time(12, 34, 56), 529 | }, 530 | datetime(2000, 5, 14, 12, 34, 56), 531 | ), 532 | ( 533 | { 534 | "abbrev_year": 68, 535 | "sunday_weeknum": 20, 536 | "wday": 0, 537 | "time": time(12, 34, 56), 538 | }, 539 | datetime(2068, 5, 13, 12, 34, 56), 540 | ), 541 | ( 542 | { 543 | "abbrev_year": 69, 544 | "sunday_weeknum": 20, 545 | "wday": 0, 546 | "time": time(12, 34, 56), 547 | }, 548 | datetime(1969, 5, 18, 12, 34, 56), 549 | ), 550 | ( 551 | { 552 | "abbrev_year": 99, 553 | "sunday_weeknum": 20, 554 | "wday": 0, 555 | "time": time(12, 34, 56), 556 | }, 557 | datetime(1999, 5, 16, 12, 34, 56), 558 | ), 559 | ( 560 | { 561 | "abbrev_year": 0, 562 | "monday_weeknum": 20, 563 | "wday": 0, 564 | "time": time(12, 34, 56), 565 | }, 566 | datetime(2000, 5, 21, 12, 34, 56), 567 | ), 568 | ( 569 | { 570 | "abbrev_year": 68, 571 | "monday_weeknum": 20, 572 | "wday": 0, 573 | "time": time(12, 34, 56), 574 | }, 575 | datetime(2068, 5, 20, 12, 34, 56), 576 | ), 577 | ( 578 | { 579 | "abbrev_year": 69, 580 | "monday_weeknum": 20, 581 | "wday": 0, 582 | "time": time(12, 34, 56), 583 | }, 584 | datetime(1969, 5, 25, 12, 34, 56), 585 | ), 586 | ( 587 | { 588 | "abbrev_year": 99, 589 | "monday_weeknum": 20, 590 | "wday": 0, 591 | "time": time(12, 34, 56), 592 | }, 593 | datetime(1999, 5, 23, 12, 34, 56), 594 | ), 595 | ( 596 | { 597 | "century": 19, 598 | "abbrev_year": 0, 599 | "sunday_weeknum": 20, 600 | "wday": 0, 601 | "time": time(12, 34, 56), 602 | }, 603 | datetime(1900, 5, 20, 12, 34, 56), 604 | ), 605 | ( 606 | { 607 | "century": 19, 608 | "abbrev_year": 68, 609 | "sunday_weeknum": 20, 610 | "wday": 0, 611 | "time": time(12, 34, 56), 612 | }, 613 | datetime(1968, 5, 19, 12, 34, 56), 614 | ), 615 | ( 616 | { 617 | "century": 20, 618 | "abbrev_year": 69, 619 | "sunday_weeknum": 20, 620 | "wday": 0, 621 | "time": time(12, 34, 56), 622 | }, 623 | datetime(2069, 5, 19, 12, 34, 56), 624 | ), 625 | ( 626 | { 627 | "century": 20, 628 | "abbrev_year": 99, 629 | "sunday_weeknum": 20, 630 | "wday": 0, 631 | "time": time(12, 34, 56), 632 | }, 633 | datetime(2099, 5, 17, 12, 34, 56), 634 | ), 635 | ( 636 | { 637 | "century": 19, 638 | "abbrev_year": 0, 639 | "monday_weeknum": 20, 640 | "wday": 0, 641 | "time": time(12, 34, 56), 642 | }, 643 | datetime(1900, 5, 20, 12, 34, 56), 644 | ), 645 | ( 646 | { 647 | "century": 19, 648 | "abbrev_year": 68, 649 | "monday_weeknum": 20, 650 | "wday": 0, 651 | "time": time(12, 34, 56), 652 | }, 653 | datetime(1968, 5, 19, 12, 34, 56), 
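        # (A %U/%W-style week number only pins down a date when a weekday field
        # is also present; assemble_datetime() resolves the combination via
        # strptime's "%Y %U %w" / "%Y %W %w" formats.)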
654 | ), 655 | ( 656 | { 657 | "century": 20, 658 | "abbrev_year": 69, 659 | "monday_weeknum": 20, 660 | "wday": 0, 661 | "time": time(12, 34, 56), 662 | }, 663 | datetime(2069, 5, 26, 12, 34, 56), 664 | ), 665 | ( 666 | { 667 | "century": 20, 668 | "abbrev_year": 99, 669 | "monday_weeknum": 20, 670 | "wday": 0, 671 | "time": time(12, 34, 56), 672 | }, 673 | datetime(2099, 5, 24, 12, 34, 56), 674 | ), 675 | ], 676 | ) 677 | def test_assemble_datetime(fields, dt): 678 | res = assemble_datetime(fields) 679 | if dt is None: 680 | assert res is None 681 | else: 682 | assert res == dt 683 | assert res.replace(tzinfo=None) == dt.replace(tzinfo=None) 684 | assert res.tzinfo == dt.tzinfo 685 | -------------------------------------------------------------------------------- /test/test_parse.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | import pytest 3 | from apachelogs import COMBINED, VHOST_COMBINED, LogParser 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "fmt,entry,fields", 8 | [ 9 | ( 10 | COMBINED, 11 | '209.126.136.4 - - [01/Nov/2017:07:28:29 +0000] "GET / HTTP/1.1" 301 521 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"', 12 | { 13 | "remote_host": "209.126.136.4", 14 | "remote_logname": None, 15 | "remote_user": None, 16 | "request_time": datetime(2017, 11, 1, 7, 28, 29, tzinfo=timezone.utc), 17 | "request_line": "GET / HTTP/1.1", 18 | "final_status": 301, 19 | "bytes_sent": 521, 20 | "headers_in": { 21 | "Referer": None, 22 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36", 23 | }, 24 | "directives": { 25 | "%h": "209.126.136.4", 26 | "%l": None, 27 | "%u": None, 28 | "%t": datetime(2017, 11, 1, 7, 28, 29, tzinfo=timezone.utc), 29 | "%r": "GET / HTTP/1.1", 30 | "%>s": 301, 31 | "%b": 521, 32 | "%{Referer}i": None, 33 | "%{User-Agent}i": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36", 34 | }, 35 | }, 36 | ), 37 | ( 38 | '"%400r" "%r"', 39 | '"-" "GET /index.html HTTP/1.1"', 40 | { 41 | "request_line": "GET /index.html HTTP/1.1", 42 | "directives": { 43 | "%400r": None, 44 | "%r": "GET /index.html HTTP/1.1", 45 | }, 46 | }, 47 | ), 48 | ( 49 | '"%r" "%400r"', 50 | '"GET /index.html HTTP/1.1" "-"', 51 | { 52 | "request_line": "GET /index.html HTTP/1.1", 53 | "directives": { 54 | "%r": "GET /index.html HTTP/1.1", 55 | "%400r": None, 56 | }, 57 | }, 58 | ), 59 | ( 60 | '"%!400r" "%r"', 61 | '"-" "GET /index.xml HTTP/1.1"', 62 | { 63 | "request_line": "GET /index.xml HTTP/1.1", 64 | "directives": { 65 | "%!400r": None, 66 | "%r": "GET /index.xml HTTP/1.1", 67 | }, 68 | }, 69 | ), 70 | ( 71 | '"%r" "%!400r"', 72 | '"GET /index.xml HTTP/1.1" "-"', 73 | { 74 | "request_line": "GET /index.xml HTTP/1.1", 75 | "directives": { 76 | "%r": "GET /index.xml HTTP/1.1", 77 | "%!400r": None, 78 | }, 79 | }, 80 | ), 81 | ( 82 | "%s", 83 | "201 202 203", 84 | { 85 | "original_status": 201, 86 | "status": 202, 87 | "final_status": 203, 88 | "directives": { 89 | "%s": 203, 92 | }, 93 | }, 94 | ), 95 | ( 96 | "%s", 97 | "000", 98 | { 99 | "status": 000, 100 | "directives": { 101 | "%s": 000, 102 | }, 103 | }, 104 | ), 105 | ( 106 | "%<{Referer}i %{Referer}i %>{Referer}i", 107 | "http://example.com/original http://example.com/default http://example.com/final", 108 | { 109 | "original_headers_in": { 
110 | "Referer": "http://example.com/original", 111 | }, 112 | "headers_in": { 113 | "Referer": "http://example.com/default", 114 | }, 115 | "final_headers_in": { 116 | "Referer": "http://example.com/final", 117 | }, 118 | "directives": { 119 | "%<{Referer}i": "http://example.com/original", 120 | "%{Referer}i": "http://example.com/default", 121 | "%>{Referer}i": "http://example.com/final", 122 | }, 123 | }, 124 | ), 125 | ( 126 | "%T %{ms}T", 127 | "1 1042", 128 | { 129 | "request_duration_seconds": 1, 130 | "request_duration_milliseconds": 1042, 131 | "directives": { 132 | "%T": 1, 133 | "%{ms}T": 1042, 134 | }, 135 | }, 136 | ), 137 | ( 138 | "%T %{MS}T", 139 | "1 1042", 140 | { 141 | "request_duration_seconds": 1, 142 | "request_duration_milliseconds": 1042, 143 | "directives": { 144 | "%T": 1, 145 | "%{MS}T": 1042, 146 | }, 147 | }, 148 | ), 149 | ( 150 | '%{%Y-%m-%d %H:%M:%S %z}t [%{msec_frac}t] %s %a:%{remote}p <-> %A:%p "%m" "%U%q" "%f" %P:%{tid}P "%R"', 151 | '2019-05-05 20:49:14 +0000 [690] 403 172.21.0.1:44782 <-> 172.21.0.2:80 "GET" "/wsgi/test?q=foo" "/usr/local/app/run.wsgi" 16:140168282543872 "wsgi-script"', 152 | { 153 | "request_time": datetime( 154 | 2019, 5, 5, 20, 49, 14, 690000, tzinfo=timezone.utc 155 | ), 156 | "request_time_fields": { 157 | "year": 2019, 158 | "mon": 5, 159 | "mday": 5, 160 | "hour": 20, 161 | "min": 49, 162 | "sec": 14, 163 | "timezone": timezone.utc, 164 | "msec_frac": 690, 165 | }, 166 | "status": 403, 167 | "remote_address": "172.21.0.1", 168 | "remote_port": 44782, 169 | "local_address": "172.21.0.2", 170 | "server_port": 80, 171 | "request_method": "GET", 172 | "request_uri": "/wsgi/test", 173 | "request_query": "?q=foo", 174 | "request_file": "/usr/local/app/run.wsgi", 175 | "pid": 16, 176 | "tid": 140168282543872, 177 | "handler": "wsgi-script", 178 | "directives": { 179 | "%{%Y}t": 2019, 180 | "%{%m}t": 5, 181 | "%{%d}t": 5, 182 | "%{%H}t": 20, 183 | "%{%M}t": 49, 184 | "%{%S}t": 14, 185 | "%{%z}t": timezone.utc, 186 | "%{msec_frac}t": 690, 187 | "%s": 403, 188 | "%a": "172.21.0.1", 189 | "%{remote}p": 44782, 190 | "%A": "172.21.0.2", 191 | "%p": 80, 192 | "%m": "GET", 193 | "%U": "/wsgi/test", 194 | "%q": "?q=foo", 195 | "%f": "/usr/local/app/run.wsgi", 196 | "%P": 16, 197 | "%{tid}P": 140168282543872, 198 | "%R": "wsgi-script", 199 | }, 200 | }, 201 | ), 202 | ( 203 | '%{%Y-%m-%d %H:%M:%S %z}t [%{msec_frac}t] %s %a:%{remote}p <-> %A:%p "%m" "%U%q" "%f" %P:%{hextid}P "%R"', 204 | r'2019-05-05 20:56:07 +0000 [148] 403 172.22.0.1:34488 <-> 172.22.0.2:80 "GET" "/wsgi/t\xc3\xa9st" "/usr/local/app/run.wsgi" 16:7fb9de5af700 "wsgi-script"', 205 | { 206 | "request_time": datetime( 207 | 2019, 5, 5, 20, 56, 7, 148000, tzinfo=timezone.utc 208 | ), 209 | "request_time_fields": { 210 | "year": 2019, 211 | "mon": 5, 212 | "mday": 5, 213 | "hour": 20, 214 | "min": 56, 215 | "sec": 7, 216 | "timezone": timezone.utc, 217 | "msec_frac": 148, 218 | }, 219 | "status": 403, 220 | "remote_address": "172.22.0.1", 221 | "remote_port": 34488, 222 | "local_address": "172.22.0.2", 223 | "server_port": 80, 224 | "request_method": "GET", 225 | "request_uri": "/wsgi/t\xc3\xa9st", 226 | "request_query": "", 227 | "request_file": "/usr/local/app/run.wsgi", 228 | "pid": 16, 229 | "tid": 140436276180736, 230 | "handler": "wsgi-script", 231 | "directives": { 232 | "%{%Y}t": 2019, 233 | "%{%m}t": 5, 234 | "%{%d}t": 5, 235 | "%{%H}t": 20, 236 | "%{%M}t": 56, 237 | "%{%S}t": 7, 238 | "%{%z}t": timezone.utc, 239 | "%{msec_frac}t": 148, 240 | "%s": 403, 241 | "%a": "172.22.0.1", 
242 | "%{remote}p": 34488, 243 | "%A": "172.22.0.2", 244 | "%p": 80, 245 | "%m": "GET", 246 | "%U": "/wsgi/t\xc3\xa9st", 247 | "%q": "", 248 | "%f": "/usr/local/app/run.wsgi", 249 | "%P": 16, 250 | "%{hextid}P": 140436276180736, 251 | "%R": "wsgi-script", 252 | }, 253 | }, 254 | ), 255 | ( 256 | "%a:%{REMOTE}p", 257 | "172.21.0.1:44782", 258 | { 259 | "remote_address": "172.21.0.1", 260 | "remote_port": 44782, 261 | "directives": { 262 | "%a": "172.21.0.1", 263 | "%{REMOTE}p": 44782, 264 | }, 265 | }, 266 | ), 267 | ( 268 | "%P:%{TID}P", 269 | "16:140168282543872", 270 | { 271 | "pid": 16, 272 | "tid": 140168282543872, 273 | "directives": { 274 | "%P": 16, 275 | "%{TID}P": 140168282543872, 276 | }, 277 | }, 278 | ), 279 | ( 280 | "%200f", 281 | "-", 282 | { 283 | "request_file": None, 284 | "directives": { 285 | "%200f": None, 286 | }, 287 | }, 288 | ), 289 | ( 290 | "%200f", 291 | "/var/www/html/index.html", 292 | { 293 | "request_file": "/var/www/html/index.html", 294 | "directives": { 295 | "%200f": "/var/www/html/index.html", 296 | }, 297 | }, 298 | ), 299 | ( 300 | "%200{%Y-%m-%d}t", 301 | "-", 302 | { 303 | "request_time": None, 304 | "request_time_fields": { 305 | "year": None, 306 | "mon": None, 307 | "mday": None, 308 | }, 309 | "directives": { 310 | "%200{%Y}t": None, 311 | "%200{%m}t": None, 312 | "%200{%d}t": None, 313 | }, 314 | }, 315 | ), 316 | ( 317 | "%200{%Y-%m-%d}t", 318 | "2019-05-06", 319 | { 320 | "request_time": None, 321 | "request_time_fields": { 322 | "year": 2019, 323 | "mon": 5, 324 | "mday": 6, 325 | }, 326 | "directives": { 327 | "%200{%Y}t": 2019, 328 | "%200{%m}t": 5, 329 | "%200{%d}t": 6, 330 | }, 331 | }, 332 | ), 333 | ( 334 | "%200t", 335 | "-", 336 | { 337 | "request_time": None, 338 | "request_time_fields": {"timestamp": None}, 339 | "directives": { 340 | "%200t": None, 341 | }, 342 | }, 343 | ), 344 | ( 345 | "%200{}t", 346 | "-", 347 | { 348 | "request_time": None, 349 | "request_time_fields": {"timestamp": None}, 350 | "directives": { 351 | "%200{}t": None, 352 | }, 353 | }, 354 | ), 355 | ( 356 | VHOST_COMBINED, 357 | r'www.varonathe.org:80 185.234.218.71 - - [14/Apr/2018:18:39:42 +0000] "GET / HTTP/1.1" 301 539 "-" "}__test|O:21:\"JDatabaseDriverMysqli\":3:{s:4:\"\\0\\0\\0a\";O:17:\"JSimplepieFactory\":0:{}s:21:\"\\0\\0\\0disconnectHandlers\";a:1:{i:0;a:2:{i:0;O:9:\"SimplePie\":5:{s:8:\"sanitize\";O:20:\"JDatabaseDriverMysql\":0:{}s:5:\"cache\";b:1;s:19:\"cache_name_function\";s:6:\"assert\";s:10:\"javascript\";i:9999;s:8:\"feed_url\";s:54:\"eval(base64_decode($_POST[111]));JFactory::get();exit;\";}i:1;s:4:\"init\";}}s:13:\"\\0\\0\\0connection\";i:1;}\xf0\x9d\x8c\x86"', 358 | { 359 | "virtual_host": "www.varonathe.org", 360 | "server_port": 80, 361 | "remote_host": "185.234.218.71", 362 | "remote_logname": None, 363 | "remote_user": None, 364 | "request_time": datetime(2018, 4, 14, 18, 39, 42, tzinfo=timezone.utc), 365 | "request_time_fields": { 366 | "timestamp": datetime(2018, 4, 14, 18, 39, 42, tzinfo=timezone.utc), 367 | }, 368 | "request_line": "GET / HTTP/1.1", 369 | "final_status": 301, 370 | "bytes_out": 539, 371 | "headers_in": { 372 | "Referer": None, 373 | "User-Agent": 
'}__test|O:21:"JDatabaseDriverMysqli":3:{s:4:"\\0\\0\\0a";O:17:"JSimplepieFactory":0:{}s:21:"\\0\\0\\0disconnectHandlers";a:1:{i:0;a:2:{i:0;O:9:"SimplePie":5:{s:8:"sanitize";O:20:"JDatabaseDriverMysql":0:{}s:5:"cache";b:1;s:19:"cache_name_function";s:6:"assert";s:10:"javascript";i:9999;s:8:"feed_url";s:54:"eval(base64_decode($_POST[111]));JFactory::get();exit;";}i:1;s:4:"init";}}s:13:"\\0\\0\\0connection";i:1;}\xf0\x9d\x8c\x86', 374 | }, 375 | "directives": { 376 | "%v": "www.varonathe.org", 377 | "%p": 80, 378 | "%h": "185.234.218.71", 379 | "%l": None, 380 | "%u": None, 381 | "%t": datetime(2018, 4, 14, 18, 39, 42, tzinfo=timezone.utc), 382 | "%r": "GET / HTTP/1.1", 383 | "%>s": 301, 384 | "%O": 539, 385 | "%{Referer}i": None, 386 | "%{User-Agent}i": '}__test|O:21:"JDatabaseDriverMysqli":3:{s:4:"\\0\\0\\0a";O:17:"JSimplepieFactory":0:{}s:21:"\\0\\0\\0disconnectHandlers";a:1:{i:0;a:2:{i:0;O:9:"SimplePie":5:{s:8:"sanitize";O:20:"JDatabaseDriverMysql":0:{}s:5:"cache";b:1;s:19:"cache_name_function";s:6:"assert";s:10:"javascript";i:9999;s:8:"feed_url";s:54:"eval(base64_decode($_POST[111]));JFactory::get();exit;";}i:1;s:4:"init";}}s:13:"\\0\\0\\0connection";i:1;}\xf0\x9d\x8c\x86', 387 | }, 388 | }, 389 | ), 390 | ( 391 | "%{%Y-%m-%dT%H:%M:%S}t.%>{usec_frac}t%>{%z}t", 407 | "2019-05-06T12:09:43.123456-0400", 408 | { 409 | "final_request_time": datetime( 410 | 2019, 411 | 5, 412 | 6, 413 | 12, 414 | 9, 415 | 43, 416 | 123456, 417 | tzinfo=timezone(timedelta(hours=-4)), 418 | ), 419 | "final_request_time_fields": { 420 | "year": 2019, 421 | "mon": 5, 422 | "mday": 6, 423 | "hour": 12, 424 | "min": 9, 425 | "sec": 43, 426 | "usec_frac": 123456, 427 | "timezone": timezone(timedelta(hours=-4)), 428 | }, 429 | "directives": { 430 | "%>{%Y}t": 2019, 431 | "%>{%m}t": 5, 432 | "%>{%d}t": 6, 433 | "%>{%H}t": 12, 434 | "%>{%M}t": 9, 435 | "%>{%S}t": 43, 436 | "%>{usec_frac}t": 123456, 437 | "%>{%z}t": timezone(timedelta(hours=-4)), 438 | }, 439 | }, 440 | ), 441 | ( 442 | "%m %% %U%q %% %H", 443 | "GET % /index.html?foo % HTTP/1.1", 444 | { 445 | "request_method": "GET", 446 | "request_uri": "/index.html", 447 | "request_query": "?foo", 448 | "request_protocol": "HTTP/1.1", 449 | "directives": { 450 | "%m": "GET", 451 | "%U": "/index.html", 452 | "%q": "?foo", 453 | "%H": "HTTP/1.1", 454 | }, 455 | }, 456 | ), 457 | ( 458 | "%<>s", 459 | "200", 460 | { 461 | "final_status": 200, 462 | "directives": { 463 | "%<>s": 200, 464 | }, 465 | }, 466 | ), 467 | ( 468 | "%200!200T", 479 | "-", 480 | { 481 | "final_request_duration_seconds": None, 482 | "directives": { 483 | "%>!200T": None, 484 | }, 485 | }, 486 | ), 487 | ( 488 | "%>{s}!200T", 489 | "-", 490 | { 491 | "final_request_duration_seconds": None, 492 | "directives": { 493 | "%>{s}!200T": None, 494 | }, 495 | }, 496 | ), 497 | ( 498 | "%{s}!200>T", 499 | "-", 500 | { 501 | "final_request_duration_seconds": None, 502 | "directives": { 503 | "%{s}!200>T": None, 504 | }, 505 | }, 506 | ), 507 | ( 508 | "%<{s}!200>T", 509 | "-", 510 | { 511 | "final_request_duration_seconds": None, 512 | "directives": { 513 | "%<{s}!200>T": None, 514 | }, 515 | }, 516 | ), 517 | ( 518 | "%><{s}!200>T", 519 | "-", 520 | { 521 | "final_request_duration_seconds": None, 522 | "directives": { 523 | "%><{s}!200>T": None, 524 | }, 525 | }, 526 | ), 527 | ( 528 | "%<200>T", 529 | "-", 530 | { 531 | "final_request_duration_seconds": None, 532 | "directives": { 533 | "%<200>T": None, 534 | }, 535 | }, 536 | ), 537 | ( 538 | "%<200T", 539 | "-", 540 | { 541 | 
"final_request_duration_seconds": None, 542 | "directives": { 543 | "%<200T": None, 544 | }, 545 | }, 546 | ), 547 | ( 548 | "%u", 549 | '""', 550 | { 551 | "remote_user": "", 552 | "directives": { 553 | "%u": "", 554 | }, 555 | }, 556 | ), 557 | ( 558 | "%U%q", 559 | "/wsgi/test?", 560 | { 561 | "request_uri": "/wsgi/test", 562 | "request_query": "?", 563 | "directives": { 564 | "%U": "/wsgi/test", 565 | "%q": "?", 566 | }, 567 | }, 568 | ), 569 | ], 570 | ) 571 | def test_parse(fmt, entry, fields): 572 | log_entry = LogParser(fmt).parse(entry) 573 | assert log_entry.entry == entry.rstrip("\r\n") 574 | assert log_entry.format == fmt 575 | for k, v in fields.items(): 576 | assert getattr(log_entry, k) == v 577 | -------------------------------------------------------------------------------- /test/test_parse_custom_time.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime, time, timedelta, timezone 2 | import locale 3 | import pytest 4 | from apachelogs import LogParser 5 | 6 | w5 = timezone(timedelta(hours=-5)) 7 | w4 = timezone(timedelta(hours=-4)) 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "fmt,entry,fields", 12 | [ 13 | ( 14 | "%{%a %b %d}t %r", 15 | "Sat Nov 25 GET / HTTP/1.1", 16 | { 17 | "request_line": "GET / HTTP/1.1", 18 | "request_time": None, 19 | "request_time_fields": { 20 | "abbrev_wday": "Sat", 21 | "abbrev_mon": "Nov", 22 | "mday": 25, 23 | }, 24 | "directives": { 25 | "%{%a}t": "Sat", 26 | "%{%b}t": "Nov", 27 | "%{%d}t": 25, 28 | "%r": "GET / HTTP/1.1", 29 | }, 30 | }, 31 | ), 32 | ( 33 | "%{%A %B %d}t %r", 34 | "Saturday November 25 GET / HTTP/1.1", 35 | { 36 | "request_line": "GET / HTTP/1.1", 37 | "request_time": None, 38 | "request_time_fields": { 39 | "full_wday": "Saturday", 40 | "full_mon": "November", 41 | "mday": 25, 42 | }, 43 | "directives": { 44 | "%{%A}t": "Saturday", 45 | "%{%B}t": "November", 46 | "%{%d}t": 25, 47 | "%r": "GET / HTTP/1.1", 48 | }, 49 | }, 50 | ), 51 | ( 52 | "%{%w (%u) %m/%d}t %r", 53 | "6 (6) 11/25 GET / HTTP/1.1", 54 | { 55 | "request_line": "GET / HTTP/1.1", 56 | "request_time": None, 57 | "request_time_fields": { 58 | "wday": 6, 59 | "iso_wday": 6, 60 | "mon": 11, 61 | "mday": 25, 62 | }, 63 | "directives": { 64 | "%{%w}t": 6, 65 | "%{%u}t": 6, 66 | "%{%m}t": 11, 67 | "%{%d}t": 25, 68 | "%r": "GET / HTTP/1.1", 69 | }, 70 | }, 71 | ), 72 | ( 73 | "%{%s}t %r", 74 | "1511642826 GET / HTTP/1.1", 75 | { 76 | "request_line": "GET / HTTP/1.1", 77 | "request_time": datetime(2017, 11, 25, 20, 47, 6, tzinfo=timezone.utc), 78 | "request_time_fields": {"epoch": 1511642826}, 79 | "directives": { 80 | "%{%s}t": 1511642826, 81 | "%r": "GET / HTTP/1.1", 82 | }, 83 | }, 84 | ), 85 | ( 86 | "%{%s@%z}t %r", 87 | "1511642826@-0500 GET / HTTP/1.1", 88 | { 89 | "request_line": "GET / HTTP/1.1", 90 | "request_time": datetime(2017, 11, 25, 15, 47, 6, tzinfo=w5), 91 | "request_time_fields": {"epoch": 1511642826, "timezone": w5}, 92 | "directives": { 93 | "%{%s}t": 1511642826, 94 | "%{%z}t": w5, 95 | "%r": "GET / HTTP/1.1", 96 | }, 97 | }, 98 | ), 99 | ( 100 | "%{%Y-%m-%d %H:%M:%S}t %r", 101 | "2017-11-25 20:47:06 GET / HTTP/1.1", 102 | { 103 | "request_line": "GET / HTTP/1.1", 104 | "request_time": datetime(2017, 11, 25, 20, 47, 6), 105 | "request_time_fields": { 106 | "year": 2017, 107 | "mon": 11, 108 | "mday": 25, 109 | "hour": 20, 110 | "min": 47, 111 | "sec": 6, 112 | }, 113 | "directives": { 114 | "%{%Y}t": 2017, 115 | "%{%m}t": 11, 116 | "%{%d}t": 25, 117 | "%{%H}t": 20, 118 | 
"%{%M}t": 47, 119 | "%{%S}t": 6, 120 | "%r": "GET / HTTP/1.1", 121 | }, 122 | }, 123 | ), 124 | ( 125 | "%{%Y-%m-%d %H:%M:%S %z}t %r", 126 | "2017-11-25 20:47:06 -0500 GET / HTTP/1.1", 127 | { 128 | "request_line": "GET / HTTP/1.1", 129 | "request_time": datetime(2017, 11, 25, 20, 47, 6, tzinfo=w5), 130 | "request_time_fields": { 131 | "year": 2017, 132 | "mon": 11, 133 | "mday": 25, 134 | "hour": 20, 135 | "min": 47, 136 | "sec": 6, 137 | "timezone": w5, 138 | }, 139 | "directives": { 140 | "%{%Y}t": 2017, 141 | "%{%m}t": 11, 142 | "%{%d}t": 25, 143 | "%{%H}t": 20, 144 | "%{%M}t": 47, 145 | "%{%S}t": 6, 146 | "%{%z}t": w5, 147 | "%r": "GET / HTTP/1.1", 148 | }, 149 | }, 150 | ), 151 | ( 152 | "%{%s}t@%{%z}t %r", 153 | "1511642826@-0500 GET / HTTP/1.1", 154 | { 155 | "request_line": "GET / HTTP/1.1", 156 | "request_time": datetime(2017, 11, 25, 15, 47, 6, tzinfo=w5), 157 | "request_time_fields": {"epoch": 1511642826, "timezone": w5}, 158 | "directives": { 159 | "%{%s}t": 1511642826, 160 | "%{%z}t": w5, 161 | "%r": "GET / HTTP/1.1", 162 | }, 163 | }, 164 | ), 165 | ( 166 | "%{%Y-%m-%d}t %{%H:%M:%S}t %r", 167 | "2017-11-25 20:47:06 GET / HTTP/1.1", 168 | { 169 | "request_line": "GET / HTTP/1.1", 170 | "request_time": datetime(2017, 11, 25, 20, 47, 6), 171 | "request_time_fields": { 172 | "year": 2017, 173 | "mon": 11, 174 | "mday": 25, 175 | "hour": 20, 176 | "min": 47, 177 | "sec": 6, 178 | }, 179 | "directives": { 180 | "%{%Y}t": 2017, 181 | "%{%m}t": 11, 182 | "%{%d}t": 25, 183 | "%{%H}t": 20, 184 | "%{%M}t": 47, 185 | "%{%S}t": 6, 186 | "%r": "GET / HTTP/1.1", 187 | }, 188 | }, 189 | ), 190 | ( 191 | "%{%Y-%m-%d}t %{%H:%M:%S}t %{%z}t %r", 192 | "2017-11-25 20:47:06 -0500 GET / HTTP/1.1", 193 | { 194 | "request_line": "GET / HTTP/1.1", 195 | "request_time": datetime(2017, 11, 25, 20, 47, 6, tzinfo=w5), 196 | "request_time_fields": { 197 | "year": 2017, 198 | "mon": 11, 199 | "mday": 25, 200 | "hour": 20, 201 | "min": 47, 202 | "sec": 6, 203 | "timezone": w5, 204 | }, 205 | "directives": { 206 | "%{%Y}t": 2017, 207 | "%{%m}t": 11, 208 | "%{%d}t": 25, 209 | "%{%H}t": 20, 210 | "%{%M}t": 47, 211 | "%{%S}t": 6, 212 | "%{%z}t": w5, 213 | "%r": "GET / HTTP/1.1", 214 | }, 215 | }, 216 | ), 217 | ( 218 | "%{%D %T}t", 219 | "05/06/19 13:42:26", 220 | { 221 | "request_time": datetime(2019, 5, 6, 13, 42, 26), 222 | "request_time_fields": { 223 | "date": date(2019, 5, 6), 224 | "time": time(13, 42, 26), 225 | }, 226 | "directives": { 227 | "%{%D}t": date(2019, 5, 6), 228 | "%{%T}t": time(13, 42, 26), 229 | }, 230 | }, 231 | ), 232 | ( 233 | "%{%D%%%T}t", 234 | "05/06/19%13:42:26", 235 | { 236 | "request_time": datetime(2019, 5, 6, 13, 42, 26), 237 | "request_time_fields": { 238 | "date": date(2019, 5, 6), 239 | "time": time(13, 42, 26), 240 | }, 241 | "directives": { 242 | "%{%D}t": date(2019, 5, 6), 243 | "%{%T}t": time(13, 42, 26), 244 | }, 245 | }, 246 | ), 247 | ( 248 | "%{%D%t%T}t", 249 | "05/06/19\t13:42:26", 250 | { 251 | "request_time": datetime(2019, 5, 6, 13, 42, 26), 252 | "request_time_fields": { 253 | "date": date(2019, 5, 6), 254 | "time": time(13, 42, 26), 255 | }, 256 | "directives": { 257 | "%{%D}t": date(2019, 5, 6), 258 | "%{%T}t": time(13, 42, 26), 259 | }, 260 | }, 261 | ), 262 | ( 263 | "%{%F %R:%S}t", 264 | "2019-05-06 13:42:26", 265 | { 266 | "request_time": datetime(2019, 5, 6, 13, 42, 26), 267 | "request_time_fields": { 268 | "date": date(2019, 5, 6), 269 | "hour_min": time(13, 42), 270 | "sec": 26, 271 | }, 272 | "directives": { 273 | "%{%F}t": date(2019, 5, 6), 274 | 
"%{%R}t": time(13, 42), 275 | "%{%S}t": 26, 276 | }, 277 | }, 278 | ), 279 | ( 280 | "%{begin:%F %R:%S}t", 281 | "2019-05-06 13:42:26", 282 | { 283 | "begin_request_time": datetime(2019, 5, 6, 13, 42, 26), 284 | "begin_request_time_fields": { 285 | "date": date(2019, 5, 6), 286 | "hour_min": time(13, 42), 287 | "sec": 26, 288 | }, 289 | "directives": { 290 | "%{begin:%F}t": date(2019, 5, 6), 291 | "%{begin:%R}t": time(13, 42), 292 | "%{begin:%S}t": 26, 293 | }, 294 | }, 295 | ), 296 | ( 297 | "%{end:%F %R:%S}t", 298 | "2019-05-06 13:42:26", 299 | { 300 | "end_request_time": datetime(2019, 5, 6, 13, 42, 26), 301 | "end_request_time_fields": { 302 | "date": date(2019, 5, 6), 303 | "hour_min": time(13, 42), 304 | "sec": 26, 305 | }, 306 | "directives": { 307 | "%{end:%F}t": date(2019, 5, 6), 308 | "%{end:%R}t": time(13, 42), 309 | "%{end:%S}t": 26, 310 | }, 311 | }, 312 | ), 313 | ( 314 | "%<{end:%F %R:%S}t", 315 | "2019-05-06 13:42:26", 316 | { 317 | "original_end_request_time": datetime(2019, 5, 6, 13, 42, 26), 318 | "original_end_request_time_fields": { 319 | "date": date(2019, 5, 6), 320 | "hour_min": time(13, 42), 321 | "sec": 26, 322 | }, 323 | "directives": { 324 | "%<{end:%F}t": date(2019, 5, 6), 325 | "%<{end:%R}t": time(13, 42), 326 | "%<{end:%S}t": 26, 327 | }, 328 | }, 329 | ), 330 | ( 331 | "%{}t", 332 | "[05/Nov/2017:02:01:01 -0500]", 333 | { 334 | "request_time": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 335 | "request_time_fields": { 336 | "timestamp": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 337 | }, 338 | "directives": { 339 | "%{}t": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 340 | }, 341 | }, 342 | ), 343 | ( 344 | "%{begin}t", 345 | "[05/Nov/2017:02:01:01 -0500]", 346 | { 347 | "begin_request_time": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 348 | "begin_request_time_fields": { 349 | "timestamp": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 350 | }, 351 | "directives": { 352 | "%{begin}t": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 353 | }, 354 | }, 355 | ), 356 | ( 357 | "%{end}t", 358 | "[05/Nov/2017:02:01:01 -0500]", 359 | { 360 | "end_request_time": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 361 | "end_request_time_fields": { 362 | "timestamp": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 363 | }, 364 | "directives": { 365 | "%{end}t": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 366 | }, 367 | }, 368 | ), 369 | ( 370 | "%{begin:}t", 371 | "[05/Nov/2017:02:01:01 -0500]", 372 | { 373 | "begin_request_time": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 374 | "begin_request_time_fields": { 375 | "timestamp": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 376 | }, 377 | "directives": { 378 | "%{begin:}t": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 379 | }, 380 | }, 381 | ), 382 | ( 383 | "%{end:}t", 384 | "[05/Nov/2017:02:01:01 -0500]", 385 | { 386 | "end_request_time": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 387 | "end_request_time_fields": { 388 | "timestamp": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 389 | }, 390 | "directives": { 391 | "%{end:}t": datetime(2017, 11, 5, 2, 1, 1, tzinfo=w5), 392 | }, 393 | }, 394 | ), 395 | ( 396 | "%{%Y%n%m%t%d}t", 397 | "2019 05 19", 398 | { 399 | "request_time": None, 400 | "request_time_fields": { 401 | "year": 2019, 402 | "mon": 5, 403 | "mday": 19, 404 | }, 405 | "directives": { 406 | "%{%Y}t": 2019, 407 | "%{%m}t": 5, 408 | "%{%d}t": 19, 409 | }, 410 | }, 411 | ), 412 | ( 413 | "%{%Y%n%m%t%d}t", 414 | "2019 \t 05 \n 19", 415 | { 416 | "request_time": None, 417 | "request_time_fields": { 418 | "year": 2019, 419 | "mon": 5, 420 | 
"mday": 19, 421 | }, 422 | "directives": { 423 | "%{%Y}t": 2019, 424 | "%{%m}t": 5, 425 | "%{%d}t": 19, 426 | }, 427 | }, 428 | ), 429 | ( 430 | "%{%Y%n%m%t%d}t", 431 | "20190519", 432 | { 433 | "request_time": None, 434 | "request_time_fields": { 435 | "year": 2019, 436 | "mon": 5, 437 | "mday": 19, 438 | }, 439 | "directives": { 440 | "%{%Y}t": 2019, 441 | "%{%m}t": 5, 442 | "%{%d}t": 19, 443 | }, 444 | }, 445 | ), 446 | ( 447 | "%200{%I:%M:%S %p}t", 448 | "12:34:56 ", 449 | { 450 | "request_time": None, 451 | "request_time_fields": { 452 | "hour12": 12, 453 | "min": 34, 454 | "sec": 56, 455 | "am_pm": "", 456 | }, 457 | "directives": { 458 | "%200{%I}t": 12, 459 | "%200{%M}t": 34, 460 | "%200{%S}t": 56, 461 | "%200{%p}t": "", 462 | }, 463 | }, 464 | ), 465 | ( 466 | "%200{%I:%M:%S %p}t", 467 | "-", 468 | { 469 | "request_time": None, 470 | "request_time_fields": { 471 | "hour12": None, 472 | "min": None, 473 | "sec": None, 474 | "am_pm": None, 475 | }, 476 | "directives": { 477 | "%200{%I}t": None, 478 | "%200{%M}t": None, 479 | "%200{%S}t": None, 480 | "%200{%p}t": None, 481 | }, 482 | }, 483 | ), 484 | ( 485 | "%{%s %Z}t", 486 | "1511642826 GMT", 487 | { 488 | "request_time": datetime(2017, 11, 25, 20, 47, 6, tzinfo=timezone.utc), 489 | "request_time_fields": { 490 | "epoch": 1511642826, 491 | "tzname": "GMT", 492 | }, 493 | "directives": { 494 | "%{%s}t": 1511642826, 495 | "%{%Z}t": "GMT", 496 | }, 497 | }, 498 | ), 499 | ( 500 | "%{%s %Z}t", 501 | "1511642826 UTC", 502 | { 503 | "request_time": datetime(2017, 11, 25, 20, 47, 6, tzinfo=timezone.utc), 504 | "request_time_fields": { 505 | "epoch": 1511642826, 506 | "tzname": "UTC", 507 | }, 508 | "directives": { 509 | "%{%s}t": 1511642826, 510 | "%{%Z}t": "UTC", 511 | }, 512 | }, 513 | ), 514 | ( 515 | "%{%s %Z}t", 516 | "1511642826 EST", 517 | { 518 | "request_time": datetime(2017, 11, 25, 15, 47, 6, tzinfo=w5), 519 | "request_time_fields": { 520 | "epoch": 1511642826, 521 | "tzname": "EST", 522 | }, 523 | "directives": { 524 | "%{%s}t": 1511642826, 525 | "%{%Z}t": "EST", 526 | }, 527 | }, 528 | ), 529 | ( 530 | "%{%s %Z}t", 531 | "1558378254 EDT", 532 | { 533 | "request_time": datetime(2019, 5, 20, 14, 50, 54, tzinfo=w4), 534 | "request_time_fields": { 535 | "epoch": 1558378254, 536 | "tzname": "EDT", 537 | }, 538 | "directives": { 539 | "%{%s}t": 1558378254, 540 | "%{%Z}t": "EDT", 541 | }, 542 | }, 543 | ), 544 | ( 545 | "%{%s %Z}t", 546 | "1558378254 XXX", 547 | { 548 | "request_time": datetime(2019, 5, 20, 18, 50, 54, tzinfo=timezone.utc), 549 | "request_time_fields": { 550 | "epoch": 1558378254, 551 | "tzname": "XXX", 552 | }, 553 | "directives": { 554 | "%{%s}t": 1558378254, 555 | "%{%Z}t": "XXX", 556 | }, 557 | }, 558 | ), 559 | ( 560 | "%{%FT%T %Z}t", 561 | "2019-02-20T14:54:43 GMT", 562 | { 563 | "request_time": datetime(2019, 2, 20, 14, 54, 43, tzinfo=timezone.utc), 564 | "request_time_fields": { 565 | "date": date(2019, 2, 20), 566 | "time": time(14, 54, 43), 567 | "tzname": "GMT", 568 | }, 569 | "directives": { 570 | "%{%F}t": date(2019, 2, 20), 571 | "%{%T}t": time(14, 54, 43), 572 | "%{%Z}t": "GMT", 573 | }, 574 | }, 575 | ), 576 | ( 577 | "%{%FT%T %Z}t", 578 | "2019-02-20T14:54:43 UTC", 579 | { 580 | "request_time": datetime(2019, 2, 20, 14, 54, 43, tzinfo=timezone.utc), 581 | "request_time_fields": { 582 | "date": date(2019, 2, 20), 583 | "time": time(14, 54, 43), 584 | "tzname": "UTC", 585 | }, 586 | "directives": { 587 | "%{%F}t": date(2019, 2, 20), 588 | "%{%T}t": time(14, 54, 43), 589 | "%{%Z}t": "UTC", 590 | }, 
591 | }, 592 | ), 593 | ( 594 | "%{%FT%T %Z}t", 595 | "2019-02-20T14:54:43 EST", 596 | { 597 | "request_time": datetime(2019, 2, 20, 14, 54, 43, tzinfo=w5), 598 | "request_time_fields": { 599 | "date": date(2019, 2, 20), 600 | "time": time(14, 54, 43), 601 | "tzname": "EST", 602 | }, 603 | "directives": { 604 | "%{%F}t": date(2019, 2, 20), 605 | "%{%T}t": time(14, 54, 43), 606 | "%{%Z}t": "EST", 607 | }, 608 | }, 609 | ), 610 | ( 611 | "%{%FT%T %Z}t", 612 | "2019-05-20T14:54:43 EDT", 613 | { 614 | "request_time": datetime(2019, 5, 20, 14, 54, 43, tzinfo=w4), 615 | "request_time_fields": { 616 | "date": date(2019, 5, 20), 617 | "time": time(14, 54, 43), 618 | "tzname": "EDT", 619 | }, 620 | "directives": { 621 | "%{%F}t": date(2019, 5, 20), 622 | "%{%T}t": time(14, 54, 43), 623 | "%{%Z}t": "EDT", 624 | }, 625 | }, 626 | ), 627 | ( 628 | "%{%FT%T %Z}t", 629 | "2019-05-20T14:54:43 XXX", 630 | { 631 | "request_time": datetime(2019, 5, 20, 14, 54, 43), 632 | "request_time_fields": { 633 | "date": date(2019, 5, 20), 634 | "time": time(14, 54, 43), 635 | "tzname": "XXX", 636 | }, 637 | "directives": { 638 | "%{%F}t": date(2019, 5, 20), 639 | "%{%T}t": time(14, 54, 43), 640 | "%{%Z}t": "XXX", 641 | }, 642 | }, 643 | ), 644 | ], 645 | ) 646 | def test_parse_custom_time(fmt, entry, fields): 647 | log_entry = LogParser(fmt, encoding="utf-8").parse(entry) 648 | for k, v in fields.items(): 649 | assert getattr(log_entry, k) == v 650 | 651 | 652 | @pytest.mark.parametrize( 653 | "fmt,entry,fields", 654 | [ 655 | ( 656 | "%{%d %b %Y %H:%M:%S %z}t", 657 | "19 Mär 2019 01:39:12 +0000", 658 | { 659 | "request_time": datetime(2019, 3, 19, 1, 39, 12, tzinfo=timezone.utc), 660 | "request_time_fields": { 661 | "mday": 19, 662 | "abbrev_mon": "Mär", 663 | "year": 2019, 664 | "hour": 1, 665 | "min": 39, 666 | "sec": 12, 667 | "timezone": timezone.utc, 668 | }, 669 | "directives": { 670 | "%{%d}t": 19, 671 | "%{%b}t": "Mär", 672 | "%{%Y}t": 2019, 673 | "%{%H}t": 1, 674 | "%{%M}t": 39, 675 | "%{%S}t": 12, 676 | "%{%z}t": timezone.utc, 677 | }, 678 | }, 679 | ), 680 | ( 681 | "%{%d %B %Y %H:%M:%S %z}t", 682 | "19 März 2019 01:39:12 +0000", 683 | { 684 | "request_time": datetime(2019, 3, 19, 1, 39, 12, tzinfo=timezone.utc), 685 | "request_time_fields": { 686 | "mday": 19, 687 | "full_mon": "März", 688 | "year": 2019, 689 | "hour": 1, 690 | "min": 39, 691 | "sec": 12, 692 | "timezone": timezone.utc, 693 | }, 694 | "directives": { 695 | "%{%d}t": 19, 696 | "%{%B}t": "März", 697 | "%{%Y}t": 2019, 698 | "%{%H}t": 1, 699 | "%{%M}t": 39, 700 | "%{%S}t": 12, 701 | "%{%z}t": timezone.utc, 702 | }, 703 | }, 704 | ), 705 | ( 706 | "%{%G--%V %a %H:%M:%S}t", 707 | "2019--20 So 12:34:56", 708 | { 709 | "request_time": datetime(2019, 5, 19, 12, 34, 56), 710 | "request_time_fields": { 711 | "iso_year": 2019, 712 | "iso_weeknum": 20, 713 | "abbrev_wday": "So", 714 | "hour": 12, 715 | "min": 34, 716 | "sec": 56, 717 | }, 718 | "directives": { 719 | "%{%G}t": 2019, 720 | "%{%V}t": 20, 721 | "%{%a}t": "So", 722 | "%{%H}t": 12, 723 | "%{%M}t": 34, 724 | "%{%S}t": 56, 725 | }, 726 | }, 727 | ), 728 | ( 729 | "%{%G--%V %A %H:%M:%S}t", 730 | "2019--20 Sonntag 12:34:56", 731 | { 732 | "request_time": datetime(2019, 5, 19, 12, 34, 56), 733 | "request_time_fields": { 734 | "iso_year": 2019, 735 | "iso_weeknum": 20, 736 | "full_wday": "Sonntag", 737 | "hour": 12, 738 | "min": 34, 739 | "sec": 56, 740 | }, 741 | "directives": { 742 | "%{%G}t": 2019, 743 | "%{%V}t": 20, 744 | "%{%A}t": "Sonntag", 745 | "%{%H}t": 12, 746 | "%{%M}t": 34, 747 | 
"%{%S}t": 56, 748 | }, 749 | }, 750 | ), 751 | ], 752 | ) 753 | def test_parse_custom_german_time(fmt, entry, fields): 754 | oldlocale = locale.setlocale(locale.LC_ALL) 755 | try: 756 | locale.setlocale(locale.LC_ALL, "de_DE.UTF-8") 757 | except locale.Error: 758 | pytest.skip("Locale not supported") 759 | else: 760 | entry = LogParser(fmt).parse(entry) 761 | for k, v in fields.items(): 762 | assert getattr(entry, k) == v 763 | finally: 764 | locale.setlocale(locale.LC_ALL, oldlocale) 765 | --------------------------------------------------------------------------------