├── tests ├── __init__.py └── test_exporters.py ├── examples ├── example │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ ├── example1.py │ │ ├── example2.py │ │ ├── example4.py │ │ └── example3.py │ ├── items.py │ └── settings.py ├── README.rst └── scrapy.cfg ├── setup.cfg ├── MANIFEST.in ├── CHANGES.rst ├── scrapy_xlsx ├── __init__.py └── exporters.py ├── tox.ini ├── README.rst ├── .github └── workflows │ └── test.yml ├── setup.py ├── LICENSE.txt └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/example/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = W503 3 | extend-ignore = E203 4 | max-line-length = 88 5 | exclude = .tox,.git,__pycache__,.mypy_cache,.pytest_cache 6 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include CHANGES.rst 3 | include LICENSE.txt 4 | recursive-include tests *.py 5 | recursive-include examples *.py *.rst *.cfg -------------------------------------------------------------------------------- /examples/example/items.py: -------------------------------------------------------------------------------- 1 | from scrapy import Field, Item 2 | 3 | 4 | class ExampleItem(Item): 5 | a = Field() 6 | b = Field() 7 | c = Field() 8 | -------------------------------------------------------------------------------- /examples/example/settings.py: -------------------------------------------------------------------------------- 1 | BOT_NAME = "example" 2 | 3 | SPIDER_MODULES = ["example.spiders"] 4 | NEWSPIDER_MODULE = "example.spiders" 5 | 6 | FEED_EXPORTERS = {"xlsx": "scrapy_xlsx.XlsxItemExporter"} 7 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | Changes 2 | ======= 3 | 4 | 0.1.1 (2019-04-17) 5 | ------------------ 6 | 7 | * Fix the package name in the README file. 8 | 9 | 0.1.0 (2019-04-17) 10 | ------------------ 11 | 12 | * Initial release. -------------------------------------------------------------------------------- /scrapy_xlsx/__init__.py: -------------------------------------------------------------------------------- 1 | from .exporters import XlsxItemExporter # noqa: F401 2 | 3 | __version__ = "0.1.1" 4 | __author__ = "Jesús Losada Novo" 5 | __license__ = "MIT" 6 | __copyright__ = "Copyright 2019 Jesús Losada Novo" 7 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | This directory contains a Scrapy project with some examples 2 | explaining the different exporter options. 3 | You can run the spiders using this command:: 4 | 5 | scrapy crawl example1 -o example1.xlsx 6 | -------------------------------------------------------------------------------- /examples/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = example.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = example 12 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37,py38,py39,py310,py311,format,lint 3 | skip_missing_interpreters = true 4 | 5 | [testenv] 6 | deps = 7 | pytest 8 | commands = 9 | pytest {posargs} 10 | 11 | [testenv:format] 12 | deps = 13 | black 14 | commands = black --check scrapy_xlsx tests 15 | 16 | [testenv:lint] 17 | deps = 18 | flake8 19 | commands = flake8 scrapy_xlsx tests 20 | -------------------------------------------------------------------------------- /examples/example/spiders/example1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple spider to demonstrate how to use the XLSX exporter. 3 | 4 | This spider produces the following output: 5 | 6 | +-----+----+---+ 7 | | a | b | c | 8 | +=====+====+===+ 9 | | foo | 42 | | 10 | +-----+----+---+ 11 | """ 12 | from scrapy import Spider 13 | 14 | from ..items import ExampleItem 15 | 16 | 17 | class Example1Spider(Spider): 18 | name = "example1" 19 | allowed_domains = ["example.com"] 20 | start_urls = ["http://example.com/"] 21 | 22 | def parse(self, response): 23 | return ExampleItem(a="foo", b=42) 24 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | scrapy-xlsx 3 | =========== 4 | 5 | **scrapy-xlsx** is a `Scrapy`_ exporter that supports the XLSX format. It 6 | produces files that can be read with Microsoft Excel or LibreOffice Calc. 7 | 8 | Usage 9 | ----- 10 | 11 | Install the library using `pip`_:: 12 | 13 | $ pip install scrapy-xlsx 14 | 15 | Configure the exporter in your Scrapy project ``settings.py`` file:: 16 | 17 | FEED_EXPORTERS = { 18 | 'xlsx': 'scrapy_xlsx.XlsxItemExporter', 19 | } 20 | 21 | Run your spider and export the data to XLSX (this command will overwrite the 22 | output file if it already exists):: 23 | 24 | $ scrapy crawl myspider -o output.xlsx 25 | 26 | License 27 | ------- 28 | 29 | Licensed under the MIT License. 30 | 31 | .. _Scrapy: https://scrapy.org/ 32 | .. _pip: https://pypi.org/project/pip/ -------------------------------------------------------------------------------- /examples/example/spiders/example2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple spider to demonstrate how to use the XLSX exporter. 3 | 4 | This spider produces the following output: 5 | 6 | +-----+----+-----+ 7 | | foo | 42 | bar | 8 | +-----+----+-----+ 9 | """ 10 | from scrapy import Spider 11 | from scrapy_xlsx import XlsxItemExporter 12 | 13 | from ..items import ExampleItem 14 | 15 | 16 | class CustomExporter(XlsxItemExporter): 17 | def __init__(self, file, **kwargs): 18 | super().__init__(file, include_header_row=False, **kwargs) 19 | 20 | 21 | class Example2Spider(Spider): 22 | name = "example2" 23 | allowed_domains = ["example.com"] 24 | start_urls = ["http://example.com/"] 25 | 26 | custom_settings = { 27 | "FEED_EXPORTERS": {"xlsx": "example.spiders.example2.CustomExporter"} 28 | } 29 | 30 | def parse(self, response): 31 | return ExampleItem(a="foo", b=42, c="bar") 32 | -------------------------------------------------------------------------------- /examples/example/spiders/example4.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple spider to demonstrate how to use the XLSX exporter. 3 | 4 | This spider produces the following output: 5 | 6 | +-----+----+---------+ 7 | | a | b | c | 8 | +=====+====+=========+ 9 | | foo | 42 | (empty) | 10 | +-----+----+---------+ 11 | """ 12 | from scrapy import Spider 13 | from scrapy_xlsx import XlsxItemExporter 14 | 15 | from ..items import ExampleItem 16 | 17 | 18 | class CustomExporter(XlsxItemExporter): 19 | def __init__(self, file, **kwargs): 20 | super().__init__(file, default_value="(empty)", **kwargs) 21 | 22 | 23 | class Example4Spider(Spider): 24 | name = "example4" 25 | allowed_domains = ["example.com"] 26 | start_urls = ["http://example.com/"] 27 | 28 | custom_settings = { 29 | "FEED_EXPORTERS": {"xlsx": "example.spiders.example4.CustomExporter"} 30 | } 31 | 32 | def parse(self, response): 33 | return ExampleItem(a="foo", b=42) 34 | -------------------------------------------------------------------------------- /examples/example/spiders/example3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple spider to demonstrate how to use the XLSX exporter. 3 | 4 | This spider produces the following output: 5 | 6 | +-----+----+-------+ 7 | | a | b | c | 8 | +=====+====+=======+ 9 | | foo | 42 | a|b|c | 10 | +-----+----+-------+ 11 | """ 12 | from scrapy import Spider 13 | from scrapy_xlsx import XlsxItemExporter 14 | 15 | from ..items import ExampleItem 16 | 17 | 18 | class CustomExporter(XlsxItemExporter): 19 | def __init__(self, file, **kwargs): 20 | super().__init__(file, join_multivalued="|", **kwargs) 21 | 22 | 23 | class Example3Spider(Spider): 24 | name = "example3" 25 | allowed_domains = ["example.com"] 26 | start_urls = ["http://example.com/"] 27 | 28 | custom_settings = { 29 | "FEED_EXPORTERS": {"xlsx": "example.spiders.example3.CustomExporter"} 30 | } 31 | 32 | def parse(self, response): 33 | return ExampleItem(a="foo", b=42, c=["a", "b", "c"]) 34 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test Python package 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | - cron: "0 8 * * 6" 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | max-parallel: 4 15 | matrix: 16 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install tox 28 | - name: Format 29 | env: 30 | TOXENV: format 31 | run: | 32 | tox 33 | - name: Lint 34 | env: 35 | TOXENV: lint 36 | run: | 37 | tox 38 | - name: Run tests 39 | env: 40 | TOXENV: py 41 | run: | 42 | tox 43 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | with open("README.rst") as f: 5 | readme = f.read() 6 | 7 | 8 | setup( 9 | name="scrapy-xlsx", 10 | version="0.1.1", 11 | description="XLSX exporter for Scrapy", 12 | long_description=readme, 13 | author="Jesús Losada Novo", 14 | author_email="dev@jesuslosada.com", 15 | url="https://github.com/jesuslosada/scrapy-xlsx", 16 | license="MIT", 17 | packages=["scrapy_xlsx"], 18 | install_requires=["scrapy", "openpyxl"], 19 | extras_require={"testing": ["pytest", "tox"]}, 20 | classifiers=[ 21 | "Development Status :: 4 - Beta", 22 | "Intended Audience :: Developers", 23 | "License :: OSI Approved :: MIT License", 24 | "Natural Language :: English", 25 | "Operating System :: OS Independent", 26 | "Programming Language :: Python", 27 | "Programming Language :: Python :: 3", 28 | "Topic :: Software Development :: Libraries", 29 | ], 30 | keywords="scrapy xlsx exporter", 31 | ) 32 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jesús Losada Novo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /scrapy_xlsx/exporters.py: -------------------------------------------------------------------------------- 1 | from openpyxl import Workbook 2 | from openpyxl.cell.cell import KNOWN_TYPES 3 | from scrapy.exporters import BaseItemExporter 4 | 5 | 6 | class XlsxItemExporter(BaseItemExporter): 7 | """XlsxItemExporter allows exporting the output items to a XLSX file.""" 8 | 9 | def __init__( 10 | self, 11 | file, 12 | include_header_row=True, 13 | join_multivalued=",", 14 | default_value=None, 15 | **kwargs 16 | ): 17 | self._configure(kwargs, dont_fail=True) 18 | 19 | self.file = file 20 | self.include_header_row = include_header_row 21 | self._join_multivalued = join_multivalued 22 | self.default_value = default_value 23 | self._headers_not_written = True 24 | 25 | self.workbook = Workbook(write_only=True) 26 | self.sheet = self.workbook.create_sheet() 27 | 28 | def serialize_field(self, field, name, value): 29 | serializer = field.get("serializer", self._default_serializer) 30 | return serializer(value) 31 | 32 | def _default_serializer(self, value): 33 | """ 34 | Provide a valid XLSX serialization for value. 35 | 36 | This method serializes the item fields trying to respect their type. 37 | Strings, numbers, booleans and dates are handled by openpyxl and they 38 | should appear with proper formatting in the output file. Lists and 39 | tuples of strings are converted into a single string when possible. 40 | Complex types like dict or set do not have a proper representation in 41 | XLSX format so they will just be converted into a string. You can 42 | override this method to provide a custom serialization, like a JSON 43 | representation using json.dumps(). Individual scrapy.Item fields can 44 | provide a custom serializer too: 45 | my_field = Field(serializer=custom_serializer) 46 | """ 47 | # Do not modify values supported by openpyxl. 48 | if isinstance(value, KNOWN_TYPES): 49 | return value 50 | 51 | # Convert lists and tuples of strings into a single string. 52 | if self._join_multivalued is not None and isinstance(value, (list, tuple)): 53 | try: 54 | return self._join_multivalued.join(value) 55 | except TypeError: 56 | pass 57 | 58 | # Convert complex types like dict into a string as fallback mechanism. 59 | return str(value) 60 | 61 | def export_item(self, item): 62 | if self._headers_not_written: 63 | self._headers_not_written = False 64 | self._write_headers_and_set_fields_to_export(item) 65 | 66 | fields = self._get_serialized_fields( 67 | item, default_value=self.default_value, include_empty=True 68 | ) 69 | values = list(value for _, value in fields) 70 | self.sheet.append(values) 71 | 72 | def finish_exporting(self): 73 | # XXX: ideally, Scrapy would pass the filename and let the exporter 74 | # create the output file, however, it passes a file object already 75 | # open in "append" mode, so this method ignores this file object and 76 | # only uses it to retrieve the filename. 77 | self.workbook.save(self.file.name) 78 | 79 | def _write_headers_and_set_fields_to_export(self, item): 80 | """ 81 | Write the header row using the field names of the first item. 82 | 83 | This method writes the header row using the field names of the first 84 | exported item. This works fine with scrapy.Item objects because they 85 | provide a formal schema definition, but you need to be careful when 86 | using dictionaries that may omit some fields. It is recommended to 87 | set fields_to_export when using dictionaries to avoid omitting fields 88 | accidentally. 89 | """ 90 | if self.fields_to_export is None: 91 | if isinstance(item, dict): 92 | self.fields_to_export = list(item.keys()) 93 | else: 94 | self.fields_to_export = list(item.fields.keys()) 95 | 96 | if self.include_header_row: 97 | row = list(self.fields_to_export) 98 | self.sheet.append(row) 99 | -------------------------------------------------------------------------------- /tests/test_exporters.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tempfile import NamedTemporaryFile 3 | 4 | from openpyxl import load_workbook 5 | from scrapy import Field, Item 6 | 7 | from scrapy_xlsx import XlsxItemExporter 8 | 9 | 10 | class XlsxItemExporterTest(unittest.TestCase): 11 | def setUp(self): 12 | class TestItem(Item): 13 | name = Field() 14 | age = Field() 15 | 16 | self.item = TestItem(name="John", age="42") 17 | self.output = NamedTemporaryFile() 18 | self.exporter = self._get_exporter() 19 | 20 | def tearDown(self): 21 | self.output.close() 22 | 23 | def _get_exporter(self, **kwargs): 24 | return XlsxItemExporter(self.output, **kwargs) 25 | 26 | def _check_basic_workflow(self, item): 27 | self.exporter.start_exporting() 28 | self.exporter.export_item(item) 29 | self.exporter.finish_exporting() 30 | 31 | def test_export_scrapy_item(self): 32 | self._check_basic_workflow(self.item) 33 | 34 | def test_export_dict_item(self): 35 | self._check_basic_workflow(dict(self.item)) 36 | 37 | def test_fields_to_export(self): 38 | ie = self._get_exporter(fields_to_export=["name"]) 39 | self.assertEqual(list(ie._get_serialized_fields(self.item)), [("name", "John")]) 40 | 41 | def test_serialize_field(self): 42 | res = self.exporter.serialize_field( 43 | self.item.fields["name"], "name", self.item["name"] 44 | ) 45 | self.assertEqual(res, "John") 46 | 47 | res = self.exporter.serialize_field( 48 | self.item.fields["age"], "age", self.item["age"] 49 | ) 50 | self.assertEqual(res, "42") 51 | 52 | def test_serialize_field_join_multivalued(self): 53 | res = self.exporter.serialize_field({}, "name", ["a", "b", "c"]) 54 | self.assertEqual(res, "a,b,c") 55 | 56 | exporter = self._get_exporter(join_multivalued=None) 57 | res = exporter.serialize_field({}, "name", ["a", "b", "c"]) 58 | self.assertEqual(res, "['a', 'b', 'c']") 59 | 60 | exporter = self._get_exporter(join_multivalued="|") 61 | res = exporter.serialize_field({}, "name", ["a", "b", "c"]) 62 | self.assertEqual(res, "a|b|c") 63 | 64 | exporter = self._get_exporter(join_multivalued="|") 65 | res = exporter.serialize_field({}, "name", ["a", 1]) 66 | self.assertEqual(res, "['a', 1]") 67 | 68 | def test_field_custom_serializer(self): 69 | def custom_serializer(value): 70 | return value.lower() 71 | 72 | class CustomItem(Item): 73 | name = Field(serializer=custom_serializer) 74 | age = Field() 75 | 76 | item = CustomItem(name="John", age="42") 77 | self.assertEqual( 78 | self.exporter.serialize_field(item.fields["name"], "name", item["name"]), 79 | "john", 80 | ) 81 | self.assertEqual( 82 | self.exporter.serialize_field(item.fields["age"], "age", item["age"]), "42" 83 | ) 84 | 85 | def test_output_content(self): 86 | exporter = self._get_exporter(fields_to_export=["age", "name"]) 87 | exporter.start_exporting() 88 | exporter.export_item(self.item) 89 | exporter.finish_exporting() 90 | 91 | workbook = load_workbook(self.output) 92 | sheet = workbook.active 93 | 94 | self.assertEqual(sheet.max_row, 2) 95 | self.assertEqual(sheet.max_column, 2) 96 | 97 | expected_rows = [["age", "name"], ["42", "John"]] 98 | for row, row_values in enumerate(sheet.iter_rows()): 99 | for column, cell in enumerate(row_values): 100 | self.assertEqual(cell.value, expected_rows[row][column]) 101 | 102 | def test_output_content_no_header_row(self): 103 | exporter = self._get_exporter( 104 | include_header_row=False, fields_to_export=["age", "name"] 105 | ) 106 | exporter.start_exporting() 107 | exporter.export_item(self.item) 108 | exporter.finish_exporting() 109 | 110 | workbook = load_workbook(self.output) 111 | sheet = workbook.active 112 | 113 | self.assertEqual(sheet.max_row, 1) 114 | self.assertEqual(sheet.max_column, 2) 115 | 116 | expected_rows = [["42", "John"]] 117 | for row, row_values in enumerate(sheet.iter_rows()): 118 | for column, cell in enumerate(row_values): 119 | self.assertEqual(cell.value, expected_rows[row][column]) 120 | 121 | def test_output_content_default_value(self): 122 | exporter = self._get_exporter( 123 | fields_to_export=["age", "name"], default_value="-" 124 | ) 125 | exporter.start_exporting() 126 | exporter.export_item({"name": "John"}) 127 | exporter.finish_exporting() 128 | 129 | workbook = load_workbook(self.output) 130 | sheet = workbook.active 131 | 132 | self.assertEqual(sheet.max_row, 2) 133 | self.assertEqual(sheet.max_column, 2) 134 | 135 | expected_rows = [["age", "name"], ["-", "John"]] 136 | for row, row_values in enumerate(sheet.iter_rows()): 137 | for column, cell in enumerate(row_values): 138 | self.assertEqual(cell.value, expected_rows[row][column]) 139 | --------------------------------------------------------------------------------