├── setup.cfg ├── ckanext ├── __init__.py └── importlib │ ├── __init__.py │ ├── tests │ ├── __init__.py │ ├── samples │ │ ├── test_importer_full.xls │ │ ├── test_importer_example.xls │ │ ├── test_importer_bis_example.xls │ │ ├── test_importer_example.csv │ │ └── test_importer_full.csv │ ├── test_spreadsheet_importer.py │ ├── test_spreadsheet_import_files.py │ └── test_loader.py │ ├── api_command.py │ ├── command.py │ ├── importer.py │ ├── spreadsheet_importer.py │ └── loader.py ├── .gitignore ├── pip-requirements.txt ├── setup.py ├── test.ini └── README.txt /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | with-pylons = test.ini 3 | -------------------------------------------------------------------------------- /ckanext/__init__.py: -------------------------------------------------------------------------------- 1 | __import__("pkg_resources").declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /ckanext/importlib/__init__.py: -------------------------------------------------------------------------------- 1 | __import__("pkg_resources").declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /ckanext/importlib/tests/__init__.py: -------------------------------------------------------------------------------- 1 | __import__("pkg_resources").declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | syntax:glob 2 | # generic 3 | *.pyc 4 | *~ 5 | .DS_Store 6 | *.egg-info/* 7 | sandbox/* 8 | -------------------------------------------------------------------------------- /pip-requirements.txt: -------------------------------------------------------------------------------- 1 | xlrd 2 | xlwt 3 | -e 
git+https://github.com/okfn/ckanclient.git#egg=ckanclient 4 | -------------------------------------------------------------------------------- /ckanext/importlib/tests/samples/test_importer_full.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okfn/ckanext-importlib/master/ckanext/importlib/tests/samples/test_importer_full.xls -------------------------------------------------------------------------------- /ckanext/importlib/tests/samples/test_importer_example.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okfn/ckanext-importlib/master/ckanext/importlib/tests/samples/test_importer_example.xls -------------------------------------------------------------------------------- /ckanext/importlib/tests/samples/test_importer_bis_example.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okfn/ckanext-importlib/master/ckanext/importlib/tests/samples/test_importer_bis_example.xls -------------------------------------------------------------------------------- /ckanext/importlib/tests/samples/test_importer_example.csv: -------------------------------------------------------------------------------- 1 | "name","title","resource-0-url","resource-0-format","resource-0-description","tags" 2 | "wikipedia","Wikipedia","http://static.wikipedia.org/downloads/2008-06/en/wikipedia-en-html.tar.7z","html","In English","encyclopedia reference" 3 | "tviv","TV IV","http://tviv.org/Category:Grids","","","tv encyclopedia" 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='ckanext-importlib', 5 | version='0.1', 6 | author='Open Knowledge Foundation', 7 | author_email='info@okfn.org', 8 | 
license='AGPL', 9 | url='http://ckan.org/', 10 | description='CKAN importer and loader library', 11 | keywords='data packaging component tool server', 12 | namespace_packages=['ckanext', 'ckanext.importlib'], 13 | install_requires=[ 14 | # List of dependencies is moved to pip-requirements.txt 15 | # to avoid conflicts with Debian packaging. 16 | #'xlrd>=0.7.1', 17 | #'xlwt>=0.7.2', 18 | ], 19 | packages=find_packages(exclude=['ez_setup']), 20 | include_package_data=True, 21 | package_data={'ckan': ['i18n/*/LC_MESSAGES/*.mo']}, 22 | entry_points=""" 23 | """, 24 | test_suite = 'nose.collector', 25 | ) 26 | -------------------------------------------------------------------------------- /test.ini: -------------------------------------------------------------------------------- 1 | # 2 | # ckan - Pylons testing environment configuration 3 | # 4 | # The %(here)s variable will be replaced with the parent directory of this file 5 | # 6 | [DEFAULT] 7 | debug = true 8 | # Uncomment and replace with the address which should receive any error reports 9 | #email_to = you@yourdomain.com 10 | smtp_server = localhost 11 | error_email_from = paste@localhost 12 | 13 | [server:main] 14 | use = egg:Paste#http 15 | host = 0.0.0.0 16 | port = 5000 17 | 18 | 19 | [app:main] 20 | use = config:../ckan/test.ini 21 | 22 | 23 | # Logging configuration 24 | [loggers] 25 | keys = root, ckan, sqlalchemy 26 | 27 | [handlers] 28 | keys = console 29 | 30 | [formatters] 31 | keys = generic 32 | 33 | [logger_root] 34 | level = WARN 35 | handlers = console 36 | 37 | [logger_ckan] 38 | qualname = ckan 39 | handlers = 40 | level = INFO 41 | 42 | [logger_sqlalchemy] 43 | handlers = 44 | qualname = sqlalchemy.engine 45 | level = WARN 46 | 47 | [handler_console] 48 | class = StreamHandler 49 | args = (sys.stdout,) 50 | level = NOTSET 51 | formatter = generic 52 | 53 | [formatter_generic] 54 | format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s 55 | 
-------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Library for importing datasets into CKAN using the API. 2 | 3 | Introduction 4 | ============ 5 | 6 | A one-off import of metadata into CKAN isn't very hard, and using ckanclient directly is probably best for that. But when you are importing continuously you face some challenges which this library aims to help with: 7 | 8 | * when you reimport a dataset you want to check whether it already exists in CKAN, using an ID stored in an extra field and possibly another extra field naming the source 9 | 10 | * you may import resources, which become grouped into datasets (e.g. time series data) - ResourceSeriesLoader 11 | 12 | * when you derive a unique name for a dataset from its title, you need to avoid clashes. 13 | 14 | ckanext-importlib was designed as a framework to be expanded, based on the needs of the data.gov.uk ONS importer. In practice, however, it is not very flexible. Even if you don't use it directly, you may want to borrow ideas and code from it. 
15 | 16 | Quickstart 17 | ========== 18 | 19 | To get the code:: 20 | 21 | hg clone https://github.com/okfn/ckanext-importlib.git 22 | 23 | The code also requires installed: 24 | * importlib dependencies (pip-requirements.txt) 25 | * ckan 26 | * ckan dependencies (ckan/pip-requirements.txt) 27 | 28 | To install the dependencies into a virtual environment:: 29 | 30 | virtualenv pyenv 31 | pip -E pyenv install -e ../ckanext-importlib 32 | pip -E pyenv install -e ckan 33 | pip -E ../pyenv-ckanext-importlib install -r ../ckan/pip-requirements.txt 34 | pip -E pyenv install -r pip-requirements.txt 35 | 36 | 37 | Tests 38 | ===== 39 | 40 | To run the tests:: 41 | 42 | pip -E pyenv install -e nose 43 | cd ckanext-importlib 44 | nosetests --ckan ckanext/importlib/tests/ 45 | -------------------------------------------------------------------------------- /ckanext/importlib/api_command.py: -------------------------------------------------------------------------------- 1 | from command import Command 2 | 3 | from ckanclient import CkanClient 4 | 5 | class ApiCommand(Command): 6 | def __init__(self, usage=None): 7 | ''' 8 | Base class for commands that use the API 9 | ''' 10 | self.parser = Command.StandardParser(usage=usage) 11 | super(ApiCommand, self).__init__() 12 | 13 | def add_options(self): 14 | self.parser.add_option("-H", "--host", 15 | dest="api_url", 16 | help="API URL (e.g.: http://test.ckan.net/api)") 17 | self.parser.add_option("-k", "--key", 18 | dest="api_key", 19 | help="API Key (required)") 20 | self.parser.add_option("-u", "--username", 21 | dest="username", 22 | help="Username for HTTP Basic Authentication") 23 | self.parser.add_option("-p", "--password", 24 | dest="password", 25 | help="Password for HTTP Basic Authentication") 26 | 27 | def command(self): 28 | super(ApiCommand, self).command() 29 | if not self.options.api_key: 30 | self.parser.error('Please specify an API Key') 31 | if not self.options.api_url: 32 | self.parser.error('Please specify an 
API URL') 33 | if self.options.api_url: 34 | if not (self.options.api_url.startswith('http://') or \ 35 | self.options.api_url.startswith('https://')): 36 | self.parser.error('--host must start with "http://"') 37 | if not '/api' in self.options.api_url: 38 | self.parser.error('--host must have "/api" towards the end') 39 | user_agent = self.user_agent if hasattr(self, 'user_agent') else 'ckanext-importlib/ApiCommand' 40 | 41 | self.client = CkanClient(base_location=self.options.api_url, 42 | api_key=self.options.api_key, 43 | http_user=self.options.username, 44 | http_pass=self.options.password, 45 | is_verbose=True, 46 | user_agent=user_agent) 47 | 48 | # now do command 49 | -------------------------------------------------------------------------------- /ckanext/importlib/tests/samples/test_importer_full.csv: -------------------------------------------------------------------------------- 1 | "name","title","version","url","author","author_email","maintainer","maintainer_email","notes","state","license","isopen","tags","groups","ckan_url","relationships","metadata_modified","metadata_created","notes_rendered","genre","original media","resource-0-url","resource-0-format","resource-0-description","resource-0-hash","resource-0-name","resource-0-resource_type","resource-0-mimetype","resource-0-mimetype_inner","resource-0-size","resource-0-last_modified","resource-0-cache_url","resource-0-cache_last_updated","resource-0-webstore_url","resource-0-webstore_last_updated","resource-0-alt_url","resource-0-size_extra","resource-1-url","resource-1-format","resource-1-description","resource-1-hash","resource-1-name","resource-1-resource_type","resource-1-mimetype","resource-1-mimetype_inner","resource-1-size","resource-1-last_modified","resource-1-cache_url","resource-1-cache_last_updated","resource-1-webstore_url","resource-1-webstore_last_updated","resource-1-alt_url","resource-1-size_extra" 2 | "annakarenina","A Novel By 
Tolstoy","0.7a","http://www.annakarenina.com","","","","","Some test notes 3 | 4 | ### A 3rd level heading 5 | 6 | **Some bolded text.** 7 | 8 | *Some italicized text.* 9 | 10 | Foreign characters: 11 | u with umlaut ü 12 | 66-style quote “ 13 | foreign word: thümb 14 | 15 | Needs escaping: 16 | left arrow < 17 | 18 | 19 | 20 | ","active","OKD Compliant::Other (Open)",True,"Flexible ァ russian tolstoy","david roger","http://test.ckan.net/dataset/annakarenina","","2011-12-09T17:15:57.440192","2011-12-09T17:15:57.440192","

Some test notes 21 |

22 | 23 |

A 3rd level heading

24 |

Some bolded text. 25 |

26 |

Some italicized text. 27 |

28 |

Foreign characters: 29 | u with umlaut ü 30 | 66-style quote “ 31 | foreign word: thümb 32 |

33 |

Needs escaping: 34 | left arrow < 35 |

36 |

http://ckan.net/ 37 |

","romantic novel","book","http://www.annakarenina.com/download/x=1&y=2","plain text","Full text. Needs escaping: "" Umlaut: ü","abc123","","","","","","","","","","","alt123","123","http://www.annakarenina.com/index.json","json","Index of the novel","def456","","","","","","","","","","","alt345","345" 38 | "warandpeace","A Wonderful Story","","","","","","","","active","Non-OKD Compliant::Creative Commons Non-Commercial (Any)",False,"Flexible ァ russian","david","http://test.ckan.net/dataset/warandpeace","","2011-12-09T17:15:57.440192","2011-12-09T17:15:57.440192","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","" 39 | -------------------------------------------------------------------------------- /ckanext/importlib/command.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from optparse import OptionParser 4 | import logging 5 | from ConfigParser import ConfigParser 6 | 7 | class Command(object): 8 | """ 9 | (this class is copied from :module:`ordf.command`) 10 | 11 | This class is very similar to :class:`paste.script.command.Command` but 12 | rather than implementing a :program:`paster` plugin it is for stand-alone 13 | command line programs. To implement a command line program, sub-class this 14 | class, and make a minimal method to instantiate and run it. As with the 15 | paster counterpart you have to add an option parser and a method called 16 | :meth:`command`. A minimal example: 17 | 18 | .. 
code-block:: python 19 | 20 | class Hello(Command): 21 | def command(self): 22 | print "hello world" 23 | 24 | def hello(): 25 | Hello().command() 26 | 27 | To create the actual script, in your package's *setup.py* needs an entry 28 | point like:: 29 | 30 | [console_scripts] 31 | hello=mypackage.command:hello 32 | 33 | and then run one of:: 34 | 35 | % python setup.py develop 36 | % python setup.py install 37 | """ 38 | def __init__(self): 39 | usage = self.usage if hasattr(self, 'usage') else None 40 | self.parser = Command.StandardParser(usage=usage) 41 | self.add_options() 42 | self.parse_args() 43 | self.setup_logging() 44 | super(Command, self).__init__() 45 | 46 | @classmethod 47 | def StandardParser(cls, *av, **kw): 48 | parser = OptionParser(*av, **kw) 49 | parser.add_option("-l", "--logfile", 50 | dest="logfile", default=None, 51 | help="log to file") 52 | parser.add_option("-v", "--verbosity", 53 | dest="verbosity", default="info", 54 | help="log verbosity. one of debug, info, warning, error, critical") 55 | return parser 56 | 57 | def parse_args(self): 58 | self.options, self.args = self.parser.parse_args() 59 | 60 | def add_options(self): 61 | pass 62 | 63 | def setup_logging(self): 64 | ## set up logging 65 | logcfg = { 66 | "level": logging.INFO, 67 | "format": "%(asctime)s %(levelname)s [%(name)s] %(message)s", 68 | } 69 | if self.options.logfile: 70 | logcfg["filename"] = self.options.logfile 71 | if self.options.verbosity: 72 | levels = { 73 | "debug": logging.DEBUG, 74 | "info": logging.INFO, 75 | "warning": logging.WARNING, 76 | "error": logging.ERROR, 77 | "critical": logging.CRITICAL 78 | } 79 | logcfg["level"] = levels.get(self.options.verbosity, logging.NOTSET) 80 | logging.basicConfig(**logcfg) 81 | 82 | def command(self): 83 | pass 84 | 85 | def config(filename): 86 | cfgpath = os.path.abspath(filename) 87 | cfgfile = ConfigParser({ "here": os.path.dirname(cfgpath) }) 88 | cfgfile.read(cfgpath) 89 | 90 | cfg = {} 91 | if 
cfgfile.has_section("app:main"): 92 | cfg.update(cfgfile.items("app:main")) 93 | return cfg 94 | 95 | class ConfiguredCommand(Command): 96 | '''The same as Command, only with the --config option.''' 97 | def __init__(self): 98 | super(ConfiguredCommand, self).__init__() 99 | self.parse_config() 100 | 101 | @classmethod 102 | def StandardParser(cls, *av, **kw): 103 | parser = super(ConfiguredCommand, cls).StandardParser(*av, **kw) 104 | parser.add_option("-c", "--config", 105 | dest="config", default="development.ini", 106 | help="configuration file (default: development.ini)") 107 | return parser 108 | 109 | def parse_config(self): 110 | self.config = {} 111 | 112 | if self.options.config: 113 | cfg = config(self.options.config) 114 | self.config.update(cfg) 115 | 116 | -------------------------------------------------------------------------------- /ckanext/importlib/importer.py: -------------------------------------------------------------------------------- 1 | import StringIO 2 | 3 | import re 4 | import datetime 5 | 6 | class ImportException(Exception): 7 | pass 8 | 9 | class RowParseError(ImportException): 10 | pass 11 | 12 | class DataRecords(object): 13 | '''Represents raw data records in the form of a dictionary. 14 | (The raw data is not yet processed - it will be converted to package_dict 15 | in the next step.) 16 | ''' 17 | @property 18 | def records(self): 19 | '''Yields each record as a dict.''' 20 | raise NotImplementedError 21 | 22 | 23 | class PackageImporter(object): 24 | '''Base class for an importer that converts a particular file type 25 | and creates corresponding package dictionaries.''' 26 | _log = [] 27 | 28 | def __init__(self, filepath=None, buf=None): 29 | assert filepath or buf, 'Must specify a filepath or a buf.' 
30 | self._filepath = filepath 31 | self._buf = buf 32 | self.import_into_package_records() 33 | 34 | def import_into_package_records(self): 35 | '''Reads in the source file given by self._filepath and 36 | stores the resulting DataRecords in self._package_data_records.''' 37 | raise NotImplementedError() 38 | 39 | @classmethod 40 | def log(cls, msg): 41 | cls._log.append(msg) 42 | 43 | @classmethod 44 | def get_log(cls): 45 | return cls._log 46 | 47 | @classmethod 48 | def clear_log(cls): 49 | cls._log = [] 50 | 51 | def record_2_package(self, record_dict): 52 | '''Converts a raw record into a package dictionary. 53 | @param record_dict - the raw record 54 | @return - pkg_dict''' 55 | raise NotImplementedError() 56 | 57 | def pkg_dict(self): 58 | '''Generates package dicts from the package data records.''' 59 | for row_dict in self._package_data_records.records: 60 | try: 61 | yield self.record_2_package(row_dict) 62 | except RowParseError, e: 63 | print 'Error with row', e 64 | raise StopIteration 65 | 66 | @classmethod 67 | def license_2_license_id(self, license_title, logger=None): 68 | # import is here, as it creates a dependency on ckan, which 69 | # many importers won't want 70 | from ckan.model.license import LicenseRegister 71 | 72 | licenses = LicenseRegister() 73 | license_obj = licenses.get_by_title(license_title) 74 | if license_obj: 75 | return u'%s' % license_obj.id 76 | else: 77 | logger('Warning: No license name matches \'%s\'. Ignoring license.' % license_title) 78 | 79 | 80 | @classmethod 81 | def munge(self, name): 82 | '''Munge a title into a name. 83 | 84 | Note this function must be only carefully changed, as reimporting 85 | data with a name munged differently may create duplicates packages. 86 | For this reason, this munge function is for use by the importers only. 87 | Other users should use the API slug creation functionality. 
88 | ''' 89 | # import is here, as it creates a dependency on ckan, which 90 | # many importers won't want 91 | import ckan.model as model 92 | 93 | # convert spaces to underscores 94 | name = re.sub(' ', '_', name).lower() 95 | # convert symbols to dashes 96 | name = re.sub('[:]', '_-', name).lower() 97 | name = re.sub('[/]', '-', name).lower() 98 | # take out not-allowed characters 99 | name = re.sub('[^a-zA-Z0-9-_]', '', name).lower() 100 | # remove double underscores 101 | name = re.sub('__', '_', name).lower() 102 | # if longer than max_length, keep last word if a year 103 | max_length = model.PACKAGE_NAME_MAX_LENGTH - 5 104 | # (make length less than max, in case we need a few for '_' chars 105 | # to de-clash names.) 106 | if len(name) > max_length: 107 | year_match = re.match('.*?[_-]((?:\d{2,4}[-/])?\d{2,4})$', name) 108 | if year_match: 109 | year = year_match.groups()[0] 110 | name = '%s-%s' % (name[:(max_length-len(year)-1)], year) 111 | else: 112 | name = name[:max_length] 113 | return name 114 | 115 | @classmethod 116 | def name_munge(self, input_name): 117 | '''Munges the name field in case it is not to spec. 118 | 119 | Note this function must be only carefully changed, as reimporting 120 | data with a name munged differently may create duplicates packages. 121 | For this reason, this munge function is for use by the importers only. 122 | Other users should use the API slug creation functionality. 
123 | ''' 124 | return self.munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and')) 125 | 126 | @classmethod 127 | def tidy_url(self, url, logger=None): 128 | if url and not url.startswith('http') and not url.startswith('webcal:'): 129 | if url.startswith('www.'): 130 | url = url.replace('www.', 'http://www.') 131 | else: 132 | logger('Warning: URL doesn\'t start with http: %s' % url) 133 | return url 134 | 135 | 136 | -------------------------------------------------------------------------------- /ckanext/importlib/tests/test_spreadsheet_importer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pylons import config 4 | 5 | import ckanext.importlib.spreadsheet_importer as spreadsheet_importer 6 | 7 | TEST_DIR = os.path.dirname(os.path.abspath(__file__)) 8 | EXAMPLES_DIR = os.path.join(TEST_DIR, 'samples') 9 | EXAMPLE_FILEBASE = 'test_importer' 10 | EXAMPLE_TESTFILE_SUFFIX = '_example' 11 | EXAMPLE_BIS_TESTFILE_SUFFIX = '_bis_example' 12 | XL_EXTENSION = '.xls' 13 | CSV_EXTENSION = '.csv' 14 | EXTENSIONS = [CSV_EXTENSION, XL_EXTENSION] 15 | SPREADSHEET_DATA_MAP = {XL_EXTENSION:spreadsheet_importer.XlData, 16 | CSV_EXTENSION:spreadsheet_importer.CsvData} 17 | 18 | class ExampleFiles(object): 19 | def __init__(self, examples_dir, example_filebase): 20 | ''' 21 | Easy accessor for info about test fixture files. 
22 | @param examples_dir - absolute 23 | ''' 24 | self.examples_dir = examples_dir 25 | self.example_filebase = example_filebase 26 | 27 | def get_spreadsheet_filepath(self, test_file_suffix, extension): 28 | return os.path.join(self.examples_dir, self.example_filebase + test_file_suffix + extension) 29 | 30 | def get_data(self, test_file_suffix, extension=XL_EXTENSION): 31 | logger = BasicLogger() 32 | filepath = self.get_spreadsheet_filepath(test_file_suffix, extension) 33 | return SPREADSHEET_DATA_MAP[extension](logger, filepath=filepath) 34 | 35 | examples = ExampleFiles(EXAMPLES_DIR, EXAMPLE_FILEBASE) 36 | 37 | class BasicLogger: 38 | def __init__(self): 39 | self.log = [] 40 | 41 | 42 | class TestSpreadsheetData: 43 | def test_0_example_file_by_filepath(self): 44 | for extension in EXTENSIONS: 45 | logger = BasicLogger() 46 | filepath = examples.get_spreadsheet_filepath(EXAMPLE_TESTFILE_SUFFIX, extension) 47 | data = SPREADSHEET_DATA_MAP[extension](logger, filepath=filepath) 48 | self.assert_example_data(data) 49 | assert logger.log == [], logger.log 50 | 51 | def test_1_example_file_by_buf(self): 52 | for extension in EXTENSIONS: 53 | logger = BasicLogger() 54 | filepath = examples.get_spreadsheet_filepath(EXAMPLE_TESTFILE_SUFFIX, extension) 55 | f = open(filepath, 'rb') 56 | buf = f.read() 57 | f.close() 58 | data = SPREADSHEET_DATA_MAP[extension](logger, buf=buf) 59 | self.assert_example_data(data) 60 | assert logger.log == [], logger.log 61 | 62 | def assert_example_data(self, data): 63 | num_rows = data.get_num_rows() 64 | assert 3 <= num_rows <= 4, num_rows 65 | rows = data.get_all_rows() 66 | assert len(rows) == num_rows 67 | first_row = data.get_row(0) 68 | assert first_row == rows[0] 69 | assert rows[0] == [u'name', u'title', u'resource-0-url', u'resource-0-format', u'resource-0-description', u'tags'], rows[0] 70 | assert rows[1] == [u'wikipedia', u'Wikipedia', u'http://static.wikipedia.org/downloads/2008-06/en/wikipedia-en-html.tar.7z', u'html', 
u'In English', u'encyclopedia reference'], rows[1] 71 | # xl gives None and csv gives u'' for blank cells 72 | assert rows[2] == [u'tviv', u'TV IV', u'http://tviv.org/Category:Grids', u'', u'', u'tv encyclopedia'] or \ 73 | rows[2] == [u'tviv', u'TV IV', u'http://tviv.org/Category:Grids', None, None, u'tv encyclopedia'], rows[2] 74 | if num_rows == 4: 75 | assert rows[3] == [], rows[3] 76 | 77 | class TestDataRecords: 78 | def test_0_example(self): 79 | data = examples.get_data(EXAMPLE_TESTFILE_SUFFIX, XL_EXTENSION) 80 | data_records = spreadsheet_importer.SpreadsheetDataRecords(data, 'title') 81 | assert data_records.titles == data.get_row(0), data_records.titles 82 | records = [record for record in data_records.records] 83 | assert len(records) == 2, records 84 | assert records[0].items() == [ 85 | (u'name', u'wikipedia'), 86 | (u'title', u'Wikipedia'), 87 | (u'resource-0-url', u'http://static.wikipedia.org/downloads/2008-06/en/wikipedia-en-html.tar.7z'), 88 | (u'resource-0-format', u'html'), 89 | (u'resource-0-description', u'In English'), 90 | (u'tags', u'encyclopedia reference'), 91 | ], records[0].items() 92 | assert records[1].items() == [ 93 | (u'name', u'tviv'), 94 | (u'title', u'TV IV'), 95 | (u'resource-0-url', u'http://tviv.org/Category:Grids'), 96 | (u'resource-0-format', None), 97 | (u'resource-0-description', None), 98 | (u'tags', u'tv encyclopedia'), 99 | ], records[1].items() 100 | 101 | def test_1_bis_example(self): 102 | data = examples.get_data(EXAMPLE_BIS_TESTFILE_SUFFIX, XL_EXTENSION) 103 | data_records = spreadsheet_importer.SpreadsheetDataRecords(data, 'Dataset Ref#') 104 | assert data_records.titles[:3] == [None, 'Dataset Ref#', 'Dataset Status'], data_records.titles 105 | records = [record for record in data_records.records] 106 | assert len(records) == 2, records 107 | assert records[0]['Dataset Ref#'] == 'BIS-000002', records[0]['Dataset Ref#'] 108 | assert records[1]['Dataset Ref#'] == 'BIS-000003', records[1]['Dataset Ref#'] 109 | 110 
| class TestPackageImporter: 111 | def test_munge(self): 112 | def test_munge(title, expected_munge): 113 | munge = spreadsheet_importer.SpreadsheetPackageImporter.munge(title) 114 | assert munge == expected_munge, 'Got %s not %s' % (munge, expected_munge) 115 | test_munge('Adult participation in learning', 'adult_participation_in_learning') 116 | test_munge('Alcohol Profile: Alcohol-specific hospital admission, males', 'alcohol_profile_-_alcohol-specific_hospital_admission_males') 117 | test_munge('Age and limiting long-term illness by NS-SeC', 'age_and_limiting_long-term_illness_by_ns-sec') 118 | test_munge('Higher Education Statistics: HE qualifications obtained in the UK by level, mode of study, domicile, gender, class of first degree and subject area 2001/02', 'higher_education_statistics_-_he_qualifications_obtained_in_the_uk_by_level_mode_of_stu-2001-02') 119 | 120 | def test_0_example_by_filepath(self): 121 | for extension in EXTENSIONS: 122 | filepath = examples.get_spreadsheet_filepath(EXAMPLE_TESTFILE_SUFFIX, extension) 123 | package_import = spreadsheet_importer.SpreadsheetPackageImporter(filepath=filepath) 124 | self.assert_example_package_import(package_import) 125 | 126 | def assert_example_package_import(self, package_import): 127 | pkg_dicts = [pkg_dict for pkg_dict in package_import.pkg_dict()] 128 | assert len(pkg_dicts) == 2, pkg_dicts 129 | assert pkg_dicts[0].items() == [(u'name', u'wikipedia'), (u'title', u'Wikipedia'), ('resources', [{'url': u'http://static.wikipedia.org/downloads/2008-06/en/wikipedia-en-html.tar.7z', 'alt_url': u'', 'hash': u'', 'description': u'In English', 'format': u'html'}]), (u'tags', u'encyclopedia reference')], pkg_dicts[0].items() 130 | assert pkg_dicts[1].items() == [(u'name', u'tviv'), (u'title', u'TV IV'), ('resources', [{'url': u'http://tviv.org/Category:Grids', 'alt_url': u'', 'hash': u'', 'description': u'', 'format': u''}]), (u'tags', u'tv encyclopedia')], pkg_dicts[1].items() 131 | 
-------------------------------------------------------------------------------- /ckanext/importlib/tests/test_spreadsheet_import_files.py: -------------------------------------------------------------------------------- 1 | import types 2 | import tempfile 3 | import os 4 | 5 | from sqlalchemy.util import OrderedDict 6 | from pylons import config 7 | 8 | import ckan.model as model 9 | from ckan.tests import * 10 | from ckanext.importlib import importer 11 | from ckanext.importlib import spreadsheet_importer 12 | from ckanext.importlib.spreadsheet_importer import readonly_keys 13 | import ckan.lib.dumper as dumper 14 | 15 | TEST_DIR = os.path.dirname(os.path.abspath(__file__)) 16 | TEST_FILES_DIR = os.path.join(TEST_DIR, 'samples') + '/' 17 | TEST_FILE_FULL = 'test_importer_full' 18 | TEST_FILE_EXAMPLE = 'test_importer_example' 19 | XL_EXTENSION = '.xls' 20 | CSV_EXTENSION = '.csv' 21 | EXTENSIONS = [XL_EXTENSION, CSV_EXTENSION] 22 | 23 | EXAMPLE_XL_DICTS = [ 24 | OrderedDict( 25 | [('name', 'wikipedia'), 26 | ('title', 'Wikipedia'), 27 | ('resource-0-url', 'http://static.wikipedia.org/downloads/2008-06/en/wikipedia-en-html.tar.7z'), 28 | ('resource-0-format', 'html'), 29 | ('resource-0-description', 'In English'), 30 | ('tags', 'encyclopedia reference')]), 31 | OrderedDict( 32 | [('name', 'tviv'), 33 | ('title', 'TV IV'), 34 | ('resource-0-url', 'http://tviv.org/Category:Grids'), 35 | ('tags', 'tv encyclopedia')]), 36 | ] 37 | 38 | pkg_to_xl_dict = dumper.PackagesXlWriter.pkg_to_xl_dict 39 | 40 | # This test recreates the sample files 41 | class Test0FilesCreation(TestController): 42 | @classmethod 43 | def setup_class(self): 44 | model.repo.init_db() 45 | CreateTestData.create() 46 | full_row_dicts = [pkg_to_xl_dict(pkg) for pkg in [model.Package.by_name(u'annakarenina'), model.Package.by_name(u'warandpeace')]] 47 | creators = [ (dumper.PackagesXlWriter, XL_EXTENSION), 48 | (dumper.PackagesCsvWriter, CSV_EXTENSION), 49 | ] 50 | for creator, extension in creators: 
51 | creator(full_row_dicts).save(open(TEST_FILES_DIR + TEST_FILE_FULL + extension, 'wb')) 52 | creator(EXAMPLE_XL_DICTS).save(open(TEST_FILES_DIR + TEST_FILE_EXAMPLE + extension, 'wb')) 53 | 54 | @classmethod 55 | def teardown_class(self): 56 | model.repo.rebuild_db() 57 | 58 | def test_exist(self): 59 | for filename in (TEST_FILE_EXAMPLE, TEST_FILE_FULL): 60 | for extension in EXTENSIONS: 61 | filepath = TEST_FILES_DIR + filename + extension 62 | assert os.path.exists(filepath), filepath 63 | 64 | class Test1Import(TestController): 65 | @classmethod 66 | def setup_class(self): 67 | model.Session.remove() 68 | model.repo.init_db() 69 | CreateTestData.create() 70 | anna = model.Package.by_name(u'annakarenina') 71 | war = model.Package.by_name(u'warandpeace') 72 | self.anna_xl_dict = pkg_to_xl_dict(anna) 73 | self.war_xl_dict = pkg_to_xl_dict(war) 74 | self.anna_fs_dict = pkg_to_fs_dict(anna) 75 | self.war_fs_dict = pkg_to_fs_dict(war) 76 | self.full_buf = {} # extension:filebuf 77 | for extension in EXTENSIONS: 78 | filepath = TEST_FILES_DIR + TEST_FILE_FULL + XL_EXTENSION 79 | assert os.path.exists(filepath) 80 | f = open(filepath) 81 | self.full_buf[extension] = f.read() 82 | f.close() 83 | 84 | @classmethod 85 | def teardown_class(self): 86 | model.Session.remove() 87 | model.repo.rebuild_db() 88 | 89 | def _get_row(self, sheet, row_index): 90 | return [cell.value for cell in sheet.row(row_index)] 91 | 92 | def test_0_pkg_to_xl_dict(self): 93 | d = self.anna_xl_dict 94 | for key, value in d.items(): 95 | assert isinstance(d[key], (str, unicode, types.NoneType)), '%s:%s %s' % (key, value, type(value)) 96 | for key in ['name', 'license', 'tags', 'groups', 'genre', 97 | 'notes_rendered', 'metadata_modified', 'metadata_created']: 98 | assert d.has_key(key), key 99 | for key in ['id', 'license_id', 'ratings_average', 'extras']: 100 | assert not d.has_key(key), key 101 | 102 | def test_1_pkg_to_fs_dict(self): 103 | d = self.anna_fs_dict 104 | for key, value in 
d.items(): 105 | if key == 'extras': 106 | assert isinstance(d[key], dict), '%s:%s %s' % (key, value, type(value)) 107 | elif key == 'resources': 108 | assert isinstance(d[key], list), '%s:%s %s' % (key, value, type(value)) 109 | # check each resource 110 | for value in d[key]: 111 | assert isinstance(value, dict), '%s %s' % (value, type(value)) 112 | else: 113 | assert isinstance(d[key], (str, unicode, types.NoneType)), '%s:%s %s' % (key, value, type(value)) 114 | for key in ['name', 'license_id', 'tags', 'groups', 'extras']: 115 | assert d.has_key(key), '%s not in %s' % (key, d) 116 | for key in ['id', 'license', 'ratings_average', 'genre', 'ckan_url']: 117 | assert not d.has_key(key), key 118 | 119 | def test_2_creator_xl_file(self): 120 | import xlrd 121 | assert self.full_buf[XL_EXTENSION] 122 | 123 | book = xlrd.open_workbook(file_contents=self.full_buf[XL_EXTENSION]) 124 | assert book.nsheets == 1, book.nsheets 125 | sheet = book.sheet_by_index(0) 126 | titles = self._get_row(sheet, 0) 127 | assert titles[:2] == ['name', 'title'], titles 128 | row1 = self._get_row(sheet, 1) 129 | assert row1[:2] == ['annakarenina', 'A Novel By Tolstoy'], row1 130 | row2 = self._get_row(sheet, 2) 131 | assert row2[:2] == ['warandpeace', 'A Wonderful Story'], row2 132 | 133 | def test_3_read_full_buf(self): 134 | comparison_dicts = [self.anna_fs_dict, self.war_fs_dict] 135 | for extension in EXTENSIONS: 136 | log = self._test_read(buf=self.full_buf[extension], expected_dicts=comparison_dicts) 137 | assert not log, log 138 | 139 | def test_3_read_full_file(self): 140 | comparison_dicts = [self.anna_fs_dict, self.war_fs_dict] 141 | for extension in EXTENSIONS: 142 | filepath = TEST_FILES_DIR + TEST_FILE_FULL + extension 143 | log = self._test_read(filepath=filepath, expected_dicts=comparison_dicts) 144 | assert not log, log 145 | 146 | def test_4_read_example_file(self): 147 | comparison_dicts = [pkg_xl_dict_to_fs_dict(xl_dict) for xl_dict in EXAMPLE_XL_DICTS] 148 | for 
extension in EXTENSIONS: 149 | log = self._test_read(filepath=TEST_FILES_DIR + TEST_FILE_EXAMPLE + extension, expected_dicts=comparison_dicts) 150 | 151 | def _test_read(self, buf=None, filepath=None, expected_dicts=None): 152 | reader = spreadsheet_importer.SpreadsheetPackageImporter(buf=buf, filepath=filepath) 153 | index = 0 154 | for pkg_dict in reader.pkg_dict(): 155 | for key, comp_val in expected_dicts[index].items(): 156 | err_msg = 'Package \'%s\', Key %s should be: \n%s' % (pkg_dict['name'], repr(key), repr(comp_val)) 157 | if comp_val: 158 | assert pkg_dict.has_key(key), err_msg 159 | err_msg += ', but is: \n%s' % (repr(pkg_dict[key])) 160 | if key in ('groups', 'tags'): 161 | #order doesn't matter 162 | a = set(); b = set() 163 | [a.add(val) for val in pkg_dict[key].split(' ')] 164 | [b.add(val) for val in comp_val.split(' ')] 165 | assert a == b, err_msg 166 | elif key == 'license_id': 167 | assert pkg_dict[key] == str(comp_val), err_msg 168 | else: 169 | assert pkg_dict[key] == comp_val, err_msg 170 | else: 171 | assert not pkg_dict.has_key(key), err_msg 172 | for key, val in pkg_dict.items(): 173 | comp_val = expected_dicts[index].get(key, None) 174 | assert not (val and not comp_val), 'Package \'%s\', Key \'%s\' with value \'%s\' appeared.' % (pkg_dict['name'], key, val) 175 | 176 | index += 1 177 | return reader.get_log() 178 | 179 | # TODO: (rgrp: 2010-11-16) 180 | # why is not in the ckan/lib/spreadsheet_importer.pkg_xl_dict_to_fs_dict(cls, # pkg_xl_dict, logger=None)? 181 | # furthermore why is that not in a core module (or forms) rather than there ... 182 | def pkg_to_fs_dict(pkg): 183 | '''Convert a Package object to a dictionary suitable for fieldset data. 184 | e.g. 
{'name':'annakarenina', 'resources':{'url':'anna.com'}}''' 185 | dict_ = pkg.as_dict() 186 | for key, value in dict_.items(): 187 | if key in readonly_keys: 188 | del dict_[key] 189 | if key=='resources': 190 | dict_[key] = [res.as_dict(core_columns_only=True) for res in pkg.resources] 191 | elif isinstance(value, (list, tuple)): 192 | dict_[key] = ' '.join(value) 193 | return dict_ 194 | 195 | def pkg_xl_dict_to_fs_dict(pkg_xl_dict): 196 | return spreadsheet_importer.SpreadsheetPackageImporter.pkg_xl_dict_to_fs_dict(pkg_xl_dict) 197 | -------------------------------------------------------------------------------- /ckanext/importlib/spreadsheet_importer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import copy 3 | 4 | from sqlalchemy.util import OrderedDict 5 | 6 | import ckan.model as model 7 | from importer import * 8 | 9 | readonly_keys = ('id', 'revision_id', 10 | 'relationships', 11 | 'license', 12 | 'ratings_average', 'ratings_count', 13 | 'ckan_url', 14 | 'metadata_modified', 15 | 'metadata_created', 16 | 'notes_rendered') 17 | 18 | class SpreadsheetData(object): 19 | '''Represents a spreadsheet file which you can access row by row.''' 20 | def __init__(self, logger, filepath=None, buf=None): 21 | assert filepath or buf 22 | assert not (filepath and buf) 23 | self._logger = logger 24 | self._rows = [] 25 | 26 | def get_row(self, row_index): 27 | 'Returns a list of the cells in unicode format.' 28 | raise NotImplementedError 29 | 30 | def get_num_rows(self): 31 | 'Returns the number of rows in the sheet.' 32 | raise NotImplementedError 33 | 34 | def get_all_rows(self): 35 | 'A crude way to get all the rows at once.' 
36 | return [self.get_row(i) for i in range(self.get_num_rows())] 37 | 38 | 39 | class CsvData(SpreadsheetData): 40 | def __init__(self, logger, filepath=None, buf=None): 41 | super(CsvData, self).__init__(logger, filepath, buf) 42 | if 1: 43 | if filepath: 44 | csvfile = open(filepath) 45 | if not csvfile: 46 | raise ImportException('Could not open file \'%s\'.' % filepath) 47 | csv_snippet = csvfile.read(1024) 48 | elif buf: 49 | csvfile = buf.split('\n') 50 | if not csvfile: 51 | raise ImportException('Empty csv data.') 52 | csv_snippet = buf[:1024] 53 | try: 54 | dialect = csv.Sniffer().sniff(csv_snippet) 55 | dialect.doublequote = True # sniff doesn't seem to pick this up 56 | except csv.Error, inst: 57 | dialect = None 58 | if filepath: 59 | csvfile.seek(0) 60 | try: 61 | reader = csv.reader(csvfile, dialect) 62 | except TypeError, inst: 63 | raise ImportException('CSV file read error: %s' % inst) 64 | 65 | try: 66 | for line in reader: 67 | self._rows.append(line) 68 | except csv.Error, inst: 69 | raise ImportException('CSV file corrupt: %s' % inst) 70 | self._num_rows = len(self._rows) 71 | if self._num_rows < 2: 72 | raise ImportException('Not enough rows') 73 | 74 | def get_num_sheets(self): 75 | return 1 76 | 77 | def get_row(self, row_index): 78 | row = self._rows[row_index] 79 | return [cell.decode('utf8') for cell in row] 80 | 81 | def get_num_rows(self): 82 | return self._num_rows 83 | 84 | 85 | class XlData(SpreadsheetData): 86 | '''Spreadsheet data in Excel format. 87 | NB Cells with no value return None rather than u''. 88 | @param sheet_index - if None, warn if more than 1 sheet in workbook. 
89 | ''' 90 | def __init__(self, logger, filepath=None, buf=None, sheet_index=None): 91 | super(XlData, self).__init__(logger, filepath, buf) 92 | import xlrd 93 | 94 | try: 95 | if filepath: 96 | self._book = xlrd.open_workbook(filepath) 97 | elif buf: 98 | self._book = xlrd.open_workbook(file_contents=buf) 99 | except xlrd.XLRDError, e: 100 | raise ImportException('Could not open workbook: %r' % e) 101 | 102 | if sheet_index == None: 103 | if self.get_num_sheets() != 1: 104 | logger.log.append('Warning: Just importing from sheet %r' % self._book.sheet_by_index(0).name) 105 | sheet_index = 0 106 | self.sheet = self._book.sheet_by_index(sheet_index) 107 | 108 | def get_num_sheets(self): 109 | return self._book.nsheets 110 | 111 | def get_sheet_names(self): 112 | return self._book.sheet_names() 113 | 114 | def get_data_by_sheet(self): 115 | data_list = [] 116 | for sheet_index in range(self.get_num_sheets()): 117 | data = copy.deepcopy(self) 118 | data.sheet = self._book.sheet_by_index(sheet_index) 119 | data_list.append(data) 120 | return data_list 121 | 122 | def get_row(self, row_index): 123 | import xlrd 124 | row = self.sheet.row(row_index) 125 | row_values = [] 126 | for cell in row: 127 | value = None 128 | if cell.ctype == xlrd.XL_CELL_TEXT: 129 | value = cell.value 130 | elif cell.ctype == xlrd.XL_CELL_NUMBER: 131 | if cell.value == int(cell.value): 132 | value = int(cell.value) 133 | else: 134 | value = cell.value 135 | elif cell.ctype == xlrd.XL_CELL_DATE: 136 | date_tuple = xlrd.xldate_as_tuple(cell.value, self._book.datemode) 137 | value = datetime.date(*date_tuple[:3]) 138 | elif cell.ctype == xlrd.XL_CELL_EMPTY: 139 | value = None 140 | else: 141 | raise ImportException, 'Unknown cell type: %s' % cell.ctype 142 | row_values.append(value) 143 | return row_values 144 | 145 | def get_num_rows(self): 146 | return self.sheet.nrows 147 | 148 | 149 | class SpreadsheetDataRecords(DataRecords): 150 | '''Takes SpreadsheetData and converts it its titles and 151 
| data records. Handles title rows and filters out rows of rubbish. 152 | ''' 153 | def __init__(self, spreadsheet_data, essential_title): 154 | assert isinstance(spreadsheet_data, SpreadsheetData), spreadsheet_data 155 | self._data = spreadsheet_data 156 | # find titles row 157 | self.titles, last_titles_row_index = self.find_titles(essential_title) 158 | self._first_record_row = self.find_first_record_row(last_titles_row_index + 1) 159 | 160 | def find_titles(self, essential_title): 161 | row_index = 0 162 | titles = [] 163 | essential_title_lower = essential_title.lower() 164 | while True: 165 | if row_index >= self._data.get_num_rows(): 166 | raise ImportException('Could not find title row') 167 | row = self._data.get_row(row_index) 168 | if essential_title in row or essential_title_lower in row: 169 | for row_val in row: 170 | titles.append(row_val.strip() if isinstance(row_val, basestring) else None) 171 | return (titles, row_index) 172 | row_index += 1 173 | 174 | def find_first_record_row(self, row_index_to_start_looking): 175 | row_index = row_index_to_start_looking 176 | while True: 177 | if row_index >= self._data.get_num_rows(): 178 | raise ImportException('Could not find first record row') 179 | row = self._data.get_row(row_index) 180 | if not (u'<< Datasets Displayed Below' in row or\ 181 | row[:5] == [None, None, None, None, None] or\ 182 | row[:5] == ['', '', '', '', '']\ 183 | ): 184 | return row_index 185 | row_index += 1 186 | 187 | @property 188 | def records(self): 189 | '''Returns each record as a dict.''' 190 | for row_index in range(self._first_record_row, self._data.get_num_rows()): 191 | row = self._data.get_row(row_index) 192 | row_has_content = False 193 | for cell in row: 194 | if cell: 195 | row_has_content = True 196 | break 197 | if row_has_content: 198 | record_dict = OrderedDict(zip(self.titles, row)) 199 | if record_dict.has_key(None): 200 | del record_dict[None] 201 | yield record_dict 202 | 203 | 204 | class 
SpreadsheetPackageImporter(PackageImporter): 205 | '''From a filepath of an Excel or csv file, extracts package 206 | dictionaries.''' 207 | def __init__(self, record_params=None, record_class=SpreadsheetDataRecords, **kwargs): 208 | self._record_params = record_params if record_params != None else ['Title'] 209 | self._record_class = record_class 210 | super(SpreadsheetPackageImporter, self).__init__(**kwargs) 211 | 212 | def import_into_package_records(self): 213 | try: 214 | package_data = CsvData(self.log, filepath=self._filepath, 215 | buf=self._buf) 216 | except ImportException: 217 | package_data = XlData(self.log, filepath=self._filepath, 218 | buf=self._buf, sheet_index=0) 219 | if package_data.get_num_sheets() > 1: 220 | package_data = [XlData(self.log, filepath=self._filepath, 221 | buf=self._buf, sheet_index=i) for i in range(package_data.get_num_sheets())] 222 | self._package_data_records = MultipleSpreadsheetDataRecords( 223 | data_list=package_data, 224 | record_params=self._record_params, 225 | record_class=self._record_class) 226 | 227 | def record_2_package(self, row_dict): 228 | pkg_dict = self.pkg_xl_dict_to_fs_dict(row_dict, self.log) 229 | return pkg_dict 230 | 231 | @classmethod 232 | def pkg_xl_dict_to_fs_dict(cls, pkg_xl_dict, logger=None): 233 | '''Convert a Package represented in an Excel-type dictionary to a 234 | dictionary suitable for fieldset data. 
235 | Takes Excel-type dict: 236 | {'name':'wikipedia', 237 | 'resource-0-url':'http://static.wikipedia.org/'} 238 | Returns Fieldset-type dict: 239 | {'name':'wikipedia', 240 | 'resources':[{'url':'http://static.wikipedia.org/'}]} 241 | ''' 242 | import ckan.forms 243 | standard_fields = model.Package.get_fields() 244 | 245 | pkg_fs_dict = OrderedDict() 246 | for title, cell in pkg_xl_dict.items(): 247 | if cell: 248 | if title in standard_fields: 249 | pkg_fs_dict[title] = cell 250 | elif title == 'license': 251 | license_id = cls.license_2_license_id(cell) 252 | if license: 253 | pkg_fs_dict['license_id'] = license_id 254 | else: 255 | logger('Warning: No license name matches \'%s\'. Ignoring license.' % cell) 256 | elif title.startswith('resource-'): 257 | match = re.match('resource-(\d+)-(\w+)', title) 258 | if match: 259 | res_index, field = match.groups() 260 | res_index = int(res_index) 261 | field = str(field) 262 | if not pkg_fs_dict.has_key('resources'): 263 | pkg_fs_dict['resources'] = [] 264 | resources = pkg_fs_dict['resources'] 265 | num_new_resources = 1 + res_index - len(resources) 266 | for i in range(num_new_resources): 267 | blank_dict = OrderedDict() 268 | for blank_field in model.Resource.get_columns(): 269 | blank_dict[blank_field] = u'' 270 | pkg_fs_dict['resources'].append(blank_dict) 271 | 272 | pkg_fs_dict['resources'][res_index][field] = cell 273 | else: 274 | logger('Warning: Could not understand resource title \'%s\'. 
Ignoring value: %s' % (title, cell)) 275 | elif title.startswith('relationships'): 276 | # TODO 277 | pass 278 | elif title == 'download_url': 279 | # deprecated - only in there for compatibility 280 | pass 281 | elif title in readonly_keys: 282 | pass 283 | else: 284 | if not pkg_fs_dict.has_key('extras'): 285 | pkg_fs_dict['extras'] = {} 286 | pkg_fs_dict['extras'][title] = cell 287 | return pkg_fs_dict 288 | 289 | 290 | class MultipleSpreadsheetDataRecords(DataRecords): 291 | '''Takes several SpreadsheetData objects and returns records for all 292 | of them combined. 293 | ''' 294 | def __init__(self, data_list, record_params, record_class=SpreadsheetDataRecords): 295 | self.records_list = [] 296 | if not isinstance(data_list, (list, tuple)): 297 | data_list = [data_list] 298 | for data in data_list: 299 | self.records_list.append(record_class(data, *record_params)) 300 | 301 | @property 302 | def records(self): 303 | for spreadsheet_records in self.records_list: 304 | for spreadsheet_record in spreadsheet_records.records: 305 | yield spreadsheet_record 306 | 307 | 308 | -------------------------------------------------------------------------------- /ckanext/importlib/loader.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Takes a package dictionary and loads into CKAN via the API. 3 | Checks to see if it already exists by name and preferably a unique field in 4 | the extras too. 5 | Uses ckanclient. 6 | ''' 7 | import re 8 | import copy 9 | from traceback import format_exc 10 | from pprint import pformat 11 | import itertools 12 | 13 | from ckanclient import CkanApiError, CkanApiNotAuthorizedError 14 | 15 | PACKAGE_NAME_MAX_LENGTH = 100 # this should match with ckan/model/package.py 16 | # but we avoid requiring ckan in this loader. 
17 | 18 | ACTIVE = 'active' # should match ckan.model.ACTIVE 19 | 20 | log = __import__("logging").getLogger(__name__) 21 | 22 | class LoaderError(Exception): 23 | pass 24 | 25 | class PackageLoader(object): 26 | def __init__(self, ckanclient, stats=None): 27 | ''' 28 | Loader for packages into a CKAN server. Takes package dictionaries 29 | and loads them using the ckanclient. Can also add packages to a 30 | specified group. 31 | 32 | It checks to see if a package of the same name is already on the 33 | CKAN server and if so, updates it with the new info. Create a subclass 34 | implementing _find_package, which determines how an existing package 35 | is discovered. 36 | 37 | @param ckanclient - ckanclient object, which contains the 38 | connection to CKAN server 39 | ''' 40 | # Note: we pass in the ckanclient (rather than deriving from it), so 41 | # that we can choose to pass a test client instead of a real one. 42 | self.ckanclient = ckanclient 43 | self._stats = stats 44 | 45 | def load_package(self, pkg_dict): 46 | ''' 47 | May raise LoaderError or CkanApiNotAuthorizedError (which implies API 48 | key is wrong, so stop). 49 | ''' 50 | 51 | log.info('..Loading "%s"' % pkg_dict['name']) 52 | 53 | # see if the package is already there 54 | existing_pkg_name, existing_pkg = self._find_package(pkg_dict) 55 | log.debug('Check for dataset already existing: %s', existing_pkg_name) 56 | 57 | # if creating a new package, check the name is available 58 | if not existing_pkg_name: 59 | self._ensure_pkg_name_is_available(pkg_dict) 60 | 61 | # write package 62 | # (May raise LoaderError or CkanApiNotAuthorizedError) 63 | pkg_dict = self._write_package(pkg_dict, existing_pkg_name, existing_pkg) 64 | pkg_dict = self.ckanclient.last_message 65 | 66 | log.debug('Package written: %s %r', pkg_dict['name'], pkg_dict) 67 | return pkg_dict 68 | 69 | def load_packages(self, pkg_dicts): 70 | '''Loads multiple packages. 71 | 72 | @return results and resulting package names/ids. 
73 | ''' 74 | num_errors = 0 75 | num_loaded = 0 76 | pkg_ids = [] 77 | pkg_names = [] 78 | for pkg_dict in pkg_dicts: 79 | try: 80 | pkg_dict = self.load_package(pkg_dict) 81 | except CkanApiNotAuthorizedError, e: 82 | log.error('Authorization Error (fatal) loading dict "%s":\n%s' % (pkg_dict['name'], format_exc())) 83 | num_errors = 'fatal' 84 | self._add_stat('Authorization Error %s' % e, pkg_dict) 85 | break 86 | except LoaderError, e: 87 | log.error('Error loading dict "%s":\n%s' % (pkg_dict['name'], format_exc())) 88 | num_errors += 1 89 | self._add_stat('Error %s' % e, pkg_dict) 90 | else: 91 | pkg_ids.append(pkg_dict['id']) 92 | pkg_names.append(pkg_dict['name']) 93 | num_loaded += 1 94 | return {'pkg_names':pkg_names, 95 | 'pkg_ids':pkg_ids, 96 | 'num_loaded':num_loaded, 97 | 'num_errors':num_errors} 98 | 99 | def _add_stat(self, message, pkg_dict): 100 | if not self._stats: 101 | return 102 | pub_date = pkg_dict.get('extras', {}).get('date_released') 103 | item_id = '%s (%s)' % (pkg_dict['title'], pub_date) 104 | return self._stats.add(message, item_id) 105 | 106 | def _find_package(self, pkg_dict): 107 | raise NotImplemented 108 | 109 | def _write_package(self, pkg_dict, existing_pkg_name, existing_pkg=None): 110 | ''' 111 | Writes a package (pkg_dict). If there is an existing package to 112 | be changed, then supply existing_pkg_name. If the caller has already 113 | got the existing package then pass it in, to save getting it twice. 114 | 115 | @return pkg_dict - the package as it was written 116 | 117 | May raise LoaderError or CkanApiNotAuthorizedError (which implies API 118 | key is wrong, so stop). 
119 | ''' 120 | if existing_pkg_name: 121 | if not existing_pkg: 122 | existing_pkg = self._get_package(existing_pkg_name) 123 | if existing_pkg_name != pkg_dict["name"]: 124 | pkg_dict = pkg_dict.copy() 125 | pkg_dict["name"] = existing_pkg_name 126 | if self._pkg_has_changed(existing_pkg, pkg_dict): 127 | log.info('..Updating existing package') 128 | try: 129 | self.ckanclient.package_entity_put(pkg_dict) 130 | except CkanApiError: 131 | raise LoaderError( 132 | 'Error (%s) editing package over API: %s' % \ 133 | (self.ckanclient.last_status, 134 | self.ckanclient.last_message)) 135 | pkg_dict = self.ckanclient.last_message 136 | self._add_stat('Updated package', pkg_dict) 137 | else: 138 | log.info('..No change') 139 | self._add_stat('No change', pkg_dict) 140 | else: 141 | log.info('..Creating package') 142 | try: 143 | self.ckanclient.package_register_post(pkg_dict) 144 | except CkanApiNotAuthorizedError: 145 | raise 146 | except CkanApiError: 147 | raise LoaderError( 148 | 'Error (%s) creating package over API: %s' % \ 149 | (self.ckanclient.last_status, 150 | self.ckanclient.last_message)) 151 | pkg_dict = self.ckanclient.last_message 152 | self._add_stat('Created package', pkg_dict) 153 | return pkg_dict 154 | 155 | def add_pkg_to_group(self, pkg_name, group_name): 156 | return self.add_pkgs_to_group([pkg_name], group_name) 157 | 158 | def add_pkgs_to_group(self, pkg_names, group_name): 159 | for pkg_name in pkg_names: 160 | assert not self.ckanclient.is_id(pkg_name), pkg_name 161 | assert not self.ckanclient.is_id(group_name), group_name 162 | try: 163 | group_dict = self.ckanclient.group_entity_get(group_name) 164 | except CkanApiError, e: 165 | if self.ckanclient.last_status == 404: 166 | raise LoaderError('Group named %r does not exist' % group_name) 167 | else: 168 | raise LoaderError('Unexpected status (%s) checking for group name %r: %r') % (self.ckanclient.last_status, group_name, group_dict) 169 | group_dict['packages'] = (group_dict['packages'] or 
[]) + pkg_names 170 | try: 171 | group_dict = self.ckanclient.group_entity_put(group_dict) 172 | except CkanApiError, e: 173 | raise LoaderError('Unexpected status %s writing to group \'%s\': %r' % (self.ckanclient.last_status, group_dict, e.args)) 174 | 175 | def _get_package(self, pkg_name): 176 | try: 177 | pkg = self.ckanclient.package_entity_get(pkg_name) 178 | except CkanApiError, e: 179 | if self.ckanclient.last_status == 404: 180 | pkg = None 181 | else: 182 | raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (self.ckanclient.last_status, pkg_name, e.args)) 183 | return pkg 184 | 185 | def _find_package_by_fields(self, field_keys, pkg_dict): 186 | '''Looks for a package that has matching keys to the pkg supplied. 187 | Requires a unique match or it raises LoaderError. 188 | @return (pkg_name, pkg) - pkg_name - the name of the matching 189 | package or None if there is none. 190 | pkg - the matching package dict if it 191 | happens to have been requested, 192 | otherwise None 193 | ''' 194 | if field_keys == ['name']: 195 | pkg = self._get_package(pkg_dict['name']) 196 | pkg_name = pkg_dict['name'] if pkg else None 197 | else: 198 | search_options = self._get_search_options(field_keys, pkg_dict) 199 | pkg_name, pkg = self._find_package_by_options(search_options) 200 | 201 | if not pkg_name: 202 | # Just in case search is not being well indexed, look for the 203 | # package under its name as well 204 | try_pkg_name = pkg_dict['name'] 205 | pkg = self._get_package(try_pkg_name) 206 | while pkg: 207 | if self._pkg_matches_search_options(pkg, search_options): 208 | log.warn('Search failed to find package %r with ref %r, ' 209 | 'but luckily the name is what was expected so loader ' 210 | 'found it anyway.' 
% (pkg_dict['name'], search_options)) 211 | pkg_name = try_pkg_name 212 | break 213 | try_pkg_name += '_' 214 | pkg = self._get_package(try_pkg_name) 215 | else: 216 | pkg_name = pkg = None 217 | 218 | log.info('..Search for existing package found: %r with filter: %r', 219 | pkg_name, search_options) 220 | return pkg_name, pkg 221 | 222 | def _get_search_options(self, field_keys, pkg_dict): 223 | search_options = {} 224 | has_a_value = False 225 | for field_key in field_keys: 226 | field_value = pkg_dict.get(field_key) or (pkg_dict['extras'].get(field_key) if pkg_dict.has_key('extras') else None) 227 | ## else: 228 | ## # This is how solr searches for blank values 229 | ## # http://stackoverflow.com/questions/4238609/how-to-query-solr-for-empty-fields 230 | ## #search_options['-%s' % field_key] = u'["" TO *]' 231 | ## search_options['q'] = u'-%s:["" TO *]' % field_key 232 | if field_value: 233 | if isinstance(field_value, list): 234 | for value in field_value: 235 | search_options[field_key] = value or u'' 236 | else: 237 | search_options[field_key] = field_value or u'' 238 | has_a_value = True 239 | if not has_a_value: 240 | raise LoaderError('Package %r has blank values for identifying fields: %r' % (pkg_dict['name'], field_keys)) 241 | return search_options 242 | 243 | def _package_search(self, search_options): 244 | try: 245 | res = self.ckanclient.package_search(q='', search_options=search_options) 246 | except CkanApiError, e: 247 | raise LoaderError('Search request failed (status %s): %r' % (self.ckanclient.last_status, e.args)) 248 | return res 249 | 250 | def _find_package_by_options(self, search_options): 251 | '''The search_options specify values a package must have and this 252 | returns the package. 253 | 254 | If more than one package matching then it logs an error but returns 255 | the first one as we prefer to save the data to one, rather than 256 | lose it. 257 | 258 | If none match then it returns (None, None). 
259 | 260 | A successful search returns (pkg_name, pkg) where pkg may be None, 261 | or returned filled, as a convenience. 262 | 263 | ''' 264 | search = self._package_search(search_options) 265 | # Search doesn't do exact match (e.g. sql search searches *in* 266 | # a field), so check matches thoroughly. 267 | # Also check the package is active 268 | exactly_matching_pkg_names = [] 269 | pkg = None 270 | for pkg_ref in search['results']: 271 | pkg = self._get_package(pkg_ref) 272 | if pkg['state'] == ACTIVE and \ 273 | self._pkg_matches_search_options(pkg, search_options): 274 | exactly_matching_pkg_names.append(pkg["name"]) 275 | if len(exactly_matching_pkg_names) > 1: 276 | log.error('More than one record matches the search options %r: %r (so picking the first one)' % (search_options, exactly_matching_pkg_names)) 277 | pkg_name = exactly_matching_pkg_names[0] 278 | elif len(exactly_matching_pkg_names) == 1: 279 | pkg_name = exactly_matching_pkg_names[0] 280 | else: 281 | pkg_name = None 282 | # Only carry through value for pkg if it was the last one and only 283 | # one fetched 284 | if not(search['count'] == 1 and pkg and pkg['name'] == pkg_name): 285 | pkg = None 286 | return pkg_name, pkg 287 | 288 | def _ensure_pkg_name_is_available(self, pkg_dict): 289 | '''Checks the CKAN db to see if the name for this package has been 290 | already taken, and if so, changes the pkg_dict to have another 291 | name that is free. 
292 | @return nothing - changes the name in the pkg_dict itself 293 | ''' 294 | preferred_name = pkg_dict['name'] 295 | clashing_pkg = self._get_package(pkg_dict['name']) 296 | original_clashing_pkg = clashing_pkg 297 | while clashing_pkg: 298 | if len(pkg_dict['name']) >= PACKAGE_NAME_MAX_LENGTH: 299 | new_name = pkg_dict['name'].rstrip('_')[:-1] 300 | new_name = new_name.ljust(PACKAGE_NAME_MAX_LENGTH, '_') 301 | pkg_dict['name'] = new_name 302 | else: 303 | pkg_dict['name'] += '_' 304 | clashing_pkg = self._get_package(pkg_dict['name']) 305 | 306 | if pkg_dict['name'] != preferred_name: 307 | log.warn('Name %r already exists so new package renamed ' 308 | 'to %r.' % (preferred_name, pkg_dict['name'])) 309 | else: 310 | log.debug('Name %r available', pkg_dict['name']) 311 | 312 | def _pkg_has_changed(self, existing_value, value): 313 | changed = False 314 | if isinstance(value, dict): 315 | for key, sub_value in value.items(): 316 | if key in ('owner_org', 'import_source'): 317 | # loader doesn't setup groups 318 | # import_source changing alone doesn't require an update 319 | continue 320 | existing_sub_value = existing_value.get(key) 321 | if self._pkg_has_changed(existing_sub_value, sub_value): 322 | changed = True 323 | break 324 | elif isinstance(value, list) and \ 325 | isinstance(existing_value, list): 326 | if len(existing_value) != len(value): 327 | changed = True 328 | else: 329 | for i, sub_value in enumerate(value): 330 | if self._pkg_has_changed(existing_value[i], sub_value): 331 | changed = True 332 | break 333 | elif (existing_value or None) != (value or None): 334 | changed = True 335 | 336 | if changed: 337 | return True 338 | return False 339 | 340 | def lower(self, value): 341 | '''If given a string, returns lowercase version of it. 342 | Blank strings and None values are standardized on None. 343 | 344 | This is allowed for matching, because SOLR search returns values for 345 | either case. 
346 | ''' 347 | if isinstance(value, basestring): 348 | value = value.lower().strip() 349 | if not value: 350 | return None 351 | return value 352 | 353 | def _pkg_matches_search_options(self, pkg_dict, search_options): 354 | '''Returns True if pkg_dict matches all of the search_options.''' 355 | matches = True 356 | for key, value in search_options.items(): 357 | pkg_dict_value = pkg_dict.get(key) or pkg_dict['extras'].get(key) 358 | 359 | if isinstance(pkg_dict_value, list): 360 | # e.g. must have the tag or be in that group 361 | if value and self.lower(value) not in \ 362 | [self.lower(val) for val in pkg_dict_value]: 363 | matches = False 364 | log.info('Match failed %s on field %s=%r but should have included %r', 365 | pkg_dict['name'], key, pkg_dict_value, value) 366 | break 367 | else: 368 | if self.lower(pkg_dict_value) != self.lower(value): 369 | matches = False 370 | log.info('Match failed %s on field %s=%r but should be %r', 371 | pkg_dict['name'], key, pkg_dict_value, value) 372 | break 373 | return matches 374 | 375 | class ReplaceByNameLoader(PackageLoader): 376 | '''Loader finds a package based on its name. 377 | Load replaces the package with the supplied pkg_dict.''' 378 | 379 | def _find_package(self, pkg_dict): 380 | find_pkg_by_keys = ['name'] 381 | return self._find_package_by_fields(find_pkg_by_keys, pkg_dict) 382 | 383 | class ReplaceByExtraFieldLoader(PackageLoader): 384 | '''Loader finds a package based on a unique id in an extra field. 
385 | Loader replaces the package with the supplied pkg_dict.''' 386 | def __init__(self, ckanclient, package_id_extra_key, stats=None): 387 | super(ReplaceByExtraFieldLoader, self).__init__(ckanclient, stats) 388 | assert package_id_extra_key 389 | self.package_id_extra_key = package_id_extra_key 390 | 391 | def _find_package(self, pkg_dict): 392 | find_pkg_by_keys = [self.package_id_extra_key] 393 | return self._find_package_by_fields(find_pkg_by_keys, pkg_dict) 394 | 395 | class ResourceSeriesLoader(PackageLoader): 396 | '''Loader finds package based on a specified field and checks to see 397 | if most fields (listed in field_keys_to_expect_invariant) match the 398 | pkg_dict. Loader then inserts the resources in the pkg_dict into 399 | the package and updates any fields that have changed (e.g. last_updated). 400 | It checks to see if the particular resource is already in the package 401 | by a custom resource ID which is contained in the description field, 402 | as a word containing the given prefix. 403 | @param synonyms - a list of tuples describing values of a field that 404 | should be regarded as equal, for when searching for 405 | an existing package. 406 | e.g. {'department': [('DfE', 'DCSF'), ('DCLG', 'CLG')]} 407 | means resources for the department DfE would be inserted 408 | into a package which still had the old deparment name 409 | of DCSF (and the same for CLG and GCLG). 
410 | ''' 411 | def __init__(self, ckanclient, 412 | field_keys_to_find_pkg_by, 413 | field_keys_to_expect_invariant=None, 414 | synonyms=None, 415 | extras_to_not_overwrite=None, 416 | stats=None): 417 | super(ResourceSeriesLoader, self).__init__(ckanclient, stats=stats) 418 | assert field_keys_to_find_pkg_by 419 | assert isinstance(field_keys_to_find_pkg_by, (list, tuple)) 420 | self.field_keys_to_find_pkg_by = field_keys_to_find_pkg_by 421 | self.field_keys_to_expect_invariant = field_keys_to_expect_invariant \ 422 | or [] 423 | self.synonyms = synonyms or {} 424 | self.extras_to_not_overwrite = extras_to_not_overwrite or [] 425 | 426 | def _find_package(self, pkg_dict): 427 | # take a copy of the keys since the find routine may change them 428 | find_pkg_by_keys = self.field_keys_to_find_pkg_by[:] 429 | return self._find_package_by_fields(find_pkg_by_keys, pkg_dict) 430 | 431 | def _get_search_options(self, field_keys, pkg_dict): 432 | search_options = super(ResourceSeriesLoader, self)._get_search_options(field_keys, pkg_dict) 433 | # now take account of the synonyms to search for 434 | search_options_list = [search_options] 435 | for field_key, field_value in search_options.items(): 436 | if field_key in self.synonyms: 437 | for synonym_list in self.synonyms[field_key]: 438 | if field_value in synonym_list: 439 | alt_field_values = list(synonym_list) 440 | alt_field_values.remove(field_value) 441 | for opts in search_options_list[:]: 442 | for alt_field_value in alt_field_values: 443 | alt_opts = opts.copy() 444 | alt_opts[field_key] = alt_field_value 445 | search_options_list.append(alt_opts) 446 | return search_options_list 447 | 448 | def _package_search(self, search_options_list): 449 | try: 450 | result_count = 0 451 | result_generators = [] 452 | for search_options in search_options_list: 453 | res = self.ckanclient.package_search(q='', search_options=search_options) 454 | result_count += res['count'] 455 | result_generators.append(res['results']) 456 | 
except CkanApiError, e: 457 | raise LoaderError('Search request failed (status %s): %r' % (self.ckanclient.last_status, e.args)) 458 | return {'count': result_count, 459 | 'results': itertools.chain(*result_generators)} 460 | 461 | def _pkg_matches_search_options(self, pkg_dict, search_options_list): 462 | '''Returns True if pkg_dict matches any of the search_options 463 | listed.''' 464 | matches = False 465 | for search_options in search_options_list: 466 | if super(ResourceSeriesLoader, self)._pkg_matches_search_options(pkg_dict, search_options): 467 | matches = True 468 | break 469 | return matches 470 | 471 | def _write_package(self, pkg_dict, existing_pkg_name, existing_pkg=None): 472 | ''' 473 | Writes a package (pkg_dict). If there is an existing package to 474 | be changed, then supply existing_pkg_name. If the caller has already 475 | got the existing package then pass it in, to save getting it twice. 476 | 477 | May raise LoaderError or CkanApiNotAuthorizedError (which implies API 478 | key is wrong, so stop). 
479 | ''' 480 | if existing_pkg_name: 481 | if not existing_pkg: 482 | existing_pkg = self._get_package(existing_pkg_name) 483 | try: 484 | pkg_dict = self._merge_resources(existing_pkg, pkg_dict) 485 | except Exception, e: 486 | raise LoaderError('Could not merge resources.\n' 487 | ' existing_pkg: %r\n' 488 | ' pkg_dict: %r\n' 489 | ' Exception: %s'% (existing_pkg, pkg_dict, e)) 490 | if self.extras_to_not_overwrite and \ 491 | self.extras_to_not_overwrite == ['theme-primary', 'themes-secondary']: 492 | if existing_pkg and existing_pkg['extras'].get('theme-primary'): 493 | pkg_dict['extras']['theme-primary'] = existing_pkg['extras']['theme-primary'] 494 | pkg_dict['extras']['themes-secondary'] = existing_pkg['extras'].get('themes-secondary') 495 | super(ResourceSeriesLoader, self)._write_package(pkg_dict, 496 | existing_pkg_name, 497 | existing_pkg) 498 | 499 | def _merge_resources(self, existing_pkg, pkg): 500 | '''Takes an existing_pkg and merges in resources from the pkg. 501 | ''' 502 | log.info("..Merging resources into %s" % existing_pkg["name"]) 503 | log.debug("....Existing resources:\n%s" % pformat(existing_pkg["resources"])) 504 | log.debug("....New resources:\n%s" % pformat(pkg["resources"])) 505 | 506 | # check invariant fields aren't different 507 | warnings = [] 508 | for key in self.field_keys_to_expect_invariant: 509 | if key in existing_pkg or key in pkg: 510 | if (existing_pkg.get(key) or None) != (pkg.get(key) or None): 511 | warnings.append('%s: %r -> %r' % (key, existing_pkg.get(key), pkg.get(key))) 512 | else: 513 | if (existing_pkg['extras'].get(key) or None) != (pkg['extras'].get(key) or None): 514 | warnings.append('%s: %r -> %r' % (key, existing_pkg['extras'].get(key), pkg['extras'].get(key))) 515 | 516 | if warnings: 517 | log.warn('Warning: uploading package \'%s\' and surprised to see ' 518 | 'changes in these values:\n%s' % (existing_pkg['name'], 519 | '; '.join(warnings))) 520 | 521 | # copy over all fields but use the existing 
resources 522 | merged_dict = pkg.copy() 523 | merged_dict['resources'] = copy.deepcopy(existing_pkg['resources']) 524 | 525 | # merge resources 526 | for pkg_res in pkg['resources']: 527 | # look for resource ID already being there 528 | pkg_res_id = self._get_resource_id(pkg_res) 529 | for i, existing_res in enumerate(merged_dict['resources']): 530 | res_id = self._get_resource_id(existing_res) 531 | if res_id == pkg_res_id: 532 | # edit existing resource 533 | merged_dict['resources'][i] = pkg_res 534 | break 535 | else: 536 | # insert new res 537 | merged_dict['resources'].append(pkg_res) 538 | 539 | log.debug("....Merged resources:\n%s" % pformat(merged_dict["resources"])) 540 | 541 | return merged_dict 542 | 543 | def _get_resource_id(self, res): 544 | raise NotImplementedError 545 | -------------------------------------------------------------------------------- /ckanext/importlib/tests/test_loader.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import time 3 | 4 | from sqlalchemy.util import OrderedDict 5 | from nose.tools import assert_equal 6 | 7 | from ckan import model 8 | from ckan.lib.create_test_data import CreateTestData 9 | from ckan.tests import * 10 | from ckan.tests import CreateTestData, TestSearchIndexer, is_search_supported 11 | from ckan.tests.wsgi_ckanclient import WsgiCkanClient 12 | from ckanclient import CkanClient 13 | from ckanext.importlib.loader import ReplaceByNameLoader, ReplaceByExtraFieldLoader, ResourceSeriesLoader, LoaderError 14 | 15 | USER = u'annafan' 16 | 17 | # Set to true for quicker tests using wsgi_ckanclient 18 | # otherwise it uses ckanclient 19 | # (some tests still fail with ckanclient currently) 20 | WSGI_CLIENT = True 21 | 22 | #TODO: test log statements 23 | 24 | def count_pkgs(): 25 | return model.Session.query(model.Package).count() 26 | 27 | class TestLoaderBase(TestController): 28 | @classmethod 29 | def setup_class(self): 30 | if 
hasattr(super(TestLoaderBase, self), 'setup_class'): 31 | super(TestLoaderBase, self).setup_class() 32 | CreateTestData.create_arbitrary([], extra_user_names=[USER]) 33 | user = model.User.by_name(USER) 34 | assert user 35 | if WSGI_CLIENT: 36 | self.testclient = WsgiCkanClient(self.app, api_key=user.apikey) 37 | else: 38 | self.sub_proc = self._start_ckan_server('test.ini') 39 | self.testclient = CkanClient(base_location='http://localhost:5000/api', 40 | api_key=user.apikey) 41 | self._wait_for_url(url='http://localhost:5000/api') 42 | 43 | 44 | @classmethod 45 | def teardown_class(self): 46 | if hasattr(super(TestLoaderBase, self), 'teardown_class'): 47 | super(TestLoaderBase, self).teardown_class() 48 | if WSGI_CLIENT: 49 | model.Session.remove() 50 | model.repo.rebuild_db() 51 | else: 52 | try: 53 | self._stop_ckan_server(self.sub_proc) 54 | finally: 55 | model.repo.rebuild_db() 56 | 57 | def assert_equal_dicts(dict1, dict2, only_assert_these_keys=None): 58 | only_assert_these_keys = set(only_assert_these_keys) if only_assert_these_keys else set([]) 59 | dict1_keys = set(dict1.keys()) & only_assert_these_keys 60 | dict2_keys = set(dict2.keys()) & only_assert_these_keys 61 | key_diffs = dict1_keys ^ dict2_keys 62 | if key_diffs: 63 | print '%i keys not in both dicts.' % len(key_diffs) 64 | print 'Only in dict1: %r' % (dict1_keys - dict2_keys) 65 | print 'Only in dict2: %r' % (dict2_keys - dict1_keys) 66 | print '\nDict1: %r\nDict2: %r' % \ 67 | (dict1, dict2) 68 | raise AssertionError 69 | for key in dict1_keys: 70 | if dict1[key] != dict2[key]: 71 | print 'Value for key %r is different. 
%r != %r' % \ 72 | (key, dict1[key], dict2[key]) 73 | raise AssertionError 74 | 75 | class TestLoader(TestLoaderBase): 76 | @classmethod 77 | def setup_class(self): 78 | super(TestLoader, self).setup_class() 79 | self.loader = ReplaceByNameLoader(self.testclient) 80 | 81 | # teardown is in the base class 82 | 83 | def test_0_simple_load(self): 84 | pkg_dict = {'name':u'pkgname', 85 | 'title':u'Boris'} 86 | assert not model.Package.by_name(pkg_dict['name']) 87 | CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name']]) 88 | res_pkg_dict = self.loader.load_package(pkg_dict) 89 | assert res_pkg_dict 90 | pkg = model.Package.by_name(pkg_dict['name']) 91 | assert_equal_dicts(res_pkg_dict, pkg.as_dict(), 92 | only_assert_these_keys=('name', 'title')) 93 | assert pkg 94 | assert pkg.name == pkg_dict['name'] 95 | assert pkg.title == pkg_dict['title'] 96 | 97 | def test_1_load_several(self): 98 | num_pkgs = count_pkgs() 99 | pkg_dicts = [{'name':u'pkgname_a', 100 | 'title':u'BorisA'}, 101 | {'name':u'pkgname_b', 102 | 'title':u'BorisB'}, 103 | ] 104 | assert not model.Package.by_name(pkg_dicts[0]['name']) 105 | CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name'] for pkg_dict in pkg_dicts]) 106 | res = self.loader.load_packages(pkg_dicts) 107 | assert (res['num_loaded'], res['num_errors']) == (2, 0), \ 108 | (res['num_loaded'], res['num_errors']) 109 | assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs) 110 | for pkg_index, pkg_dict in enumerate(pkg_dicts): 111 | pkg_name = pkg_dict['name'] 112 | pkg = model.Package.by_name(pkg_name) 113 | assert pkg.id == res['pkg_ids'][pkg_index], \ 114 | '%s != %s' % (pkg.id, res['pkg_ids'][pkg_index]) 115 | 116 | def test_1_load_several_with_errors(self): 117 | num_pkgs = count_pkgs() 118 | pkg_dicts = [{'name':u'pkgnameA', # not allowed uppercase name 119 | 'title':u'BorisA'}, 120 | {'name':u'pkgnameB', 121 | 'title':u'BorisB'}, 122 | ] 123 | assert not model.Package.by_name(pkg_dicts[0]['name']) 124 | 
        CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name'] for pkg_dict in pkg_dicts])
        res = self.loader.load_packages(pkg_dicts)
        # both packages rejected: nothing loaded, two errors
        assert (res['num_loaded'], res['num_errors']) == (0, 2), \
               (res['num_loaded'], res['num_errors'])
        assert count_pkgs() == num_pkgs, (count_pkgs() - num_pkgs)
        assert res['pkg_ids'] == [], res['pkg_ids']

    def test_2_reload(self):
        # load the package once
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pkgname2',
                    'title':u'Boris'}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name']])
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package again
        pkg_dict = {'name':u'pkgname2',
                    'title':u'Boris Becker'}
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title'], pkg.title
        # replaced in place - package count unchanged
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)


class TestLoaderUsingUniqueFields(TestLoaderBase):
    # Exercises ReplaceByExtraFieldLoader, which identifies an existing
    # package by the 'ref' extra rather than by name.
    @classmethod
    def setup_class(self):
        self.tsi = TestSearchIndexer()
        super(TestLoaderUsingUniqueFields, self).setup_class()
        self.loader = ReplaceByExtraFieldLoader(self.testclient, 'ref')

    # teardown is in the base class

    def test_0_reload(self):
        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris',
                    'extras':{u'ref':'boris'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict])
        self.tsi.index()
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package with same name and ref
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris 2',
                    'extras':{u'ref':'boris'}}
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package with different name, same ref
        pkg_dict = {'name':u'pkgname0changed',
                    'title':u'Boris 3',
                    'extras':{u'ref':'boris'}}
        CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name']])

        self.loader.load_package(pkg_dict)
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)
        # for now we do not support renaming
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg is None, pkg
        pkg = model.Package.by_name(u'pkgname0')
        assert pkg
        assert pkg.title == pkg_dict['title']

        # load the package with same name, different ref - new package
        other_pkg_dict = pkg_dict
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris 4',
                    'extras':{u'ref':'boris-4'}}
        CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name']])
        self.loader.load_package(pkg_dict)
        # NOTE: load_package mutates pkg_dict['name'] in place when it has
        # to dodge a name clash - hence the trailing underscore here
        assert pkg_dict['name'] == 'pkgname0_'
        orig_pkg = model.Package.by_name(u'pkgname0')
        assert orig_pkg
        assert orig_pkg.title == u'Boris 3'
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)

    def test_1_avoid_long_name_clash(self):
        # load the package once
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'a'*99,
                    'title':u'99 char name',
                    'extras':{u'ref':'aaa'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name']])
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load a clashing package - name appended '_'
        orig_pkg = pkg_dict
        pkg_dict = {'name':orig_pkg['name'],
                    'title':u'bbb',
                    'extras':{u'ref':'bbb'}}
        self.loader.load_package(pkg_dict)
        clash_name = u'a'*99 + u'_'
        pkg = model.Package.by_name(clash_name)
        assert pkg
        assert pkg.title == pkg_dict['title'], pkg.title
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)

        # load another clashing package - name over 100 chars so shortened
        # and finishes '__'
        orig_pkg = pkg_dict
        pkg_dict = {'name':orig_pkg['name'],
                    'title':u'ccc',
                    'extras':{u'ref':'ccc'}}
        self.loader.load_package(pkg_dict)
        clash_name = u'a'*98 + u'__'
        assert pkg_dict['name'] == clash_name, (pkg_dict['name'], clash_name)
        pkg = model.Package.by_name(clash_name)
        assert pkg
        assert pkg.title == pkg_dict['title'], pkg.title
        assert count_pkgs() == num_pkgs + 3, (count_pkgs() - num_pkgs)


class TestLoaderNoSearch(TestLoaderBase):
    '''Cope as best as possible if search indexing is flakey.'''
    @classmethod
    def setup_class(self):
        '''NB, no search indexing started'''
        if not is_search_supported():
            raise SkipTest("Search not supported")
        super(TestLoaderNoSearch, self).setup_class()
        self.loader = ReplaceByExtraFieldLoader(self.testclient, 'ref')

    # teardown is in the base class

    def test_0_reload(self):
        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris',
                    'extras':{u'ref':'boris'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict])
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package with same name and ref
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris 2',
                    'extras':{u'ref':'boris'}}
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)
        # i.e. not tempted to create pkgname0_ alongside pkgname0


class TestLoaderGroups(TestLoaderBase):
    # Tests the loader's group-membership helpers against pre-created
    # packages (pkga/pkgb/pkgc) and groups (g1 contains pkga; g2, g3 empty).
    @classmethod
    def setup_class(self):
        super(TestLoaderGroups, self).setup_class()
        self.loader = ReplaceByNameLoader(self.testclient)

        assert count_pkgs() == 0, count_pkgs()
        pkg_dicts = [{'name':u'pkga'},
                     {'name':u'pkgb'},
                     {'name':u'pkgc'},
                     ]
        CreateTestData.create_arbitrary(pkg_dicts)
        group_dicts = [
            {'name':u'g1', 'packages':[u'pkga']},
            {'name':u'g2'},
            {'name':u'g3'},
            ]
        CreateTestData.create_groups(group_dicts, USER)
        self.pkgs = [model.Package.by_name(pkg_dict['name']) \
                     for pkg_dict in pkg_dicts]
        self.pkg_ids = [pkg.id for pkg in self.pkgs]

    # teardown is in the base class

    def test_0_add_to_empty_group(self):
        pkg_name = u'pkga'
        group_name = u'g2'
        pkg = model.Package.by_name(pkg_name)
        group = model.Group.by_name(group_name)
        assert group
        assert not group.packages, group.packages
        self.loader.add_pkg_to_group(pkg.name, group.name)
        # re-fetch: the API call invalidated the session objects
        group = model.Group.by_name(group_name)
        pkg = model.Package.by_name(pkg_name)
        assert group.packages == [pkg], group.packages

    def test_1_add_to_non_empty_group(self):
        pkg_name = u'pkgb'
        group_name = u'g1'
        pkg = model.Package.by_name(pkg_name)
        group = model.Group.by_name(group_name)
        assert group
        assert len(group.packages) == 1, group.packages
        self.loader.add_pkg_to_group(pkg.name, group.name)
        group = model.Group.by_name(group_name)
        pkg = model.Package.by_name(pkg_name)
        assert pkg in group.packages, group.packages
        assert len(group.packages) == 2, group.packages

    def test_2_add_multiple_packages(self):
        pkg_names = [u'pkgb', u'pkgc']
        group_name = u'g2'
        pkgs = [model.Package.by_name(pkg_name) for pkg_name in pkg_names]
        group = model.Group.by_name(group_name)
        assert group
        # starting count depends on whether test_0 ran first
        num_pkgs_at_start = len(group.packages)
        assert num_pkgs_at_start in (0, 1), group.packages
        self.loader.add_pkgs_to_group(pkg_names, group.name)
        group = model.Group.by_name(group_name)
        pkgs = [model.Package.by_name(pkg_name) for pkg_name in pkg_names]
        for pkg in pkgs:
            assert pkg in group.packages, group.packages
        assert len(group.packages) == num_pkgs_at_start + 2, group.packages

    def test_3_add_to_missing_group(self):
        pkg_names = [u'pkgb', u'pkgc']
        try:
            self.loader.add_pkgs_to_group(pkg_names, 'random_name')
        except LoaderError, e:
            assert e.args[0] == 'Group named \'random_name\' does not exist', e.args
        else:
            assert 0, 'Should have raise a LoaderError for the missing group'


class TestLoaderInsertingResources(TestLoaderBase):
    # Tests ResourceSeriesLoader: packages matched on title+department get
    # resources merged in rather than replaced.
    @classmethod
    def setup_class(self):
        self.tsi = TestSearchIndexer()
        super(TestLoaderInsertingResources, self).setup_class()
        self.loader = ResourceSeriesLoader(
            self.testclient,
            ['title', 'department'],
            'ons/id/',
            field_keys_to_expect_invariant=['country'])

    # teardown is in the base class

    def test_0_reload(self):
        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pollution',
                    'title':u'Pollution',
                    'extras':{u'department':'air',
                              u'country':'UK', #invariant
                              u'last_updated':'Monday', #variant
                              },
                    'resources':[{'url':'pollution.com/1',
                                  'description':'ons/id/1'}],
                    }
        bogus_dict = {'name':u'bogus',
                      'title':u'Pollution',
                      'extras':{u'department':'water',
                                u'country':'UK',
                                u'last_updated':'Monday',
                                },
                      'resources':[{'url':'pollution.com/2',
                                    'description':'ons/id/2'}],
                      }
        assert not model.Package.by_name(pkg_dict['name'])
        assert not model.Package.by_name(bogus_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict, bogus_dict])
        self.tsi.index()
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)
        assert len(pkg.resources) == 1, pkg.resources

        # load the same package: same title, department, updated resource
        pkg_dict = {'name':u'pollution',
                    'title':u'Pollution',
                    'extras':{u'department':'air',
                              u'country':'UK', #invariant
                              u'last_updated':'Tuesday', #variant
                              },
                    'resources':[{'url':'pollution.com/id/1',
                                  'description':'ons/id/1'}],
                    }
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert pkg.extras['country'] == pkg_dict['extras']['country']
        assert pkg.extras['last_updated'] == pkg_dict['extras']['last_updated']
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)
        # same resource id (ons/id/1) so it is replaced, not appended
        assert len(pkg.resources) == 1, pkg.resources
        assert pkg.resources[0].url == pkg_dict['resources'][0]['url'], pkg.resources[0].url
        assert pkg.resources[0].description == pkg_dict['resources'][0]['description'], pkg.resources[0]['description']

        # load the same package: same title, department, new resource
        pkg_dict2 = {'name':u'pollution',
                     'title':u'Pollution',
                     'extras':{u'department':'air',
                               u'country':'UK', #invariant
                               u'last_updated':'Tuesday', #variant
                               },
                     'resources':[{'url':'pollution.com/id/3',
                                   'description':'ons/id/3'}],
                     }
        self.loader.load_package(pkg_dict2)
        pkg = model.Package.by_name(pkg_dict2['name'])
        assert pkg
        assert pkg.name == pkg_dict2['name']
        assert pkg.title == pkg_dict2['title']
        assert pkg.extras['country'] == pkg_dict2['extras']['country']
        assert pkg.extras['last_updated'] == pkg_dict2['extras']['last_updated']
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)
        # new resource id (ons/id/3) so it is appended
        assert len(pkg.resources) == 2, pkg.resources
        print pkg.resources
        assert_equal(pkg.resources[0].url, pkg_dict['resources'][0]['url'])
        assert pkg.resources[0].description == pkg_dict['resources'][0]['description'], pkg.resources[0]['description']
        assert pkg.resources[1].url == pkg_dict2['resources'][0]['url'], pkg.resources[1].url
        assert pkg.resources[1].description == pkg_dict2['resources'][0]['description'], pkg.resources[1]['description']

        # load the different package: because of different department
        pkg_dict3 = {'name':u'pollution',
                     'title':u'Pollution',
                     'extras':{u'department':'river',
                               u'country':'UK', #invariant
                               u'last_updated':'Tuesday', #variant
                               },
                     'resources':[{'url':'pollution.com/id/3',
                                   'description':'Lots of pollution | ons/id/3'}],
                     }
        self.loader.load_package(pkg_dict3)
        CreateTestData.flag_for_deletion('pollution_')
        assert count_pkgs() == num_pkgs + 3, (count_pkgs() - num_pkgs)
        pkg_names = [pkg.name for pkg in model.Session.query(model.Package).all()]
        pkg = model.Package.by_name(u'pollution_')
        assert pkg
        assert pkg.extras['department'] == pkg_dict3['extras']['department']

        # load the same package: but with different country
        # should just get a warning
        pkg_dict4 = {'name':u'pollution',
                     'title':u'Pollution',
                     'extras':OrderedDict([
                         (u'department', 'air'),
                         (u'country', 'UK and France'), #invariant
                         (u'last_updated', 'Tuesday'), #variant
                         ]),
                     'resources':[OrderedDict([
                         ('url', 'pollution.com/id/3'),
                         ('description', 'Lots of pollution | ons/id/3'),
                         ])],
                     }
        self.loader.load_package(pkg_dict4)
        pkg = model.Package.by_name(pkg_dict4['name'])
        assert pkg
        assert pkg.name == pkg_dict4['name']
        assert pkg.title == pkg_dict4['title']
        # the changed invariant ('country') is still written - only warned
        assert pkg.extras['country'] == pkg_dict4['extras']['country']
        assert pkg.extras['last_updated'] == pkg_dict4['extras']['last_updated']
        assert count_pkgs() == num_pkgs + 3, (count_pkgs() - num_pkgs)
        assert len(pkg.resources) == 2, pkg.resources
        assert pkg.resources[0].url == pkg_dict['resources'][0]['url'], pkg.resources[0].url
        assert pkg.resources[0].description == pkg_dict['resources'][0]['description'], pkg.resources[0]['description']
        assert pkg.resources[1].url == pkg_dict4['resources'][0]['url'], pkg.resources[1].url
        assert pkg.resources[1].description == pkg_dict4['resources'][0]['description'], pkg.resources[1]['description']


class TestLoaderInsertingResourcesWithSynonym(TestLoaderBase):
    # As TestLoaderInsertingResources, but 'air' and 'sky' are declared
    # synonymous department values, so either should match the same package.
    @classmethod
    def setup_class(self):
        self.tsi = TestSearchIndexer()
        super(TestLoaderInsertingResourcesWithSynonym, self).setup_class()
        self.loader = ResourceSeriesLoader(
            self.testclient,
            ['title', 'department'],
            'ons/id/',
            field_keys_to_expect_invariant=['country'],
            synonyms={'department': [('air', 'sky')]}
            )

    # teardown is in the base class

    def test_0_search_options(self):
        loader = ResourceSeriesLoader(
            self.testclient,
            ['title', 'department'],
            'ons/id/',
            field_keys_to_expect_invariant=['country'],
            synonyms={'department': [('dept1', 'dept2', 'dept3')],
                      'title': [('titleA', 'titleB', 'titleC')]}
            )
        field_keys = ['title', 'department']
        pkg_dict = {'title':'titleA',
                    'extras':{'department':'dept1'}}
        opts = loader._get_search_options(field_keys, pkg_dict)
        # expected: the base options plus the full cross-product of synonym
        # substitutions for both fields (9 combinations in total)
        # NOTE(review): assert_equal is imported at module level from
        # nose.tools; self.assert_equal presumably comes from the
        # TestController base class - confirm it exists there, otherwise
        # this raises AttributeError.
        self.assert_equal(opts, [{'department': 'dept1', 'title': 'titleA'}, {'department': 'dept2', 'title': 'titleA'}, {'department': 'dept3', 'title': 'titleA'}, {'department': 'dept1', 'title': 'titleB'}, {'department': 'dept1', 'title': 'titleC'}, {'department': 'dept2', 'title': 'titleB'}, {'department': 'dept2', 'title': 'titleC'}, {'department': 'dept3', 'title': 'titleB'}, {'department': 'dept3', 'title': 'titleC'}])

    def test_1_reload(self):
        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pollution',
                    'title':u'Pollution',
                    'extras':{u'department':'air',
                              u'country':'UK', #invariant
                              u'last_updated':'Monday', #variant
                              },
                    'resources':[{'url':'pollution.com/1',
                                  'description':'ons/id/1'}],
                    }
        bogus_dict = {'name':u'bogus',
                      'title':u'Pollution',
                      'extras':{u'department':'water',
                                u'country':'UK',
                                u'last_updated':'Monday',
                                },
                      'resources':[{'url':'pollution.com/2',
                                    'description':'ons/id/2'}],
                      }
        assert not model.Package.by_name(pkg_dict['name'])
        assert not model.Package.by_name(bogus_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict, bogus_dict])
        self.tsi.index()
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)
        assert len(pkg.resources) == 1, pkg.resources

        # load the similar package: same title, updated resource,
        # BUT synonym department
        pkg_dict = {'name':u'pollution',
                    'title':u'Pollution',
                    'extras':{u'department':'sky',
                              u'country':'UK', #invariant
                              u'last_updated':'Tuesday', #variant
                              },
                    'resources':[{'url':'pollution.com/id/1',
                                  'description':'ons/id/1'}],
                    }
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert pkg.extras['country'] == pkg_dict['extras']['country']
        assert pkg.extras['last_updated'] == pkg_dict['extras']['last_updated']
        # 'sky' matched 'air' via the synonym - no new package created
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)
        assert len(pkg.resources) == 1, pkg.resources
        assert pkg.resources[0].url == pkg_dict['resources'][0]['url'], pkg.resources[0].url
        assert pkg.resources[0].description == pkg_dict['resources'][0]['description'], pkg.resources[0]['description']

        # load the different package: because of different department
        pkg_dict3 = {'name':u'pollution',
                     'title':u'Pollution',
                     'extras':{u'department':'river',
                               u'country':'UK', #invariant
                               u'last_updated':'Tuesday', #variant
                               },
                     'resources':[{'url':'pollution.com/id/3',
                                   'description':'Lots of pollution | ons/id/3'}],
                     }
        self.loader.load_package(pkg_dict3)
        CreateTestData.flag_for_deletion('pollution_')
        assert count_pkgs() == num_pkgs + 3, (count_pkgs() - num_pkgs)
        pkg_names = [pkg.name for pkg in model.Session.query(model.Package).all()]
        pkg = model.Package.by_name(u'pollution_')
        assert pkg
        assert pkg.extras['department'] == pkg_dict3['extras']['department']

class TestLoaderNoIndexing(TestLoaderBase):
    '''This checks you can re-load a package when the package name
    is unchanged, yet it is not search indexed (due to a problem with that).

    '''
    @classmethod
    def setup_class(self):
        # No TestSearchIndexer is initialised.
        if not is_search_supported():
            raise SkipTest("Search not supported")
        super(TestLoaderNoIndexing, self).setup_class()
        self.loader = ReplaceByExtraFieldLoader(self.testclient, 'ref')

    # teardown is in the base class

    def test_0_reload(self):
        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris',
                    'extras':{u'ref':'boris'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict])
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package with same name and ref
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris 2',
                    'extras':{u'ref':'boris'}}
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

    def test_1_reload_with_underscores(self):
        # Create decoy package
        pkg_dict = {'name':u'pkgname1',
                    'title':u'Old package decoy',
                    'extras':{u'ref':'decoy'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict])

        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pkgname1_',
                    'title':u'The real Helga',
                    'extras':{u'ref':'helga'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict])
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package with same name and ref
        pkg_dict = {'name':u'pkgname1',
                    'title':u'Helga updated',
                    'extras':{u'ref':'helga'}}
        self.loader.load_package(pkg_dict)
        # matched by ref, so the underscored package is the one updated
        pkg = model.Package.by_name(u'pkgname1_')
        assert pkg
        assert_equal(pkg.title, pkg_dict['title'])
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        decoy = model.Package.by_name(u'pkgname1')
        assert decoy
        assert_equal(decoy.title, u'Old package decoy')

        pkg = model.Package.by_name(u'pkgname1_')
        assert pkg
        assert_equal(pkg.title, u'Helga updated')

        assert not model.Package.by_name(u'pkgname1__')