├── setup.cfg ├── ckanext ├── __init__.py └── importlib │ ├── __init__.py │ ├── tests │ ├── __init__.py │ ├── samples │ │ ├── test_importer_full.xls │ │ ├── test_importer_example.xls │ │ ├── test_importer_bis_example.xls │ │ ├── test_importer_example.csv │ │ └── test_importer_full.csv │ ├── test_spreadsheet_importer.py │ ├── test_spreadsheet_import_files.py │ └── test_loader.py │ ├── api_command.py │ ├── command.py │ ├── importer.py │ ├── spreadsheet_importer.py │ └── loader.py ├── .gitignore ├── pip-requirements.txt ├── setup.py ├── test.ini └── README.txt /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | with-pylons = test.ini 3 | -------------------------------------------------------------------------------- /ckanext/__init__.py: -------------------------------------------------------------------------------- 1 | __import__("pkg_resources").declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /ckanext/importlib/__init__.py: -------------------------------------------------------------------------------- 1 | __import__("pkg_resources").declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /ckanext/importlib/tests/__init__.py: -------------------------------------------------------------------------------- 1 | __import__("pkg_resources").declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | syntax:glob 2 | # generic 3 | *.pyc 4 | *~ 5 | .DS_Store 6 | *.egg-info/* 7 | sandbox/* 8 | -------------------------------------------------------------------------------- /pip-requirements.txt: -------------------------------------------------------------------------------- 1 | xlrd 2 | xlwt 3 | -e 
git+https://github.com/okfn/ckanclient.git#egg=ckanclient 4 | -------------------------------------------------------------------------------- /ckanext/importlib/tests/samples/test_importer_full.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okfn/ckanext-importlib/master/ckanext/importlib/tests/samples/test_importer_full.xls -------------------------------------------------------------------------------- /ckanext/importlib/tests/samples/test_importer_example.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okfn/ckanext-importlib/master/ckanext/importlib/tests/samples/test_importer_example.xls -------------------------------------------------------------------------------- /ckanext/importlib/tests/samples/test_importer_bis_example.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okfn/ckanext-importlib/master/ckanext/importlib/tests/samples/test_importer_bis_example.xls -------------------------------------------------------------------------------- /ckanext/importlib/tests/samples/test_importer_example.csv: -------------------------------------------------------------------------------- 1 | "name","title","resource-0-url","resource-0-format","resource-0-description","tags" 2 | "wikipedia","Wikipedia","http://static.wikipedia.org/downloads/2008-06/en/wikipedia-en-html.tar.7z","html","In English","encyclopedia reference" 3 | "tviv","TV IV","http://tviv.org/Category:Grids","","","tv encyclopedia" 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='ckanext-importlib', 5 | version='0.1', 6 | author='Open Knowledge Foundation', 7 | author_email='info@okfn.org', 8 | 
license='AGPL', 9 | url='http://ckan.org/', 10 | description='CKAN importer and loader library', 11 | keywords='data packaging component tool server', 12 | namespace_packages=['ckanext', 'ckanext.importlib'], 13 | install_requires=[ 14 | # List of dependencies is moved to pip-requirements.txt 15 | # to avoid conflicts with Debian packaging. 16 | #'xlrd>=0.7.1', 17 | #'xlwt>=0.7.2', 18 | ], 19 | packages=find_packages(exclude=['ez_setup']), 20 | include_package_data=True, 21 | package_data={'ckan': ['i18n/*/LC_MESSAGES/*.mo']}, 22 | entry_points=""" 23 | """, 24 | test_suite = 'nose.collector', 25 | ) 26 | -------------------------------------------------------------------------------- /test.ini: -------------------------------------------------------------------------------- 1 | # 2 | # ckan - Pylons testing environment configuration 3 | # 4 | # The %(here)s variable will be replaced with the parent directory of this file 5 | # 6 | [DEFAULT] 7 | debug = true 8 | # Uncomment and replace with the address which should receive any error reports 9 | #email_to = you@yourdomain.com 10 | smtp_server = localhost 11 | error_email_from = paste@localhost 12 | 13 | [server:main] 14 | use = egg:Paste#http 15 | host = 0.0.0.0 16 | port = 5000 17 | 18 | 19 | [app:main] 20 | use = config:../ckan/test.ini 21 | 22 | 23 | # Logging configuration 24 | [loggers] 25 | keys = root, ckan, sqlalchemy 26 | 27 | [handlers] 28 | keys = console 29 | 30 | [formatters] 31 | keys = generic 32 | 33 | [logger_root] 34 | level = WARN 35 | handlers = console 36 | 37 | [logger_ckan] 38 | qualname = ckan 39 | handlers = 40 | level = INFO 41 | 42 | [logger_sqlalchemy] 43 | handlers = 44 | qualname = sqlalchemy.engine 45 | level = WARN 46 | 47 | [handler_console] 48 | class = StreamHandler 49 | args = (sys.stdout,) 50 | level = NOTSET 51 | formatter = generic 52 | 53 | [formatter_generic] 54 | format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s 55 | 
-------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Library for importing datasets into CKAN using the API. 2 | 3 | Introduction 4 | ============ 5 | 6 | A one-off import of metadata into CKAN isn't very hard, and using ckanclient directly is probably best for that. But when you are importing continuously you face some challenges which this library aims to help with: 7 | 8 | * when you reimport a dataset you want to check whether it already exists in CKAN, using an ID stored in an extra field and possibly another extra field naming the source 9 | 10 | * you may import resources, which become grouped into datasets (e.g. time series data) - ResourceSeriesLoader 11 | 12 | * when you derive a unique name for a dataset from its title, you need to avoid clashes. 13 | 14 | ckanext-importlib was designed as a framework to be expanded, based on the needs of the data.gov.uk ONS importer. In practice, however, it is not very flexible. Even if you don't use it directly, you may want to borrow ideas and code from it. 
15 | 16 | Quickstart 17 | ========== 18 | 19 | To get the code:: 20 | 21 | hg clone https://github.com/okfn/ckanext-importlib.git 22 | 23 | The code also requires installed: 24 | * importlib dependencies (pip-requirements.txt) 25 | * ckan 26 | * ckan dependencies (ckan/pip-requirements.txt) 27 | 28 | To install the dependencies into a virtual environment:: 29 | 30 | virtualenv pyenv 31 | pip -E pyenv install -e ../ckanext-importlib 32 | pip -E pyenv install -e ckan 33 | pip -E ../pyenv-ckanext-importlib install -r ../ckan/pip-requirements.txt 34 | pip -E pyenv install -r pip-requirements.txt 35 | 36 | 37 | Tests 38 | ===== 39 | 40 | To run the tests:: 41 | 42 | pip -E pyenv install -e nose 43 | cd ckanext-importlib 44 | nosetests --ckan ckanext/importlib/tests/ 45 | -------------------------------------------------------------------------------- /ckanext/importlib/api_command.py: -------------------------------------------------------------------------------- 1 | from command import Command 2 | 3 | from ckanclient import CkanClient 4 | 5 | class ApiCommand(Command): 6 | def __init__(self, usage=None): 7 | ''' 8 | Base class for commands that use the API 9 | ''' 10 | self.parser = Command.StandardParser(usage=usage) 11 | super(ApiCommand, self).__init__() 12 | 13 | def add_options(self): 14 | self.parser.add_option("-H", "--host", 15 | dest="api_url", 16 | help="API URL (e.g.: http://test.ckan.net/api)") 17 | self.parser.add_option("-k", "--key", 18 | dest="api_key", 19 | help="API Key (required)") 20 | self.parser.add_option("-u", "--username", 21 | dest="username", 22 | help="Username for HTTP Basic Authentication") 23 | self.parser.add_option("-p", "--password", 24 | dest="password", 25 | help="Password for HTTP Basic Authentication") 26 | 27 | def command(self): 28 | super(ApiCommand, self).command() 29 | if not self.options.api_key: 30 | self.parser.error('Please specify an API Key') 31 | if not self.options.api_url: 32 | self.parser.error('Please specify an 
API URL') 33 | if self.options.api_url: 34 | if not (self.options.api_url.startswith('http://') or \ 35 | self.options.api_url.startswith('https://')): 36 | self.parser.error('--host must start with "http://"') 37 | if not '/api' in self.options.api_url: 38 | self.parser.error('--host must have "/api" towards the end') 39 | user_agent = self.user_agent if hasattr(self, 'user_agent') else 'ckanext-importlib/ApiCommand' 40 | 41 | self.client = CkanClient(base_location=self.options.api_url, 42 | api_key=self.options.api_key, 43 | http_user=self.options.username, 44 | http_pass=self.options.password, 45 | is_verbose=True, 46 | user_agent=user_agent) 47 | 48 | # now do command 49 | -------------------------------------------------------------------------------- /ckanext/importlib/tests/samples/test_importer_full.csv: -------------------------------------------------------------------------------- 1 | "name","title","version","url","author","author_email","maintainer","maintainer_email","notes","state","license","isopen","tags","groups","ckan_url","relationships","metadata_modified","metadata_created","notes_rendered","genre","original media","resource-0-url","resource-0-format","resource-0-description","resource-0-hash","resource-0-name","resource-0-resource_type","resource-0-mimetype","resource-0-mimetype_inner","resource-0-size","resource-0-last_modified","resource-0-cache_url","resource-0-cache_last_updated","resource-0-webstore_url","resource-0-webstore_last_updated","resource-0-alt_url","resource-0-size_extra","resource-1-url","resource-1-format","resource-1-description","resource-1-hash","resource-1-name","resource-1-resource_type","resource-1-mimetype","resource-1-mimetype_inner","resource-1-size","resource-1-last_modified","resource-1-cache_url","resource-1-cache_last_updated","resource-1-webstore_url","resource-1-webstore_last_updated","resource-1-alt_url","resource-1-size_extra" 2 | "annakarenina","A Novel By 
Tolstoy","0.7a","http://www.annakarenina.com","","","","","Some test notes 3 | 4 | ### A 3rd level heading 5 | 6 | **Some bolded text.** 7 | 8 | *Some italicized text.* 9 | 10 | Foreign characters: 11 | u with umlaut ü 12 | 66-style quote “ 13 | foreign word: thümb 14 | 15 | Needs escaping: 16 | left arrow < 17 | 18 | 19 | 20 | ","active","OKD Compliant::Other (Open)",True,"Flexible ァ russian tolstoy","david roger","http://test.ckan.net/dataset/annakarenina","","2011-12-09T17:15:57.440192","2011-12-09T17:15:57.440192","

Some test notes 21 |

22 | 23 |

A 3rd level heading

24 |

Some bolded text. 25 |

26 |

Some italicized text. 27 |

28 |

Foreign characters: 29 | u with umlaut ü 30 | 66-style quote “ 31 | foreign word: thümb 32 |

33 |

Needs escaping: 34 | left arrow < 35 |

36 |

http://ckan.net/ 37 |

","romantic novel","book","http://www.annakarenina.com/download/x=1&y=2","plain text","Full text. Needs escaping: "" Umlaut: ü","abc123","","","","","","","","","","","alt123","123","http://www.annakarenina.com/index.json","json","Index of the novel","def456","","","","","","","","","","","alt345","345" 38 | "warandpeace","A Wonderful Story","","","","","","","","active","Non-OKD Compliant::Creative Commons Non-Commercial (Any)",False,"Flexible ァ russian","david","http://test.ckan.net/dataset/warandpeace","","2011-12-09T17:15:57.440192","2011-12-09T17:15:57.440192","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","" 39 | -------------------------------------------------------------------------------- /ckanext/importlib/command.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from optparse import OptionParser 4 | import logging 5 | from ConfigParser import ConfigParser 6 | 7 | class Command(object): 8 | """ 9 | (this class is copied from :module:`ordf.command`) 10 | 11 | This class is very similar to :class:`paste.script.command.Command` but 12 | rather than implementing a :program:`paster` plugin it is for stand-alone 13 | command line programs. To implement a command line program, sub-class this 14 | class, and make a minimal method to instantiate and run it. As with the 15 | paster counterpart you have to add an option parser and a method called 16 | :meth:`command`. A minimal example: 17 | 18 | .. 
code-block:: python 19 | 20 | class Hello(Command): 21 | def command(self): 22 | print "hello world" 23 | 24 | def hello(): 25 | Hello().command() 26 | 27 | To create the actual script, in your package's *setup.py* needs an entry 28 | point like:: 29 | 30 | [console_scripts] 31 | hello=mypackage.command:hello 32 | 33 | and then run one of:: 34 | 35 | % python setup.py develop 36 | % python setup.py install 37 | """ 38 | def __init__(self): 39 | usage = self.usage if hasattr(self, 'usage') else None 40 | self.parser = Command.StandardParser(usage=usage) 41 | self.add_options() 42 | self.parse_args() 43 | self.setup_logging() 44 | super(Command, self).__init__() 45 | 46 | @classmethod 47 | def StandardParser(cls, *av, **kw): 48 | parser = OptionParser(*av, **kw) 49 | parser.add_option("-l", "--logfile", 50 | dest="logfile", default=None, 51 | help="log to file") 52 | parser.add_option("-v", "--verbosity", 53 | dest="verbosity", default="info", 54 | help="log verbosity. one of debug, info, warning, error, critical") 55 | return parser 56 | 57 | def parse_args(self): 58 | self.options, self.args = self.parser.parse_args() 59 | 60 | def add_options(self): 61 | pass 62 | 63 | def setup_logging(self): 64 | ## set up logging 65 | logcfg = { 66 | "level": logging.INFO, 67 | "format": "%(asctime)s %(levelname)s [%(name)s] %(message)s", 68 | } 69 | if self.options.logfile: 70 | logcfg["filename"] = self.options.logfile 71 | if self.options.verbosity: 72 | levels = { 73 | "debug": logging.DEBUG, 74 | "info": logging.INFO, 75 | "warning": logging.WARNING, 76 | "error": logging.ERROR, 77 | "critical": logging.CRITICAL 78 | } 79 | logcfg["level"] = levels.get(self.options.verbosity, logging.NOTSET) 80 | logging.basicConfig(**logcfg) 81 | 82 | def command(self): 83 | pass 84 | 85 | def config(filename): 86 | cfgpath = os.path.abspath(filename) 87 | cfgfile = ConfigParser({ "here": os.path.dirname(cfgpath) }) 88 | cfgfile.read(cfgpath) 89 | 90 | cfg = {} 91 | if 
cfgfile.has_section("app:main"): 92 | cfg.update(cfgfile.items("app:main")) 93 | return cfg 94 | 95 | class ConfiguredCommand(Command): 96 | '''The same as Command, only with the --config option.''' 97 | def __init__(self): 98 | super(ConfiguredCommand, self).__init__() 99 | self.parse_config() 100 | 101 | @classmethod 102 | def StandardParser(cls, *av, **kw): 103 | parser = super(ConfiguredCommand, cls).StandardParser(*av, **kw) 104 | parser.add_option("-c", "--config", 105 | dest="config", default="development.ini", 106 | help="configuration file (default: development.ini)") 107 | return parser 108 | 109 | def parse_config(self): 110 | self.config = {} 111 | 112 | if self.options.config: 113 | cfg = config(self.options.config) 114 | self.config.update(cfg) 115 | 116 | -------------------------------------------------------------------------------- /ckanext/importlib/importer.py: -------------------------------------------------------------------------------- 1 | import StringIO 2 | 3 | import re 4 | import datetime 5 | 6 | class ImportException(Exception): 7 | pass 8 | 9 | class RowParseError(ImportException): 10 | pass 11 | 12 | class DataRecords(object): 13 | '''Represents raw data records in the form of a dictionary. 14 | (The raw data is not yet processed - it will be converted to package_dict 15 | in the next step.) 16 | ''' 17 | @property 18 | def records(self): 19 | '''Yields each record as a dict.''' 20 | raise NotImplementedError 21 | 22 | 23 | class PackageImporter(object): 24 | '''Base class for an importer that converts a particular file type 25 | and creates corresponding package dictionaries.''' 26 | _log = [] 27 | 28 | def __init__(self, filepath=None, buf=None): 29 | assert filepath or buf, 'Must specify a filepath or a buf.' 
30 | self._filepath = filepath 31 | self._buf = buf 32 | self.import_into_package_records() 33 | 34 | def import_into_package_records(self): 35 | '''Reads in the source file given by self._filepath and 36 | stores the resulting DataRecords in self._package_data_records.''' 37 | raise NotImplementedError() 38 | 39 | @classmethod 40 | def log(cls, msg): 41 | cls._log.append(msg) 42 | 43 | @classmethod 44 | def get_log(cls): 45 | return cls._log 46 | 47 | @classmethod 48 | def clear_log(cls): 49 | cls._log = [] 50 | 51 | def record_2_package(self, record_dict): 52 | '''Converts a raw record into a package dictionary. 53 | @param record_dict - the raw record 54 | @return - pkg_dict''' 55 | raise NotImplementedError() 56 | 57 | def pkg_dict(self): 58 | '''Generates package dicts from the package data records.''' 59 | for row_dict in self._package_data_records.records: 60 | try: 61 | yield self.record_2_package(row_dict) 62 | except RowParseError, e: 63 | print 'Error with row', e 64 | raise StopIteration 65 | 66 | @classmethod 67 | def license_2_license_id(self, license_title, logger=None): 68 | # import is here, as it creates a dependency on ckan, which 69 | # many importers won't want 70 | from ckan.model.license import LicenseRegister 71 | 72 | licenses = LicenseRegister() 73 | license_obj = licenses.get_by_title(license_title) 74 | if license_obj: 75 | return u'%s' % license_obj.id 76 | else: 77 | logger('Warning: No license name matches \'%s\'. Ignoring license.' % license_title) 78 | 79 | 80 | @classmethod 81 | def munge(self, name): 82 | '''Munge a title into a name. 83 | 84 | Note this function must be only carefully changed, as reimporting 85 | data with a name munged differently may create duplicates packages. 86 | For this reason, this munge function is for use by the importers only. 87 | Other users should use the API slug creation functionality. 
88 | ''' 89 | # import is here, as it creates a dependency on ckan, which 90 | # many importers won't want 91 | import ckan.model as model 92 | 93 | # convert spaces to underscores 94 | name = re.sub(' ', '_', name).lower() 95 | # convert symbols to dashes 96 | name = re.sub('[:]', '_-', name).lower() 97 | name = re.sub('[/]', '-', name).lower() 98 | # take out not-allowed characters 99 | name = re.sub('[^a-zA-Z0-9-_]', '', name).lower() 100 | # remove double underscores 101 | name = re.sub('__', '_', name).lower() 102 | # if longer than max_length, keep last word if a year 103 | max_length = model.PACKAGE_NAME_MAX_LENGTH - 5 104 | # (make length less than max, in case we need a few for '_' chars 105 | # to de-clash names.) 106 | if len(name) > max_length: 107 | year_match = re.match('.*?[_-]((?:\d{2,4}[-/])?\d{2,4})$', name) 108 | if year_match: 109 | year = year_match.groups()[0] 110 | name = '%s-%s' % (name[:(max_length-len(year)-1)], year) 111 | else: 112 | name = name[:max_length] 113 | return name 114 | 115 | @classmethod 116 | def name_munge(self, input_name): 117 | '''Munges the name field in case it is not to spec. 118 | 119 | Note this function must be only carefully changed, as reimporting 120 | data with a name munged differently may create duplicates packages. 121 | For this reason, this munge function is for use by the importers only. 122 | Other users should use the API slug creation functionality. 
123 | ''' 124 | return self.munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and')) 125 | 126 | @classmethod 127 | def tidy_url(self, url, logger=None): 128 | if url and not url.startswith('http') and not url.startswith('webcal:'): 129 | if url.startswith('www.'): 130 | url = url.replace('www.', 'http://www.') 131 | else: 132 | logger('Warning: URL doesn\'t start with http: %s' % url) 133 | return url 134 | 135 | 136 | -------------------------------------------------------------------------------- /ckanext/importlib/tests/test_spreadsheet_importer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pylons import config 4 | 5 | import ckanext.importlib.spreadsheet_importer as spreadsheet_importer 6 | 7 | TEST_DIR = os.path.dirname(os.path.abspath(__file__)) 8 | EXAMPLES_DIR = os.path.join(TEST_DIR, 'samples') 9 | EXAMPLE_FILEBASE = 'test_importer' 10 | EXAMPLE_TESTFILE_SUFFIX = '_example' 11 | EXAMPLE_BIS_TESTFILE_SUFFIX = '_bis_example' 12 | XL_EXTENSION = '.xls' 13 | CSV_EXTENSION = '.csv' 14 | EXTENSIONS = [CSV_EXTENSION, XL_EXTENSION] 15 | SPREADSHEET_DATA_MAP = {XL_EXTENSION:spreadsheet_importer.XlData, 16 | CSV_EXTENSION:spreadsheet_importer.CsvData} 17 | 18 | class ExampleFiles(object): 19 | def __init__(self, examples_dir, example_filebase): 20 | ''' 21 | Easy accessor for info about test fixture files. 
22 | @param examples_dir - absolute 23 | ''' 24 | self.examples_dir = examples_dir 25 | self.example_filebase = example_filebase 26 | 27 | def get_spreadsheet_filepath(self, test_file_suffix, extension): 28 | return os.path.join(self.examples_dir, self.example_filebase + test_file_suffix + extension) 29 | 30 | def get_data(self, test_file_suffix, extension=XL_EXTENSION): 31 | logger = BasicLogger() 32 | filepath = self.get_spreadsheet_filepath(test_file_suffix, extension) 33 | return SPREADSHEET_DATA_MAP[extension](logger, filepath=filepath) 34 | 35 | examples = ExampleFiles(EXAMPLES_DIR, EXAMPLE_FILEBASE) 36 | 37 | class BasicLogger: 38 | def __init__(self): 39 | self.log = [] 40 | 41 | 42 | class TestSpreadsheetData: 43 | def test_0_example_file_by_filepath(self): 44 | for extension in EXTENSIONS: 45 | logger = BasicLogger() 46 | filepath = examples.get_spreadsheet_filepath(EXAMPLE_TESTFILE_SUFFIX, extension) 47 | data = SPREADSHEET_DATA_MAP[extension](logger, filepath=filepath) 48 | self.assert_example_data(data) 49 | assert logger.log == [], logger.log 50 | 51 | def test_1_example_file_by_buf(self): 52 | for extension in EXTENSIONS: 53 | logger = BasicLogger() 54 | filepath = examples.get_spreadsheet_filepath(EXAMPLE_TESTFILE_SUFFIX, extension) 55 | f = open(filepath, 'rb') 56 | buf = f.read() 57 | f.close() 58 | data = SPREADSHEET_DATA_MAP[extension](logger, buf=buf) 59 | self.assert_example_data(data) 60 | assert logger.log == [], logger.log 61 | 62 | def assert_example_data(self, data): 63 | num_rows = data.get_num_rows() 64 | assert 3 <= num_rows <= 4, num_rows 65 | rows = data.get_all_rows() 66 | assert len(rows) == num_rows 67 | first_row = data.get_row(0) 68 | assert first_row == rows[0] 69 | assert rows[0] == [u'name', u'title', u'resource-0-url', u'resource-0-format', u'resource-0-description', u'tags'], rows[0] 70 | assert rows[1] == [u'wikipedia', u'Wikipedia', u'http://static.wikipedia.org/downloads/2008-06/en/wikipedia-en-html.tar.7z', u'html', 
u'In English', u'encyclopedia reference'], rows[1] 71 | # xl gives None and csv gives u'' for blank cells 72 | assert rows[2] == [u'tviv', u'TV IV', u'http://tviv.org/Category:Grids', u'', u'', u'tv encyclopedia'] or \ 73 | rows[2] == [u'tviv', u'TV IV', u'http://tviv.org/Category:Grids', None, None, u'tv encyclopedia'], rows[2] 74 | if num_rows == 4: 75 | assert rows[3] == [], rows[3] 76 | 77 | class TestDataRecords: 78 | def test_0_example(self): 79 | data = examples.get_data(EXAMPLE_TESTFILE_SUFFIX, XL_EXTENSION) 80 | data_records = spreadsheet_importer.SpreadsheetDataRecords(data, 'title') 81 | assert data_records.titles == data.get_row(0), data_records.titles 82 | records = [record for record in data_records.records] 83 | assert len(records) == 2, records 84 | assert records[0].items() == [ 85 | (u'name', u'wikipedia'), 86 | (u'title', u'Wikipedia'), 87 | (u'resource-0-url', u'http://static.wikipedia.org/downloads/2008-06/en/wikipedia-en-html.tar.7z'), 88 | (u'resource-0-format', u'html'), 89 | (u'resource-0-description', u'In English'), 90 | (u'tags', u'encyclopedia reference'), 91 | ], records[0].items() 92 | assert records[1].items() == [ 93 | (u'name', u'tviv'), 94 | (u'title', u'TV IV'), 95 | (u'resource-0-url', u'http://tviv.org/Category:Grids'), 96 | (u'resource-0-format', None), 97 | (u'resource-0-description', None), 98 | (u'tags', u'tv encyclopedia'), 99 | ], records[1].items() 100 | 101 | def test_1_bis_example(self): 102 | data = examples.get_data(EXAMPLE_BIS_TESTFILE_SUFFIX, XL_EXTENSION) 103 | data_records = spreadsheet_importer.SpreadsheetDataRecords(data, 'Dataset Ref#') 104 | assert data_records.titles[:3] == [None, 'Dataset Ref#', 'Dataset Status'], data_records.titles 105 | records = [record for record in data_records.records] 106 | assert len(records) == 2, records 107 | assert records[0]['Dataset Ref#'] == 'BIS-000002', records[0]['Dataset Ref#'] 108 | assert records[1]['Dataset Ref#'] == 'BIS-000003', records[1]['Dataset Ref#'] 109 | 110 
| class TestPackageImporter: 111 | def test_munge(self): 112 | def test_munge(title, expected_munge): 113 | munge = spreadsheet_importer.SpreadsheetPackageImporter.munge(title) 114 | assert munge == expected_munge, 'Got %s not %s' % (munge, expected_munge) 115 | test_munge('Adult participation in learning', 'adult_participation_in_learning') 116 | test_munge('Alcohol Profile: Alcohol-specific hospital admission, males', 'alcohol_profile_-_alcohol-specific_hospital_admission_males') 117 | test_munge('Age and limiting long-term illness by NS-SeC', 'age_and_limiting_long-term_illness_by_ns-sec') 118 | test_munge('Higher Education Statistics: HE qualifications obtained in the UK by level, mode of study, domicile, gender, class of first degree and subject area 2001/02', 'higher_education_statistics_-_he_qualifications_obtained_in_the_uk_by_level_mode_of_stu-2001-02') 119 | 120 | def test_0_example_by_filepath(self): 121 | for extension in EXTENSIONS: 122 | filepath = examples.get_spreadsheet_filepath(EXAMPLE_TESTFILE_SUFFIX, extension) 123 | package_import = spreadsheet_importer.SpreadsheetPackageImporter(filepath=filepath) 124 | self.assert_example_package_import(package_import) 125 | 126 | def assert_example_package_import(self, package_import): 127 | pkg_dicts = [pkg_dict for pkg_dict in package_import.pkg_dict()] 128 | assert len(pkg_dicts) == 2, pkg_dicts 129 | assert pkg_dicts[0].items() == [(u'name', u'wikipedia'), (u'title', u'Wikipedia'), ('resources', [{'url': u'http://static.wikipedia.org/downloads/2008-06/en/wikipedia-en-html.tar.7z', 'alt_url': u'', 'hash': u'', 'description': u'In English', 'format': u'html'}]), (u'tags', u'encyclopedia reference')], pkg_dicts[0].items() 130 | assert pkg_dicts[1].items() == [(u'name', u'tviv'), (u'title', u'TV IV'), ('resources', [{'url': u'http://tviv.org/Category:Grids', 'alt_url': u'', 'hash': u'', 'description': u'', 'format': u''}]), (u'tags', u'tv encyclopedia')], pkg_dicts[1].items() 131 | 
-------------------------------------------------------------------------------- /ckanext/importlib/tests/test_spreadsheet_import_files.py: -------------------------------------------------------------------------------- 1 | import types 2 | import tempfile 3 | import os 4 | 5 | from sqlalchemy.util import OrderedDict 6 | from pylons import config 7 | 8 | import ckan.model as model 9 | from ckan.tests import * 10 | from ckanext.importlib import importer 11 | from ckanext.importlib import spreadsheet_importer 12 | from ckanext.importlib.spreadsheet_importer import readonly_keys 13 | import ckan.lib.dumper as dumper 14 | 15 | TEST_DIR = os.path.dirname(os.path.abspath(__file__)) 16 | TEST_FILES_DIR = os.path.join(TEST_DIR, 'samples') + '/' 17 | TEST_FILE_FULL = 'test_importer_full' 18 | TEST_FILE_EXAMPLE = 'test_importer_example' 19 | XL_EXTENSION = '.xls' 20 | CSV_EXTENSION = '.csv' 21 | EXTENSIONS = [XL_EXTENSION, CSV_EXTENSION] 22 | 23 | EXAMPLE_XL_DICTS = [ 24 | OrderedDict( 25 | [('name', 'wikipedia'), 26 | ('title', 'Wikipedia'), 27 | ('resource-0-url', 'http://static.wikipedia.org/downloads/2008-06/en/wikipedia-en-html.tar.7z'), 28 | ('resource-0-format', 'html'), 29 | ('resource-0-description', 'In English'), 30 | ('tags', 'encyclopedia reference')]), 31 | OrderedDict( 32 | [('name', 'tviv'), 33 | ('title', 'TV IV'), 34 | ('resource-0-url', 'http://tviv.org/Category:Grids'), 35 | ('tags', 'tv encyclopedia')]), 36 | ] 37 | 38 | pkg_to_xl_dict = dumper.PackagesXlWriter.pkg_to_xl_dict 39 | 40 | # This test recreates the sample files 41 | class Test0FilesCreation(TestController): 42 | @classmethod 43 | def setup_class(self): 44 | model.repo.init_db() 45 | CreateTestData.create() 46 | full_row_dicts = [pkg_to_xl_dict(pkg) for pkg in [model.Package.by_name(u'annakarenina'), model.Package.by_name(u'warandpeace')]] 47 | creators = [ (dumper.PackagesXlWriter, XL_EXTENSION), 48 | (dumper.PackagesCsvWriter, CSV_EXTENSION), 49 | ] 50 | for creator, extension in creators: 
51 | creator(full_row_dicts).save(open(TEST_FILES_DIR + TEST_FILE_FULL + extension, 'wb')) 52 | creator(EXAMPLE_XL_DICTS).save(open(TEST_FILES_DIR + TEST_FILE_EXAMPLE + extension, 'wb')) 53 | 54 | @classmethod 55 | def teardown_class(self): 56 | model.repo.rebuild_db() 57 | 58 | def test_exist(self): 59 | for filename in (TEST_FILE_EXAMPLE, TEST_FILE_FULL): 60 | for extension in EXTENSIONS: 61 | filepath = TEST_FILES_DIR + filename + extension 62 | assert os.path.exists(filepath), filepath 63 | 64 | class Test1Import(TestController): 65 | @classmethod 66 | def setup_class(self): 67 | model.Session.remove() 68 | model.repo.init_db() 69 | CreateTestData.create() 70 | anna = model.Package.by_name(u'annakarenina') 71 | war = model.Package.by_name(u'warandpeace') 72 | self.anna_xl_dict = pkg_to_xl_dict(anna) 73 | self.war_xl_dict = pkg_to_xl_dict(war) 74 | self.anna_fs_dict = pkg_to_fs_dict(anna) 75 | self.war_fs_dict = pkg_to_fs_dict(war) 76 | self.full_buf = {} # extension:filebuf 77 | for extension in EXTENSIONS: 78 | filepath = TEST_FILES_DIR + TEST_FILE_FULL + XL_EXTENSION 79 | assert os.path.exists(filepath) 80 | f = open(filepath) 81 | self.full_buf[extension] = f.read() 82 | f.close() 83 | 84 | @classmethod 85 | def teardown_class(self): 86 | model.Session.remove() 87 | model.repo.rebuild_db() 88 | 89 | def _get_row(self, sheet, row_index): 90 | return [cell.value for cell in sheet.row(row_index)] 91 | 92 | def test_0_pkg_to_xl_dict(self): 93 | d = self.anna_xl_dict 94 | for key, value in d.items(): 95 | assert isinstance(d[key], (str, unicode, types.NoneType)), '%s:%s %s' % (key, value, type(value)) 96 | for key in ['name', 'license', 'tags', 'groups', 'genre', 97 | 'notes_rendered', 'metadata_modified', 'metadata_created']: 98 | assert d.has_key(key), key 99 | for key in ['id', 'license_id', 'ratings_average', 'extras']: 100 | assert not d.has_key(key), key 101 | 102 | def test_1_pkg_to_fs_dict(self): 103 | d = self.anna_fs_dict 104 | for key, value in 
d.items(): 105 | if key == 'extras': 106 | assert isinstance(d[key], dict), '%s:%s %s' % (key, value, type(value)) 107 | elif key == 'resources': 108 | assert isinstance(d[key], list), '%s:%s %s' % (key, value, type(value)) 109 | # check each resource 110 | for value in d[key]: 111 | assert isinstance(value, dict), '%s %s' % (value, type(value)) 112 | else: 113 | assert isinstance(d[key], (str, unicode, types.NoneType)), '%s:%s %s' % (key, value, type(value)) 114 | for key in ['name', 'license_id', 'tags', 'groups', 'extras']: 115 | assert d.has_key(key), '%s not in %s' % (key, d) 116 | for key in ['id', 'license', 'ratings_average', 'genre', 'ckan_url']: 117 | assert not d.has_key(key), key 118 | 119 | def test_2_creator_xl_file(self): 120 | import xlrd 121 | assert self.full_buf[XL_EXTENSION] 122 | 123 | book = xlrd.open_workbook(file_contents=self.full_buf[XL_EXTENSION]) 124 | assert book.nsheets == 1, book.nsheets 125 | sheet = book.sheet_by_index(0) 126 | titles = self._get_row(sheet, 0) 127 | assert titles[:2] == ['name', 'title'], titles 128 | row1 = self._get_row(sheet, 1) 129 | assert row1[:2] == ['annakarenina', 'A Novel By Tolstoy'], row1 130 | row2 = self._get_row(sheet, 2) 131 | assert row2[:2] == ['warandpeace', 'A Wonderful Story'], row2 132 | 133 | def test_3_read_full_buf(self): 134 | comparison_dicts = [self.anna_fs_dict, self.war_fs_dict] 135 | for extension in EXTENSIONS: 136 | log = self._test_read(buf=self.full_buf[extension], expected_dicts=comparison_dicts) 137 | assert not log, log 138 | 139 | def test_3_read_full_file(self): 140 | comparison_dicts = [self.anna_fs_dict, self.war_fs_dict] 141 | for extension in EXTENSIONS: 142 | filepath = TEST_FILES_DIR + TEST_FILE_FULL + extension 143 | log = self._test_read(filepath=filepath, expected_dicts=comparison_dicts) 144 | assert not log, log 145 | 146 | def test_4_read_example_file(self): 147 | comparison_dicts = [pkg_xl_dict_to_fs_dict(xl_dict) for xl_dict in EXAMPLE_XL_DICTS] 148 | for 
extension in EXTENSIONS: 149 | log = self._test_read(filepath=TEST_FILES_DIR + TEST_FILE_EXAMPLE + extension, expected_dicts=comparison_dicts) 150 | 151 | def _test_read(self, buf=None, filepath=None, expected_dicts=None): 152 | reader = spreadsheet_importer.SpreadsheetPackageImporter(buf=buf, filepath=filepath) 153 | index = 0 154 | for pkg_dict in reader.pkg_dict(): 155 | for key, comp_val in expected_dicts[index].items(): 156 | err_msg = 'Package \'%s\', Key %s should be: \n%s' % (pkg_dict['name'], repr(key), repr(comp_val)) 157 | if comp_val: 158 | assert pkg_dict.has_key(key), err_msg 159 | err_msg += ', but is: \n%s' % (repr(pkg_dict[key])) 160 | if key in ('groups', 'tags'): 161 | #order doesn't matter 162 | a = set(); b = set() 163 | [a.add(val) for val in pkg_dict[key].split(' ')] 164 | [b.add(val) for val in comp_val.split(' ')] 165 | assert a == b, err_msg 166 | elif key == 'license_id': 167 | assert pkg_dict[key] == str(comp_val), err_msg 168 | else: 169 | assert pkg_dict[key] == comp_val, err_msg 170 | else: 171 | assert not pkg_dict.has_key(key), err_msg 172 | for key, val in pkg_dict.items(): 173 | comp_val = expected_dicts[index].get(key, None) 174 | assert not (val and not comp_val), 'Package \'%s\', Key \'%s\' with value \'%s\' appeared.' % (pkg_dict['name'], key, val) 175 | 176 | index += 1 177 | return reader.get_log() 178 | 179 | # TODO: (rgrp: 2010-11-16) 180 | # why is not in the ckan/lib/spreadsheet_importer.pkg_xl_dict_to_fs_dict(cls, # pkg_xl_dict, logger=None)? 181 | # furthermore why is that not in a core module (or forms) rather than there ... 182 | def pkg_to_fs_dict(pkg): 183 | '''Convert a Package object to a dictionary suitable for fieldset data. 184 | e.g. 
{'name':'annakarenina', 'resources':{'url':'anna.com'}}''' 185 | dict_ = pkg.as_dict() 186 | for key, value in dict_.items(): 187 | if key in readonly_keys: 188 | del dict_[key] 189 | if key=='resources': 190 | dict_[key] = [res.as_dict(core_columns_only=True) for res in pkg.resources] 191 | elif isinstance(value, (list, tuple)): 192 | dict_[key] = ' '.join(value) 193 | return dict_ 194 | 195 | def pkg_xl_dict_to_fs_dict(pkg_xl_dict): 196 | return spreadsheet_importer.SpreadsheetPackageImporter.pkg_xl_dict_to_fs_dict(pkg_xl_dict) 197 | -------------------------------------------------------------------------------- /ckanext/importlib/spreadsheet_importer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import copy 3 | 4 | from sqlalchemy.util import OrderedDict 5 | 6 | import ckan.model as model 7 | from importer import * 8 | 9 | readonly_keys = ('id', 'revision_id', 10 | 'relationships', 11 | 'license', 12 | 'ratings_average', 'ratings_count', 13 | 'ckan_url', 14 | 'metadata_modified', 15 | 'metadata_created', 16 | 'notes_rendered') 17 | 18 | class SpreadsheetData(object): 19 | '''Represents a spreadsheet file which you can access row by row.''' 20 | def __init__(self, logger, filepath=None, buf=None): 21 | assert filepath or buf 22 | assert not (filepath and buf) 23 | self._logger = logger 24 | self._rows = [] 25 | 26 | def get_row(self, row_index): 27 | 'Returns a list of the cells in unicode format.' 28 | raise NotImplementedError 29 | 30 | def get_num_rows(self): 31 | 'Returns the number of rows in the sheet.' 32 | raise NotImplementedError 33 | 34 | def get_all_rows(self): 35 | 'A crude way to get all the rows at once.' 
36 | return [self.get_row(i) for i in range(self.get_num_rows())] 37 | 38 | 39 | class CsvData(SpreadsheetData): 40 | def __init__(self, logger, filepath=None, buf=None): 41 | super(CsvData, self).__init__(logger, filepath, buf) 42 | if 1: 43 | if filepath: 44 | csvfile = open(filepath) 45 | if not csvfile: 46 | raise ImportException('Could not open file \'%s\'.' % filepath) 47 | csv_snippet = csvfile.read(1024) 48 | elif buf: 49 | csvfile = buf.split('\n') 50 | if not csvfile: 51 | raise ImportException('Empty csv data.') 52 | csv_snippet = buf[:1024] 53 | try: 54 | dialect = csv.Sniffer().sniff(csv_snippet) 55 | dialect.doublequote = True # sniff doesn't seem to pick this up 56 | except csv.Error, inst: 57 | dialect = None 58 | if filepath: 59 | csvfile.seek(0) 60 | try: 61 | reader = csv.reader(csvfile, dialect) 62 | except TypeError, inst: 63 | raise ImportException('CSV file read error: %s' % inst) 64 | 65 | try: 66 | for line in reader: 67 | self._rows.append(line) 68 | except csv.Error, inst: 69 | raise ImportException('CSV file corrupt: %s' % inst) 70 | self._num_rows = len(self._rows) 71 | if self._num_rows < 2: 72 | raise ImportException('Not enough rows') 73 | 74 | def get_num_sheets(self): 75 | return 1 76 | 77 | def get_row(self, row_index): 78 | row = self._rows[row_index] 79 | return [cell.decode('utf8') for cell in row] 80 | 81 | def get_num_rows(self): 82 | return self._num_rows 83 | 84 | 85 | class XlData(SpreadsheetData): 86 | '''Spreadsheet data in Excel format. 87 | NB Cells with no value return None rather than u''. 88 | @param sheet_index - if None, warn if more than 1 sheet in workbook. 
89 | ''' 90 | def __init__(self, logger, filepath=None, buf=None, sheet_index=None): 91 | super(XlData, self).__init__(logger, filepath, buf) 92 | import xlrd 93 | 94 | try: 95 | if filepath: 96 | self._book = xlrd.open_workbook(filepath) 97 | elif buf: 98 | self._book = xlrd.open_workbook(file_contents=buf) 99 | except xlrd.XLRDError, e: 100 | raise ImportException('Could not open workbook: %r' % e) 101 | 102 | if sheet_index == None: 103 | if self.get_num_sheets() != 1: 104 | logger.log.append('Warning: Just importing from sheet %r' % self._book.sheet_by_index(0).name) 105 | sheet_index = 0 106 | self.sheet = self._book.sheet_by_index(sheet_index) 107 | 108 | def get_num_sheets(self): 109 | return self._book.nsheets 110 | 111 | def get_sheet_names(self): 112 | return self._book.sheet_names() 113 | 114 | def get_data_by_sheet(self): 115 | data_list = [] 116 | for sheet_index in range(self.get_num_sheets()): 117 | data = copy.deepcopy(self) 118 | data.sheet = self._book.sheet_by_index(sheet_index) 119 | data_list.append(data) 120 | return data_list 121 | 122 | def get_row(self, row_index): 123 | import xlrd 124 | row = self.sheet.row(row_index) 125 | row_values = [] 126 | for cell in row: 127 | value = None 128 | if cell.ctype == xlrd.XL_CELL_TEXT: 129 | value = cell.value 130 | elif cell.ctype == xlrd.XL_CELL_NUMBER: 131 | if cell.value == int(cell.value): 132 | value = int(cell.value) 133 | else: 134 | value = cell.value 135 | elif cell.ctype == xlrd.XL_CELL_DATE: 136 | date_tuple = xlrd.xldate_as_tuple(cell.value, self._book.datemode) 137 | value = datetime.date(*date_tuple[:3]) 138 | elif cell.ctype == xlrd.XL_CELL_EMPTY: 139 | value = None 140 | else: 141 | raise ImportException, 'Unknown cell type: %s' % cell.ctype 142 | row_values.append(value) 143 | return row_values 144 | 145 | def get_num_rows(self): 146 | return self.sheet.nrows 147 | 148 | 149 | class SpreadsheetDataRecords(DataRecords): 150 | '''Takes SpreadsheetData and converts it its titles and 151 
| data records. Handles title rows and filters out rows of rubbish. 152 | ''' 153 | def __init__(self, spreadsheet_data, essential_title): 154 | assert isinstance(spreadsheet_data, SpreadsheetData), spreadsheet_data 155 | self._data = spreadsheet_data 156 | # find titles row 157 | self.titles, last_titles_row_index = self.find_titles(essential_title) 158 | self._first_record_row = self.find_first_record_row(last_titles_row_index + 1) 159 | 160 | def find_titles(self, essential_title): 161 | row_index = 0 162 | titles = [] 163 | essential_title_lower = essential_title.lower() 164 | while True: 165 | if row_index >= self._data.get_num_rows(): 166 | raise ImportException('Could not find title row') 167 | row = self._data.get_row(row_index) 168 | if essential_title in row or essential_title_lower in row: 169 | for row_val in row: 170 | titles.append(row_val.strip() if isinstance(row_val, basestring) else None) 171 | return (titles, row_index) 172 | row_index += 1 173 | 174 | def find_first_record_row(self, row_index_to_start_looking): 175 | row_index = row_index_to_start_looking 176 | while True: 177 | if row_index >= self._data.get_num_rows(): 178 | raise ImportException('Could not find first record row') 179 | row = self._data.get_row(row_index) 180 | if not (u'<< Datasets Displayed Below' in row or\ 181 | row[:5] == [None, None, None, None, None] or\ 182 | row[:5] == ['', '', '', '', '']\ 183 | ): 184 | return row_index 185 | row_index += 1 186 | 187 | @property 188 | def records(self): 189 | '''Returns each record as a dict.''' 190 | for row_index in range(self._first_record_row, self._data.get_num_rows()): 191 | row = self._data.get_row(row_index) 192 | row_has_content = False 193 | for cell in row: 194 | if cell: 195 | row_has_content = True 196 | break 197 | if row_has_content: 198 | record_dict = OrderedDict(zip(self.titles, row)) 199 | if record_dict.has_key(None): 200 | del record_dict[None] 201 | yield record_dict 202 | 203 | 204 | class 
SpreadsheetPackageImporter(PackageImporter): 205 | '''From a filepath of an Excel or csv file, extracts package 206 | dictionaries.''' 207 | def __init__(self, record_params=None, record_class=SpreadsheetDataRecords, **kwargs): 208 | self._record_params = record_params if record_params != None else ['Title'] 209 | self._record_class = record_class 210 | super(SpreadsheetPackageImporter, self).__init__(**kwargs) 211 | 212 | def import_into_package_records(self): 213 | try: 214 | package_data = CsvData(self.log, filepath=self._filepath, 215 | buf=self._buf) 216 | except ImportException: 217 | package_data = XlData(self.log, filepath=self._filepath, 218 | buf=self._buf, sheet_index=0) 219 | if package_data.get_num_sheets() > 1: 220 | package_data = [XlData(self.log, filepath=self._filepath, 221 | buf=self._buf, sheet_index=i) for i in range(package_data.get_num_sheets())] 222 | self._package_data_records = MultipleSpreadsheetDataRecords( 223 | data_list=package_data, 224 | record_params=self._record_params, 225 | record_class=self._record_class) 226 | 227 | def record_2_package(self, row_dict): 228 | pkg_dict = self.pkg_xl_dict_to_fs_dict(row_dict, self.log) 229 | return pkg_dict 230 | 231 | @classmethod 232 | def pkg_xl_dict_to_fs_dict(cls, pkg_xl_dict, logger=None): 233 | '''Convert a Package represented in an Excel-type dictionary to a 234 | dictionary suitable for fieldset data. 
235 | Takes Excel-type dict: 236 | {'name':'wikipedia', 237 | 'resource-0-url':'http://static.wikipedia.org/'} 238 | Returns Fieldset-type dict: 239 | {'name':'wikipedia', 240 | 'resources':[{'url':'http://static.wikipedia.org/'}]} 241 | ''' 242 | import ckan.forms 243 | standard_fields = model.Package.get_fields() 244 | 245 | pkg_fs_dict = OrderedDict() 246 | for title, cell in pkg_xl_dict.items(): 247 | if cell: 248 | if title in standard_fields: 249 | pkg_fs_dict[title] = cell 250 | elif title == 'license': 251 | license_id = cls.license_2_license_id(cell) 252 | if license: 253 | pkg_fs_dict['license_id'] = license_id 254 | else: 255 | logger('Warning: No license name matches \'%s\'. Ignoring license.' % cell) 256 | elif title.startswith('resource-'): 257 | match = re.match('resource-(\d+)-(\w+)', title) 258 | if match: 259 | res_index, field = match.groups() 260 | res_index = int(res_index) 261 | field = str(field) 262 | if not pkg_fs_dict.has_key('resources'): 263 | pkg_fs_dict['resources'] = [] 264 | resources = pkg_fs_dict['resources'] 265 | num_new_resources = 1 + res_index - len(resources) 266 | for i in range(num_new_resources): 267 | blank_dict = OrderedDict() 268 | for blank_field in model.Resource.get_columns(): 269 | blank_dict[blank_field] = u'' 270 | pkg_fs_dict['resources'].append(blank_dict) 271 | 272 | pkg_fs_dict['resources'][res_index][field] = cell 273 | else: 274 | logger('Warning: Could not understand resource title \'%s\'. 
Ignoring value: %s' % (title, cell)) 275 | elif title.startswith('relationships'): 276 | # TODO 277 | pass 278 | elif title == 'download_url': 279 | # deprecated - only in there for compatibility 280 | pass 281 | elif title in readonly_keys: 282 | pass 283 | else: 284 | if not pkg_fs_dict.has_key('extras'): 285 | pkg_fs_dict['extras'] = {} 286 | pkg_fs_dict['extras'][title] = cell 287 | return pkg_fs_dict 288 | 289 | 290 | class MultipleSpreadsheetDataRecords(DataRecords): 291 | '''Takes several SpreadsheetData objects and returns records for all 292 | of them combined. 293 | ''' 294 | def __init__(self, data_list, record_params, record_class=SpreadsheetDataRecords): 295 | self.records_list = [] 296 | if not isinstance(data_list, (list, tuple)): 297 | data_list = [data_list] 298 | for data in data_list: 299 | self.records_list.append(record_class(data, *record_params)) 300 | 301 | @property 302 | def records(self): 303 | for spreadsheet_records in self.records_list: 304 | for spreadsheet_record in spreadsheet_records.records: 305 | yield spreadsheet_record 306 | 307 | 308 | -------------------------------------------------------------------------------- /ckanext/importlib/loader.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Takes a package dictionary and loads into CKAN via the API. 3 | Checks to see if it already exists by name and preferably a unique field in 4 | the extras too. 5 | Uses ckanclient. 6 | ''' 7 | import re 8 | import copy 9 | from traceback import format_exc 10 | from pprint import pformat 11 | import itertools 12 | 13 | from ckanclient import CkanApiError, CkanApiNotAuthorizedError 14 | 15 | PACKAGE_NAME_MAX_LENGTH = 100 # this should match with ckan/model/package.py 16 | # but we avoid requiring ckan in this loader. 
17 | 18 | ACTIVE = 'active' # should match ckan.model.ACTIVE 19 | 20 | log = __import__("logging").getLogger(__name__) 21 | 22 | class LoaderError(Exception): 23 | pass 24 | 25 | class PackageLoader(object): 26 | def __init__(self, ckanclient, stats=None): 27 | ''' 28 | Loader for packages into a CKAN server. Takes package dictionaries 29 | and loads them using the ckanclient. Can also add packages to a 30 | specified group. 31 | 32 | It checks to see if a package of the same name is already on the 33 | CKAN server and if so, updates it with the new info. Create a subclass 34 | implementing _find_package, which determines how an existing package 35 | is discovered. 36 | 37 | @param ckanclient - ckanclient object, which contains the 38 | connection to CKAN server 39 | ''' 40 | # Note: we pass in the ckanclient (rather than deriving from it), so 41 | # that we can choose to pass a test client instead of a real one. 42 | self.ckanclient = ckanclient 43 | self._stats = stats 44 | 45 | def load_package(self, pkg_dict): 46 | ''' 47 | May raise LoaderError or CkanApiNotAuthorizedError (which implies API 48 | key is wrong, so stop). 49 | ''' 50 | 51 | log.info('..Loading "%s"' % pkg_dict['name']) 52 | 53 | # see if the package is already there 54 | existing_pkg_name, existing_pkg = self._find_package(pkg_dict) 55 | log.debug('Check for dataset already existing: %s', existing_pkg_name) 56 | 57 | # if creating a new package, check the name is available 58 | if not existing_pkg_name: 59 | self._ensure_pkg_name_is_available(pkg_dict) 60 | 61 | # write package 62 | # (May raise LoaderError or CkanApiNotAuthorizedError) 63 | pkg_dict = self._write_package(pkg_dict, existing_pkg_name, existing_pkg) 64 | pkg_dict = self.ckanclient.last_message 65 | 66 | log.debug('Package written: %s %r', pkg_dict['name'], pkg_dict) 67 | return pkg_dict 68 | 69 | def load_packages(self, pkg_dicts): 70 | '''Loads multiple packages. 71 | 72 | @return results and resulting package names/ids. 
73 | ''' 74 | num_errors = 0 75 | num_loaded = 0 76 | pkg_ids = [] 77 | pkg_names = [] 78 | for pkg_dict in pkg_dicts: 79 | try: 80 | pkg_dict = self.load_package(pkg_dict) 81 | except CkanApiNotAuthorizedError, e: 82 | log.error('Authorization Error (fatal) loading dict "%s":\n%s' % (pkg_dict['name'], format_exc())) 83 | num_errors = 'fatal' 84 | self._add_stat('Authorization Error %s' % e, pkg_dict) 85 | break 86 | except LoaderError, e: 87 | log.error('Error loading dict "%s":\n%s' % (pkg_dict['name'], format_exc())) 88 | num_errors += 1 89 | self._add_stat('Error %s' % e, pkg_dict) 90 | else: 91 | pkg_ids.append(pkg_dict['id']) 92 | pkg_names.append(pkg_dict['name']) 93 | num_loaded += 1 94 | return {'pkg_names':pkg_names, 95 | 'pkg_ids':pkg_ids, 96 | 'num_loaded':num_loaded, 97 | 'num_errors':num_errors} 98 | 99 | def _add_stat(self, message, pkg_dict): 100 | if not self._stats: 101 | return 102 | pub_date = pkg_dict.get('extras', {}).get('date_released') 103 | item_id = '%s (%s)' % (pkg_dict['title'], pub_date) 104 | return self._stats.add(message, item_id) 105 | 106 | def _find_package(self, pkg_dict): 107 | raise NotImplemented 108 | 109 | def _write_package(self, pkg_dict, existing_pkg_name, existing_pkg=None): 110 | ''' 111 | Writes a package (pkg_dict). If there is an existing package to 112 | be changed, then supply existing_pkg_name. If the caller has already 113 | got the existing package then pass it in, to save getting it twice. 114 | 115 | @return pkg_dict - the package as it was written 116 | 117 | May raise LoaderError or CkanApiNotAuthorizedError (which implies API 118 | key is wrong, so stop). 
119 | ''' 120 | if existing_pkg_name: 121 | if not existing_pkg: 122 | existing_pkg = self._get_package(existing_pkg_name) 123 | if existing_pkg_name != pkg_dict["name"]: 124 | pkg_dict = pkg_dict.copy() 125 | pkg_dict["name"] = existing_pkg_name 126 | if self._pkg_has_changed(existing_pkg, pkg_dict): 127 | log.info('..Updating existing package') 128 | try: 129 | self.ckanclient.package_entity_put(pkg_dict) 130 | except CkanApiError: 131 | raise LoaderError( 132 | 'Error (%s) editing package over API: %s' % \ 133 | (self.ckanclient.last_status, 134 | self.ckanclient.last_message)) 135 | pkg_dict = self.ckanclient.last_message 136 | self._add_stat('Updated package', pkg_dict) 137 | else: 138 | log.info('..No change') 139 | self._add_stat('No change', pkg_dict) 140 | else: 141 | log.info('..Creating package') 142 | try: 143 | self.ckanclient.package_register_post(pkg_dict) 144 | except CkanApiNotAuthorizedError: 145 | raise 146 | except CkanApiError: 147 | raise LoaderError( 148 | 'Error (%s) creating package over API: %s' % \ 149 | (self.ckanclient.last_status, 150 | self.ckanclient.last_message)) 151 | pkg_dict = self.ckanclient.last_message 152 | self._add_stat('Created package', pkg_dict) 153 | return pkg_dict 154 | 155 | def add_pkg_to_group(self, pkg_name, group_name): 156 | return self.add_pkgs_to_group([pkg_name], group_name) 157 | 158 | def add_pkgs_to_group(self, pkg_names, group_name): 159 | for pkg_name in pkg_names: 160 | assert not self.ckanclient.is_id(pkg_name), pkg_name 161 | assert not self.ckanclient.is_id(group_name), group_name 162 | try: 163 | group_dict = self.ckanclient.group_entity_get(group_name) 164 | except CkanApiError, e: 165 | if self.ckanclient.last_status == 404: 166 | raise LoaderError('Group named %r does not exist' % group_name) 167 | else: 168 | raise LoaderError('Unexpected status (%s) checking for group name %r: %r') % (self.ckanclient.last_status, group_name, group_dict) 169 | group_dict['packages'] = (group_dict['packages'] or 
[]) + pkg_names 170 | try: 171 | group_dict = self.ckanclient.group_entity_put(group_dict) 172 | except CkanApiError, e: 173 | raise LoaderError('Unexpected status %s writing to group \'%s\': %r' % (self.ckanclient.last_status, group_dict, e.args)) 174 | 175 | def _get_package(self, pkg_name): 176 | try: 177 | pkg = self.ckanclient.package_entity_get(pkg_name) 178 | except CkanApiError, e: 179 | if self.ckanclient.last_status == 404: 180 | pkg = None 181 | else: 182 | raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (self.ckanclient.last_status, pkg_name, e.args)) 183 | return pkg 184 | 185 | def _find_package_by_fields(self, field_keys, pkg_dict): 186 | '''Looks for a package that has matching keys to the pkg supplied. 187 | Requires a unique match or it raises LoaderError. 188 | @return (pkg_name, pkg) - pkg_name - the name of the matching 189 | package or None if there is none. 190 | pkg - the matching package dict if it 191 | happens to have been requested, 192 | otherwise None 193 | ''' 194 | if field_keys == ['name']: 195 | pkg = self._get_package(pkg_dict['name']) 196 | pkg_name = pkg_dict['name'] if pkg else None 197 | else: 198 | search_options = self._get_search_options(field_keys, pkg_dict) 199 | pkg_name, pkg = self._find_package_by_options(search_options) 200 | 201 | if not pkg_name: 202 | # Just in case search is not being well indexed, look for the 203 | # package under its name as well 204 | try_pkg_name = pkg_dict['name'] 205 | pkg = self._get_package(try_pkg_name) 206 | while pkg: 207 | if self._pkg_matches_search_options(pkg, search_options): 208 | log.warn('Search failed to find package %r with ref %r, ' 209 | 'but luckily the name is what was expected so loader ' 210 | 'found it anyway.' 
% (pkg_dict['name'], search_options)) 211 | pkg_name = try_pkg_name 212 | break 213 | try_pkg_name += '_' 214 | pkg = self._get_package(try_pkg_name) 215 | else: 216 | pkg_name = pkg = None 217 | 218 | log.info('..Search for existing package found: %r with filter: %r', 219 | pkg_name, search_options) 220 | return pkg_name, pkg 221 | 222 | def _get_search_options(self, field_keys, pkg_dict): 223 | search_options = {} 224 | has_a_value = False 225 | for field_key in field_keys: 226 | field_value = pkg_dict.get(field_key) or (pkg_dict['extras'].get(field_key) if pkg_dict.has_key('extras') else None) 227 | ## else: 228 | ## # This is how solr searches for blank values 229 | ## # http://stackoverflow.com/questions/4238609/how-to-query-solr-for-empty-fields 230 | ## #search_options['-%s' % field_key] = u'["" TO *]' 231 | ## search_options['q'] = u'-%s:["" TO *]' % field_key 232 | if field_value: 233 | if isinstance(field_value, list): 234 | for value in field_value: 235 | search_options[field_key] = value or u'' 236 | else: 237 | search_options[field_key] = field_value or u'' 238 | has_a_value = True 239 | if not has_a_value: 240 | raise LoaderError('Package %r has blank values for identifying fields: %r' % (pkg_dict['name'], field_keys)) 241 | return search_options 242 | 243 | def _package_search(self, search_options): 244 | try: 245 | res = self.ckanclient.package_search(q='', search_options=search_options) 246 | except CkanApiError, e: 247 | raise LoaderError('Search request failed (status %s): %r' % (self.ckanclient.last_status, e.args)) 248 | return res 249 | 250 | def _find_package_by_options(self, search_options): 251 | '''The search_options specify values a package must have and this 252 | returns the package. 253 | 254 | If more than one package matching then it logs an error but returns 255 | the first one as we prefer to save the data to one, rather than 256 | lose it. 257 | 258 | If none match then it returns (None, None). 
259 | 260 | A successful search returns (pkg_name, pkg) where pkg may be None, 261 | or returned filled, as a convenience. 262 | 263 | ''' 264 | search = self._package_search(search_options) 265 | # Search doesn't do exact match (e.g. sql search searches *in* 266 | # a field), so check matches thoroughly. 267 | # Also check the package is active 268 | exactly_matching_pkg_names = [] 269 | pkg = None 270 | for pkg_ref in search['results']: 271 | pkg = self._get_package(pkg_ref) 272 | if pkg['state'] == ACTIVE and \ 273 | self._pkg_matches_search_options(pkg, search_options): 274 | exactly_matching_pkg_names.append(pkg["name"]) 275 | if len(exactly_matching_pkg_names) > 1: 276 | log.error('More than one record matches the search options %r: %r (so picking the first one)' % (search_options, exactly_matching_pkg_names)) 277 | pkg_name = exactly_matching_pkg_names[0] 278 | elif len(exactly_matching_pkg_names) == 1: 279 | pkg_name = exactly_matching_pkg_names[0] 280 | else: 281 | pkg_name = None 282 | # Only carry through value for pkg if it was the last one and only 283 | # one fetched 284 | if not(search['count'] == 1 and pkg and pkg['name'] == pkg_name): 285 | pkg = None 286 | return pkg_name, pkg 287 | 288 | def _ensure_pkg_name_is_available(self, pkg_dict): 289 | '''Checks the CKAN db to see if the name for this package has been 290 | already taken, and if so, changes the pkg_dict to have another 291 | name that is free. 
292 | @return nothing - changes the name in the pkg_dict itself 293 | ''' 294 | preferred_name = pkg_dict['name'] 295 | clashing_pkg = self._get_package(pkg_dict['name']) 296 | original_clashing_pkg = clashing_pkg 297 | while clashing_pkg: 298 | if len(pkg_dict['name']) >= PACKAGE_NAME_MAX_LENGTH: 299 | new_name = pkg_dict['name'].rstrip('_')[:-1] 300 | new_name = new_name.ljust(PACKAGE_NAME_MAX_LENGTH, '_') 301 | pkg_dict['name'] = new_name 302 | else: 303 | pkg_dict['name'] += '_' 304 | clashing_pkg = self._get_package(pkg_dict['name']) 305 | 306 | if pkg_dict['name'] != preferred_name: 307 | log.warn('Name %r already exists so new package renamed ' 308 | 'to %r.' % (preferred_name, pkg_dict['name'])) 309 | else: 310 | log.debug('Name %r available', pkg_dict['name']) 311 | 312 | def _pkg_has_changed(self, existing_value, value): 313 | changed = False 314 | if isinstance(value, dict): 315 | for key, sub_value in value.items(): 316 | if key in ('owner_org', 'import_source'): 317 | # loader doesn't setup groups 318 | # import_source changing alone doesn't require an update 319 | continue 320 | existing_sub_value = existing_value.get(key) 321 | if self._pkg_has_changed(existing_sub_value, sub_value): 322 | changed = True 323 | break 324 | elif isinstance(value, list) and \ 325 | isinstance(existing_value, list): 326 | if len(existing_value) != len(value): 327 | changed = True 328 | else: 329 | for i, sub_value in enumerate(value): 330 | if self._pkg_has_changed(existing_value[i], sub_value): 331 | changed = True 332 | break 333 | elif (existing_value or None) != (value or None): 334 | changed = True 335 | 336 | if changed: 337 | return True 338 | return False 339 | 340 | def lower(self, value): 341 | '''If given a string, returns lowercase version of it. 342 | Blank strings and None values are standardized on None. 343 | 344 | This is allowed for matching, because SOLR search returns values for 345 | either case. 
346 | ''' 347 | if isinstance(value, basestring): 348 | value = value.lower().strip() 349 | if not value: 350 | return None 351 | return value 352 | 353 | def _pkg_matches_search_options(self, pkg_dict, search_options): 354 | '''Returns True if pkg_dict matches all of the search_options.''' 355 | matches = True 356 | for key, value in search_options.items(): 357 | pkg_dict_value = pkg_dict.get(key) or pkg_dict['extras'].get(key) 358 | 359 | if isinstance(pkg_dict_value, list): 360 | # e.g. must have the tag or be in that group 361 | if value and self.lower(value) not in \ 362 | [self.lower(val) for val in pkg_dict_value]: 363 | matches = False 364 | log.info('Match failed %s on field %s=%r but should have included %r', 365 | pkg_dict['name'], key, pkg_dict_value, value) 366 | break 367 | else: 368 | if self.lower(pkg_dict_value) != self.lower(value): 369 | matches = False 370 | log.info('Match failed %s on field %s=%r but should be %r', 371 | pkg_dict['name'], key, pkg_dict_value, value) 372 | break 373 | return matches 374 | 375 | class ReplaceByNameLoader(PackageLoader): 376 | '''Loader finds a package based on its name. 377 | Load replaces the package with the supplied pkg_dict.''' 378 | 379 | def _find_package(self, pkg_dict): 380 | find_pkg_by_keys = ['name'] 381 | return self._find_package_by_fields(find_pkg_by_keys, pkg_dict) 382 | 383 | class ReplaceByExtraFieldLoader(PackageLoader): 384 | '''Loader finds a package based on a unique id in an extra field. 
385 | Loader replaces the package with the supplied pkg_dict.''' 386 | def __init__(self, ckanclient, package_id_extra_key, stats=None): 387 | super(ReplaceByExtraFieldLoader, self).__init__(ckanclient, stats) 388 | assert package_id_extra_key 389 | self.package_id_extra_key = package_id_extra_key 390 | 391 | def _find_package(self, pkg_dict): 392 | find_pkg_by_keys = [self.package_id_extra_key] 393 | return self._find_package_by_fields(find_pkg_by_keys, pkg_dict) 394 | 395 | class ResourceSeriesLoader(PackageLoader): 396 | '''Loader finds package based on a specified field and checks to see 397 | if most fields (listed in field_keys_to_expect_invariant) match the 398 | pkg_dict. Loader then inserts the resources in the pkg_dict into 399 | the package and updates any fields that have changed (e.g. last_updated). 400 | It checks to see if the particular resource is already in the package 401 | by a custom resource ID which is contained in the description field, 402 | as a word containing the given prefix. 403 | @param synonyms - a list of tuples describing values of a field that 404 | should be regarded as equal, for when searching for 405 | an existing package. 406 | e.g. {'department': [('DfE', 'DCSF'), ('DCLG', 'CLG')]} 407 | means resources for the department DfE would be inserted 408 | into a package which still had the old deparment name 409 | of DCSF (and the same for CLG and GCLG). 
410 | ''' 411 | def __init__(self, ckanclient, 412 | field_keys_to_find_pkg_by, 413 | field_keys_to_expect_invariant=None, 414 | synonyms=None, 415 | extras_to_not_overwrite=None, 416 | stats=None): 417 | super(ResourceSeriesLoader, self).__init__(ckanclient, stats=stats) 418 | assert field_keys_to_find_pkg_by 419 | assert isinstance(field_keys_to_find_pkg_by, (list, tuple)) 420 | self.field_keys_to_find_pkg_by = field_keys_to_find_pkg_by 421 | self.field_keys_to_expect_invariant = field_keys_to_expect_invariant \ 422 | or [] 423 | self.synonyms = synonyms or {} 424 | self.extras_to_not_overwrite = extras_to_not_overwrite or [] 425 | 426 | def _find_package(self, pkg_dict): 427 | # take a copy of the keys since the find routine may change them 428 | find_pkg_by_keys = self.field_keys_to_find_pkg_by[:] 429 | return self._find_package_by_fields(find_pkg_by_keys, pkg_dict) 430 | 431 | def _get_search_options(self, field_keys, pkg_dict): 432 | search_options = super(ResourceSeriesLoader, self)._get_search_options(field_keys, pkg_dict) 433 | # now take account of the synonyms to search for 434 | search_options_list = [search_options] 435 | for field_key, field_value in search_options.items(): 436 | if field_key in self.synonyms: 437 | for synonym_list in self.synonyms[field_key]: 438 | if field_value in synonym_list: 439 | alt_field_values = list(synonym_list) 440 | alt_field_values.remove(field_value) 441 | for opts in search_options_list[:]: 442 | for alt_field_value in alt_field_values: 443 | alt_opts = opts.copy() 444 | alt_opts[field_key] = alt_field_value 445 | search_options_list.append(alt_opts) 446 | return search_options_list 447 | 448 | def _package_search(self, search_options_list): 449 | try: 450 | result_count = 0 451 | result_generators = [] 452 | for search_options in search_options_list: 453 | res = self.ckanclient.package_search(q='', search_options=search_options) 454 | result_count += res['count'] 455 | result_generators.append(res['results']) 456 | 
except CkanApiError, e: 457 | raise LoaderError('Search request failed (status %s): %r' % (self.ckanclient.last_status, e.args)) 458 | return {'count': result_count, 459 | 'results': itertools.chain(*result_generators)} 460 | 461 | def _pkg_matches_search_options(self, pkg_dict, search_options_list): 462 | '''Returns True if pkg_dict matches any of the search_options 463 | listed.''' 464 | matches = False 465 | for search_options in search_options_list: 466 | if super(ResourceSeriesLoader, self)._pkg_matches_search_options(pkg_dict, search_options): 467 | matches = True 468 | break 469 | return matches 470 | 471 | def _write_package(self, pkg_dict, existing_pkg_name, existing_pkg=None): 472 | ''' 473 | Writes a package (pkg_dict). If there is an existing package to 474 | be changed, then supply existing_pkg_name. If the caller has already 475 | got the existing package then pass it in, to save getting it twice. 476 | 477 | May raise LoaderError or CkanApiNotAuthorizedError (which implies API 478 | key is wrong, so stop). 
479 | ''' 480 | if existing_pkg_name: 481 | if not existing_pkg: 482 | existing_pkg = self._get_package(existing_pkg_name) 483 | try: 484 | pkg_dict = self._merge_resources(existing_pkg, pkg_dict) 485 | except Exception, e: 486 | raise LoaderError('Could not merge resources.\n' 487 | ' existing_pkg: %r\n' 488 | ' pkg_dict: %r\n' 489 | ' Exception: %s'% (existing_pkg, pkg_dict, e)) 490 | if self.extras_to_not_overwrite and \ 491 | self.extras_to_not_overwrite == ['theme-primary', 'themes-secondary']: 492 | if existing_pkg and existing_pkg['extras'].get('theme-primary'): 493 | pkg_dict['extras']['theme-primary'] = existing_pkg['extras']['theme-primary'] 494 | pkg_dict['extras']['themes-secondary'] = existing_pkg['extras'].get('themes-secondary') 495 | super(ResourceSeriesLoader, self)._write_package(pkg_dict, 496 | existing_pkg_name, 497 | existing_pkg) 498 | 499 | def _merge_resources(self, existing_pkg, pkg): 500 | '''Takes an existing_pkg and merges in resources from the pkg. 501 | ''' 502 | log.info("..Merging resources into %s" % existing_pkg["name"]) 503 | log.debug("....Existing resources:\n%s" % pformat(existing_pkg["resources"])) 504 | log.debug("....New resources:\n%s" % pformat(pkg["resources"])) 505 | 506 | # check invariant fields aren't different 507 | warnings = [] 508 | for key in self.field_keys_to_expect_invariant: 509 | if key in existing_pkg or key in pkg: 510 | if (existing_pkg.get(key) or None) != (pkg.get(key) or None): 511 | warnings.append('%s: %r -> %r' % (key, existing_pkg.get(key), pkg.get(key))) 512 | else: 513 | if (existing_pkg['extras'].get(key) or None) != (pkg['extras'].get(key) or None): 514 | warnings.append('%s: %r -> %r' % (key, existing_pkg['extras'].get(key), pkg['extras'].get(key))) 515 | 516 | if warnings: 517 | log.warn('Warning: uploading package \'%s\' and surprised to see ' 518 | 'changes in these values:\n%s' % (existing_pkg['name'], 519 | '; '.join(warnings))) 520 | 521 | # copy over all fields but use the existing 
resources 522 | merged_dict = pkg.copy() 523 | merged_dict['resources'] = copy.deepcopy(existing_pkg['resources']) 524 | 525 | # merge resources 526 | for pkg_res in pkg['resources']: 527 | # look for resource ID already being there 528 | pkg_res_id = self._get_resource_id(pkg_res) 529 | for i, existing_res in enumerate(merged_dict['resources']): 530 | res_id = self._get_resource_id(existing_res) 531 | if res_id == pkg_res_id: 532 | # edit existing resource 533 | merged_dict['resources'][i] = pkg_res 534 | break 535 | else: 536 | # insert new res 537 | merged_dict['resources'].append(pkg_res) 538 | 539 | log.debug("....Merged resources:\n%s" % pformat(merged_dict["resources"])) 540 | 541 | return merged_dict 542 | 543 | def _get_resource_id(self, res): 544 | raise NotImplementedError 545 | -------------------------------------------------------------------------------- /ckanext/importlib/tests/test_loader.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import time 3 | 4 | from sqlalchemy.util import OrderedDict 5 | from nose.tools import assert_equal 6 | 7 | from ckan import model 8 | from ckan.lib.create_test_data import CreateTestData 9 | from ckan.tests import * 10 | from ckan.tests import CreateTestData, TestSearchIndexer, is_search_supported 11 | from ckan.tests.wsgi_ckanclient import WsgiCkanClient 12 | from ckanclient import CkanClient 13 | from ckanext.importlib.loader import ReplaceByNameLoader, ReplaceByExtraFieldLoader, ResourceSeriesLoader, LoaderError 14 | 15 | USER = u'annafan' 16 | 17 | # Set to true for quicker tests using wsgi_ckanclient 18 | # otherwise it uses ckanclient 19 | # (some tests still fail with ckanclient currently) 20 | WSGI_CLIENT = True 21 | 22 | #TODO: test log statements 23 | 24 | def count_pkgs(): 25 | return model.Session.query(model.Package).count() 26 | 27 | class TestLoaderBase(TestController): 28 | @classmethod 29 | def setup_class(self): 30 | if 
hasattr(super(TestLoaderBase, self), 'setup_class'): 31 | super(TestLoaderBase, self).setup_class() 32 | CreateTestData.create_arbitrary([], extra_user_names=[USER]) 33 | user = model.User.by_name(USER) 34 | assert user 35 | if WSGI_CLIENT: 36 | self.testclient = WsgiCkanClient(self.app, api_key=user.apikey) 37 | else: 38 | self.sub_proc = self._start_ckan_server('test.ini') 39 | self.testclient = CkanClient(base_location='http://localhost:5000/api', 40 | api_key=user.apikey) 41 | self._wait_for_url(url='http://localhost:5000/api') 42 | 43 | 44 | @classmethod 45 | def teardown_class(self): 46 | if hasattr(super(TestLoaderBase, self), 'teardown_class'): 47 | super(TestLoaderBase, self).teardown_class() 48 | if WSGI_CLIENT: 49 | model.Session.remove() 50 | model.repo.rebuild_db() 51 | else: 52 | try: 53 | self._stop_ckan_server(self.sub_proc) 54 | finally: 55 | model.repo.rebuild_db() 56 | 57 | def assert_equal_dicts(dict1, dict2, only_assert_these_keys=None): 58 | only_assert_these_keys = set(only_assert_these_keys) if only_assert_these_keys else set([]) 59 | dict1_keys = set(dict1.keys()) & only_assert_these_keys 60 | dict2_keys = set(dict2.keys()) & only_assert_these_keys 61 | key_diffs = dict1_keys ^ dict2_keys 62 | if key_diffs: 63 | print '%i keys not in both dicts.' % len(key_diffs) 64 | print 'Only in dict1: %r' % (dict1_keys - dict2_keys) 65 | print 'Only in dict2: %r' % (dict2_keys - dict1_keys) 66 | print '\nDict1: %r\nDict2: %r' % \ 67 | (dict1, dict2) 68 | raise AssertionError 69 | for key in dict1_keys: 70 | if dict1[key] != dict2[key]: 71 | print 'Value for key %r is different. 
%r != %r' % \ 72 | (key, dict1[key], dict2[key]) 73 | raise AssertionError 74 | 75 | class TestLoader(TestLoaderBase): 76 | @classmethod 77 | def setup_class(self): 78 | super(TestLoader, self).setup_class() 79 | self.loader = ReplaceByNameLoader(self.testclient) 80 | 81 | # teardown is in the base class 82 | 83 | def test_0_simple_load(self): 84 | pkg_dict = {'name':u'pkgname', 85 | 'title':u'Boris'} 86 | assert not model.Package.by_name(pkg_dict['name']) 87 | CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name']]) 88 | res_pkg_dict = self.loader.load_package(pkg_dict) 89 | assert res_pkg_dict 90 | pkg = model.Package.by_name(pkg_dict['name']) 91 | assert_equal_dicts(res_pkg_dict, pkg.as_dict(), 92 | only_assert_these_keys=('name', 'title')) 93 | assert pkg 94 | assert pkg.name == pkg_dict['name'] 95 | assert pkg.title == pkg_dict['title'] 96 | 97 | def test_1_load_several(self): 98 | num_pkgs = count_pkgs() 99 | pkg_dicts = [{'name':u'pkgname_a', 100 | 'title':u'BorisA'}, 101 | {'name':u'pkgname_b', 102 | 'title':u'BorisB'}, 103 | ] 104 | assert not model.Package.by_name(pkg_dicts[0]['name']) 105 | CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name'] for pkg_dict in pkg_dicts]) 106 | res = self.loader.load_packages(pkg_dicts) 107 | assert (res['num_loaded'], res['num_errors']) == (2, 0), \ 108 | (res['num_loaded'], res['num_errors']) 109 | assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs) 110 | for pkg_index, pkg_dict in enumerate(pkg_dicts): 111 | pkg_name = pkg_dict['name'] 112 | pkg = model.Package.by_name(pkg_name) 113 | assert pkg.id == res['pkg_ids'][pkg_index], \ 114 | '%s != %s' % (pkg.id, res['pkg_ids'][pkg_index]) 115 | 116 | def test_1_load_several_with_errors(self): 117 | num_pkgs = count_pkgs() 118 | pkg_dicts = [{'name':u'pkgnameA', # not allowed uppercase name 119 | 'title':u'BorisA'}, 120 | {'name':u'pkgnameB', 121 | 'title':u'BorisB'}, 122 | ] 123 | assert not model.Package.by_name(pkg_dicts[0]['name']) 124 | 
        CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name'] for pkg_dict in pkg_dicts])
        res = self.loader.load_packages(pkg_dicts)
        # both packages rejected: nothing loaded, two errors
        assert (res['num_loaded'], res['num_errors']) == (0, 2), \
               (res['num_loaded'], res['num_errors'])
        assert count_pkgs() == num_pkgs, (count_pkgs() - num_pkgs)
        assert res['pkg_ids'] == [], res['pkg_ids']

    def test_2_reload(self):
        # load the package once
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pkgname2',
                    'title':u'Boris'}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name']])
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package again
        pkg_dict = {'name':u'pkgname2',
                    'title':u'Boris Becker'}
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title'], pkg.title
        # replaced in place - package count unchanged
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)


class TestLoaderUsingUniqueFields(TestLoaderBase):
    # Exercises ReplaceByExtraFieldLoader, which identifies an existing
    # package by the 'ref' extra rather than by name.
    @classmethod
    def setup_class(self):
        self.tsi = TestSearchIndexer()
        super(TestLoaderUsingUniqueFields, self).setup_class()
        self.loader = ReplaceByExtraFieldLoader(self.testclient, 'ref')

    # teardown is in the base class

    def test_0_reload(self):
        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris',
                    'extras':{u'ref':'boris'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict])
        self.tsi.index()
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package with same name and ref
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris 2',
                    'extras':{u'ref':'boris'}}
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package with different name, same ref
        pkg_dict = {'name':u'pkgname0changed',
                    'title':u'Boris 3',
                    'extras':{u'ref':'boris'}}
        CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name']])

        self.loader.load_package(pkg_dict)
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)
        # for now we do not support renaming
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg is None, pkg
        pkg = model.Package.by_name(u'pkgname0')
        assert pkg
        assert pkg.title == pkg_dict['title']

        # load the package with same name, different ref - new package
        other_pkg_dict = pkg_dict
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris 4',
                    'extras':{u'ref':'boris-4'}}
        CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name']])
        self.loader.load_package(pkg_dict)
        # NOTE: load_package mutates pkg_dict['name'] in place when it has
        # to dodge a name clash - hence the trailing underscore here
        assert pkg_dict['name'] == 'pkgname0_'
        orig_pkg = model.Package.by_name(u'pkgname0')
        assert orig_pkg
        assert orig_pkg.title == u'Boris 3'
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)

    def test_1_avoid_long_name_clash(self):
        # load the package once
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'a'*99,
                    'title':u'99 char name',
                    'extras':{u'ref':'aaa'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.flag_for_deletion(pkg_names=[pkg_dict['name']])
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load a clashing package - name appended '_'
        orig_pkg = pkg_dict
        pkg_dict = {'name':orig_pkg['name'],
                    'title':u'bbb',
                    'extras':{u'ref':'bbb'}}
        self.loader.load_package(pkg_dict)
        clash_name = u'a'*99 + u'_'
        pkg = model.Package.by_name(clash_name)
        assert pkg
        assert pkg.title == pkg_dict['title'], pkg.title
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)

        # load another clashing package - name over 100 chars so shortened
        # and finishes '__'
        orig_pkg = pkg_dict
        pkg_dict = {'name':orig_pkg['name'],
                    'title':u'ccc',
                    'extras':{u'ref':'ccc'}}
        self.loader.load_package(pkg_dict)
        clash_name = u'a'*98 + u'__'
        assert pkg_dict['name'] == clash_name, (pkg_dict['name'], clash_name)
        pkg = model.Package.by_name(clash_name)
        assert pkg
        assert pkg.title == pkg_dict['title'], pkg.title
        assert count_pkgs() == num_pkgs + 3, (count_pkgs() - num_pkgs)


class TestLoaderNoSearch(TestLoaderBase):
    '''Cope as best as possible if search indexing is flakey.'''
    @classmethod
    def setup_class(self):
        '''NB, no search indexing started'''
        if not is_search_supported():
            raise SkipTest("Search not supported")
        super(TestLoaderNoSearch, self).setup_class()
        self.loader = ReplaceByExtraFieldLoader(self.testclient, 'ref')

    # teardown is in the base class

    def test_0_reload(self):
        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris',
                    'extras':{u'ref':'boris'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict])
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package with same name and ref
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris 2',
                    'extras':{u'ref':'boris'}}
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)
        # i.e. not tempted to create pkgname0_ alongside pkgname0


class TestLoaderGroups(TestLoaderBase):
    # Tests the loader's group-membership helpers against pre-created
    # packages (pkga/pkgb/pkgc) and groups (g1 contains pkga; g2, g3 empty).
    @classmethod
    def setup_class(self):
        super(TestLoaderGroups, self).setup_class()
        self.loader = ReplaceByNameLoader(self.testclient)

        assert count_pkgs() == 0, count_pkgs()
        pkg_dicts = [{'name':u'pkga'},
                     {'name':u'pkgb'},
                     {'name':u'pkgc'},
                     ]
        CreateTestData.create_arbitrary(pkg_dicts)
        group_dicts = [
            {'name':u'g1', 'packages':[u'pkga']},
            {'name':u'g2'},
            {'name':u'g3'},
            ]
        CreateTestData.create_groups(group_dicts, USER)
        self.pkgs = [model.Package.by_name(pkg_dict['name']) \
                     for pkg_dict in pkg_dicts]
        self.pkg_ids = [pkg.id for pkg in self.pkgs]

    # teardown is in the base class

    def test_0_add_to_empty_group(self):
        pkg_name = u'pkga'
        group_name = u'g2'
        pkg = model.Package.by_name(pkg_name)
        group = model.Group.by_name(group_name)
        assert group
        assert not group.packages, group.packages
        self.loader.add_pkg_to_group(pkg.name, group.name)
        # re-fetch: the API call invalidated the session objects
        group = model.Group.by_name(group_name)
        pkg = model.Package.by_name(pkg_name)
        assert group.packages == [pkg], group.packages

    def test_1_add_to_non_empty_group(self):
        pkg_name = u'pkgb'
        group_name = u'g1'
        pkg = model.Package.by_name(pkg_name)
        group = model.Group.by_name(group_name)
        assert group
        assert len(group.packages) == 1, group.packages
        self.loader.add_pkg_to_group(pkg.name, group.name)
        group = model.Group.by_name(group_name)
        pkg = model.Package.by_name(pkg_name)
        assert pkg in group.packages, group.packages
        assert len(group.packages) == 2, group.packages

    def test_2_add_multiple_packages(self):
        pkg_names = [u'pkgb', u'pkgc']
        group_name = u'g2'
        pkgs = [model.Package.by_name(pkg_name) for pkg_name in pkg_names]
        group = model.Group.by_name(group_name)
        assert group
        # starting count depends on whether test_0 ran first
        num_pkgs_at_start = len(group.packages)
        assert num_pkgs_at_start in (0, 1), group.packages
        self.loader.add_pkgs_to_group(pkg_names, group.name)
        group = model.Group.by_name(group_name)
        pkgs = [model.Package.by_name(pkg_name) for pkg_name in pkg_names]
        for pkg in pkgs:
            assert pkg in group.packages, group.packages
        assert len(group.packages) == num_pkgs_at_start + 2, group.packages

    def test_3_add_to_missing_group(self):
        pkg_names = [u'pkgb', u'pkgc']
        try:
            self.loader.add_pkgs_to_group(pkg_names, 'random_name')
        except LoaderError, e:
            assert e.args[0] == 'Group named \'random_name\' does not exist', e.args
        else:
            assert 0, 'Should have raise a LoaderError for the missing group'


class TestLoaderInsertingResources(TestLoaderBase):
    # Tests ResourceSeriesLoader: packages matched on title+department get
    # resources merged in rather than replaced.
    @classmethod
    def setup_class(self):
        self.tsi = TestSearchIndexer()
        super(TestLoaderInsertingResources, self).setup_class()
        self.loader = ResourceSeriesLoader(
            self.testclient,
            ['title', 'department'],
            'ons/id/',
            field_keys_to_expect_invariant=['country'])

    # teardown is in the base class

    def test_0_reload(self):
        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pollution',
                    'title':u'Pollution',
                    'extras':{u'department':'air',
                              u'country':'UK', #invariant
                              u'last_updated':'Monday', #variant
                              },
                    'resources':[{'url':'pollution.com/1',
                                  'description':'ons/id/1'}],
                    }
        bogus_dict = {'name':u'bogus',
                      'title':u'Pollution',
                      'extras':{u'department':'water',
                                u'country':'UK',
                                u'last_updated':'Monday',
                                },
                      'resources':[{'url':'pollution.com/2',
                                    'description':'ons/id/2'}],
                      }
        assert not model.Package.by_name(pkg_dict['name'])
        assert not model.Package.by_name(bogus_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict, bogus_dict])
        self.tsi.index()
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)
        assert len(pkg.resources) == 1, pkg.resources

        # load the same package: same title, department, updated resource
        pkg_dict = {'name':u'pollution',
                    'title':u'Pollution',
                    'extras':{u'department':'air',
                              u'country':'UK', #invariant
                              u'last_updated':'Tuesday', #variant
                              },
                    'resources':[{'url':'pollution.com/id/1',
                                  'description':'ons/id/1'}],
                    }
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert pkg.extras['country'] == pkg_dict['extras']['country']
        assert pkg.extras['last_updated'] == pkg_dict['extras']['last_updated']
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)
        # same resource id (ons/id/1) so it is replaced, not appended
        assert len(pkg.resources) == 1, pkg.resources
        assert pkg.resources[0].url == pkg_dict['resources'][0]['url'], pkg.resources[0].url
        assert pkg.resources[0].description == pkg_dict['resources'][0]['description'], pkg.resources[0]['description']

        # load the same package: same title, department, new resource
        pkg_dict2 = {'name':u'pollution',
                     'title':u'Pollution',
                     'extras':{u'department':'air',
                               u'country':'UK', #invariant
                               u'last_updated':'Tuesday', #variant
                               },
                     'resources':[{'url':'pollution.com/id/3',
                                   'description':'ons/id/3'}],
                     }
        self.loader.load_package(pkg_dict2)
        pkg = model.Package.by_name(pkg_dict2['name'])
        assert pkg
        assert pkg.name == pkg_dict2['name']
        assert pkg.title == pkg_dict2['title']
        assert pkg.extras['country'] == pkg_dict2['extras']['country']
        assert pkg.extras['last_updated'] == pkg_dict2['extras']['last_updated']
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)
        # new resource id (ons/id/3) so it is appended
        assert len(pkg.resources) == 2, pkg.resources
        print pkg.resources
        assert_equal(pkg.resources[0].url, pkg_dict['resources'][0]['url'])
        assert pkg.resources[0].description == pkg_dict['resources'][0]['description'], pkg.resources[0]['description']
        assert pkg.resources[1].url == pkg_dict2['resources'][0]['url'], pkg.resources[1].url
        assert pkg.resources[1].description == pkg_dict2['resources'][0]['description'], pkg.resources[1]['description']

        # load the different package: because of different department
        pkg_dict3 = {'name':u'pollution',
                     'title':u'Pollution',
                     'extras':{u'department':'river',
                               u'country':'UK', #invariant
                               u'last_updated':'Tuesday', #variant
                               },
                     'resources':[{'url':'pollution.com/id/3',
                                   'description':'Lots of pollution | ons/id/3'}],
                     }
        self.loader.load_package(pkg_dict3)
        CreateTestData.flag_for_deletion('pollution_')
        assert count_pkgs() == num_pkgs + 3, (count_pkgs() - num_pkgs)
        pkg_names = [pkg.name for pkg in model.Session.query(model.Package).all()]
        pkg = model.Package.by_name(u'pollution_')
        assert pkg
        assert pkg.extras['department'] == pkg_dict3['extras']['department']

        # load the same package: but with different country
        # should just get a warning
        pkg_dict4 = {'name':u'pollution',
                     'title':u'Pollution',
                     'extras':OrderedDict([
                         (u'department', 'air'),
                         (u'country', 'UK and France'), #invariant
                         (u'last_updated', 'Tuesday'), #variant
                         ]),
                     'resources':[OrderedDict([
                         ('url', 'pollution.com/id/3'),
                         ('description', 'Lots of pollution | ons/id/3'),
                         ])],
                     }
        self.loader.load_package(pkg_dict4)
        pkg = model.Package.by_name(pkg_dict4['name'])
        assert pkg
        assert pkg.name == pkg_dict4['name']
        assert pkg.title == pkg_dict4['title']
        # the changed invariant ('country') is still written - only warned
        assert pkg.extras['country'] == pkg_dict4['extras']['country']
        assert pkg.extras['last_updated'] == pkg_dict4['extras']['last_updated']
        assert count_pkgs() == num_pkgs + 3, (count_pkgs() - num_pkgs)
        assert len(pkg.resources) == 2, pkg.resources
        assert pkg.resources[0].url == pkg_dict['resources'][0]['url'], pkg.resources[0].url
        assert pkg.resources[0].description == pkg_dict['resources'][0]['description'], pkg.resources[0]['description']
        assert pkg.resources[1].url == pkg_dict4['resources'][0]['url'], pkg.resources[1].url
        assert pkg.resources[1].description == pkg_dict4['resources'][0]['description'], pkg.resources[1]['description']


class TestLoaderInsertingResourcesWithSynonym(TestLoaderBase):
    # As TestLoaderInsertingResources, but 'air' and 'sky' are declared
    # synonymous department values, so either should match the same package.
    @classmethod
    def setup_class(self):
        self.tsi = TestSearchIndexer()
        super(TestLoaderInsertingResourcesWithSynonym, self).setup_class()
        self.loader = ResourceSeriesLoader(
            self.testclient,
            ['title', 'department'],
            'ons/id/',
            field_keys_to_expect_invariant=['country'],
            synonyms={'department': [('air', 'sky')]}
            )

    # teardown is in the base class

    def test_0_search_options(self):
        loader = ResourceSeriesLoader(
            self.testclient,
            ['title', 'department'],
            'ons/id/',
            field_keys_to_expect_invariant=['country'],
            synonyms={'department': [('dept1', 'dept2', 'dept3')],
                      'title': [('titleA', 'titleB', 'titleC')]}
            )
        field_keys = ['title', 'department']
        pkg_dict = {'title':'titleA',
                    'extras':{'department':'dept1'}}
        opts = loader._get_search_options(field_keys, pkg_dict)
        # expected: the base options plus the full cross-product of synonym
        # substitutions for both fields (9 combinations in total)
        # NOTE(review): assert_equal is imported at module level from
        # nose.tools; self.assert_equal presumably comes from the
        # TestController base class - confirm it exists there, otherwise
        # this raises AttributeError.
        self.assert_equal(opts, [{'department': 'dept1', 'title': 'titleA'}, {'department': 'dept2', 'title': 'titleA'}, {'department': 'dept3', 'title': 'titleA'}, {'department': 'dept1', 'title': 'titleB'}, {'department': 'dept1', 'title': 'titleC'}, {'department': 'dept2', 'title': 'titleB'}, {'department': 'dept2', 'title': 'titleC'}, {'department': 'dept3', 'title': 'titleB'}, {'department': 'dept3', 'title': 'titleC'}])

    def test_1_reload(self):
        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pollution',
                    'title':u'Pollution',
                    'extras':{u'department':'air',
                              u'country':'UK', #invariant
                              u'last_updated':'Monday', #variant
                              },
                    'resources':[{'url':'pollution.com/1',
                                  'description':'ons/id/1'}],
                    }
        bogus_dict = {'name':u'bogus',
                      'title':u'Pollution',
                      'extras':{u'department':'water',
                                u'country':'UK',
                                u'last_updated':'Monday',
                                },
                      'resources':[{'url':'pollution.com/2',
                                    'description':'ons/id/2'}],
                      }
        assert not model.Package.by_name(pkg_dict['name'])
        assert not model.Package.by_name(bogus_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict, bogus_dict])
        self.tsi.index()
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)
        assert len(pkg.resources) == 1, pkg.resources

        # load the similar package: same title, updated resource,
        # BUT synonym department
        pkg_dict = {'name':u'pollution',
                    'title':u'Pollution',
                    'extras':{u'department':'sky',
                              u'country':'UK', #invariant
                              u'last_updated':'Tuesday', #variant
                              },
                    'resources':[{'url':'pollution.com/id/1',
                                  'description':'ons/id/1'}],
                    }
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert pkg.extras['country'] == pkg_dict['extras']['country']
        assert pkg.extras['last_updated'] == pkg_dict['extras']['last_updated']
        # 'sky' matched 'air' via the synonym - no new package created
        assert count_pkgs() == num_pkgs + 2, (count_pkgs() - num_pkgs)
        assert len(pkg.resources) == 1, pkg.resources
        assert pkg.resources[0].url == pkg_dict['resources'][0]['url'], pkg.resources[0].url
        assert pkg.resources[0].description == pkg_dict['resources'][0]['description'], pkg.resources[0]['description']

        # load the different package: because of different department
        pkg_dict3 = {'name':u'pollution',
                     'title':u'Pollution',
                     'extras':{u'department':'river',
                               u'country':'UK', #invariant
                               u'last_updated':'Tuesday', #variant
                               },
                     'resources':[{'url':'pollution.com/id/3',
                                   'description':'Lots of pollution | ons/id/3'}],
                     }
        self.loader.load_package(pkg_dict3)
        CreateTestData.flag_for_deletion('pollution_')
        assert count_pkgs() == num_pkgs + 3, (count_pkgs() - num_pkgs)
        pkg_names = [pkg.name for pkg in model.Session.query(model.Package).all()]
        pkg = model.Package.by_name(u'pollution_')
        assert pkg
        assert pkg.extras['department'] == pkg_dict3['extras']['department']

class TestLoaderNoIndexing(TestLoaderBase):
    '''This checks you can re-load a package when the package name
    is unchanged, yet it is not search indexed (due to a problem with that).

    '''
    @classmethod
    def setup_class(self):
        # No TestSearchIndexer is initialised.
        if not is_search_supported():
            raise SkipTest("Search not supported")
        super(TestLoaderNoIndexing, self).setup_class()
        self.loader = ReplaceByExtraFieldLoader(self.testclient, 'ref')

    # teardown is in the base class

    def test_0_reload(self):
        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris',
                    'extras':{u'ref':'boris'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict])
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package with same name and ref
        pkg_dict = {'name':u'pkgname0',
                    'title':u'Boris 2',
                    'extras':{u'ref':'boris'}}
        self.loader.load_package(pkg_dict)
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert pkg.name == pkg_dict['name']
        assert pkg.title == pkg_dict['title']
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

    def test_1_reload_with_underscores(self):
        # Create decoy package
        pkg_dict = {'name':u'pkgname1',
                    'title':u'Old package decoy',
                    'extras':{u'ref':'decoy'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict])

        # create initial package
        num_pkgs = count_pkgs()
        pkg_dict = {'name':u'pkgname1_',
                    'title':u'The real Helga',
                    'extras':{u'ref':'helga'}}
        assert not model.Package.by_name(pkg_dict['name'])
        CreateTestData.create_arbitrary([pkg_dict])
        pkg = model.Package.by_name(pkg_dict['name'])
        assert pkg
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        # load the package with same name and ref
        pkg_dict = {'name':u'pkgname1',
                    'title':u'Helga updated',
                    'extras':{u'ref':'helga'}}
        self.loader.load_package(pkg_dict)
        # matched by ref, so the underscored package is the one updated
        pkg = model.Package.by_name(u'pkgname1_')
        assert pkg
        assert_equal(pkg.title, pkg_dict['title'])
        assert count_pkgs() == num_pkgs + 1, (count_pkgs() - num_pkgs)

        decoy = model.Package.by_name(u'pkgname1')
        assert decoy
        assert_equal(decoy.title, u'Old package decoy')

        pkg = model.Package.by_name(u'pkgname1_')
        assert pkg
        assert_equal(pkg.title, u'Helga updated')

        assert not model.Package.by_name(u'pkgname1__')