├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── codeql-analysis.yml
├── .gitignore
├── LICENCE
├── README.md
├── databaker
│   ├── __init__.py
│   ├── constants.py
│   ├── databaker_nbconvert.py
│   ├── framework.py
│   ├── jupybakecsv.py
│   ├── jupybakehtml.py
│   ├── jupybakeutils.py
│   ├── overrides.py
│   ├── richxlrd
│   │   ├── __init__.py
│   │   ├── rich.xls
│   │   └── richxlrd.py
│   ├── structure_csv_default.py
│   ├── tutorial.py
│   └── tutorial
│       ├── Finding_your_way.ipynb
│       ├── Introduction.ipynb
│       ├── Real_world_example.ipynb
│       ├── blank_template.ipynb
│       ├── construction_output_tables.ipynb
│       ├── example1.xls
│       ├── nbconvert_demo.ipynb
│       ├── ott.xls
│       └── tutorial_reference.ipynb
├── docwdaspecs
│   ├── Interface Specification for Generic Load.doc
│   ├── wda.txt
│   └── wda_notes.txt
├── requirements.txt
└── setup.py
/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | name: "CodeQL" 7 | 8 | on: 9 | push: 10 | branches: [master] 11 | pull_request: 12 | # The branches below must be a subset of the branches above 13 | branches: [master] 14 | schedule: 15 | - cron: '0 3 * * 2' 16 | 17 | jobs: 18 | analyze: 19 | name: Analyze 20 | runs-on: ubuntu-latest 21 | 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | # Override automatic language detection by changing the below list 26 | # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python'] 27 | language: ['python'] 28 | # Learn more... 29 | # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection 30 | 31 | steps: 32 | - name: Checkout repository 33 | uses: actions/checkout@v3 34 | with: 35 | # We must fetch at least the immediate parents so that if this is 36 | # a pull request then we can checkout the head. 37 | fetch-depth: 2 38 | 39 | # If this run was triggered by a pull request event, then checkout 40 | # the head of the pull request instead of the merge commit. 41 | - run: git checkout HEAD^2 42 | if: ${{ github.event_name == 'pull_request' }} 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v2 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v2 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 
60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v2 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.swp 3 | *.pyc 4 | venv 5 | xypath 6 | .~lock* 7 | *.egg-info 8 | 9 | /test/t_out.xls 10 | /test/t_rich.xls 11 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | This software is Copyright (c) 2016 The Sensible Code Company Limited. 2 | 3 | Unless otherwise stated in particular files or directories, this 4 | software is free software; you can redistribute it and/or modify it 5 | under the terms of the GNU Affero General Public License as published 6 | by the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This software is distributed in the hope that it will be useful, but 10 | WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | Affero General Public License for more details. 13 | 14 | Information about the GNU Affero GPL: 15 | http://www.fsf.org/licensing/licenses/agpl-3.0.html 16 | 17 | A copy of the GNU Affero General Public License follows. 18 | 19 | ----------------------------------------------------------------------- 20 | 21 | GNU AFFERO GENERAL PUBLIC LICENSE 22 | Version 3, 19 November 2007 23 | 24 | Copyright (C) 2007 Free Software Foundation, Inc. 25 | Everyone is permitted to copy and distribute verbatim copies 26 | of this license document, but changing it is not allowed. 27 | 28 | Preamble 29 | 30 | The GNU Affero General Public License is a free, copyleft license for 31 | software and other kinds of works, specifically designed to ensure 32 | cooperation with the community in the case of network server software. 33 | 34 | The licenses for most software and other practical works are designed 35 | to take away your freedom to share and change the works. By contrast, 36 | our General Public Licenses are intended to guarantee your freedom to 37 | share and change all versions of a program--to make sure it remains free 38 | software for all its users. 39 | 40 | When we speak of free software, we are referring to freedom, not 41 | price. Our General Public Licenses are designed to make sure that you 42 | have the freedom to distribute copies of free software (and charge for 43 | them if you wish), that you receive source code or can get it if you 44 | want it, that you can change the software or use pieces of it in new 45 | free programs, and that you know you can do these things. 46 | 47 | Developers that use our General Public Licenses protect your rights 48 | with two steps: (1) assert copyright on the software, and (2) offer 49 | you this License which gives you legal permission to copy, distribute 50 | and/or modify the software. 
51 | 52 | A secondary benefit of defending all users' freedom is that 53 | improvements made in alternate versions of the program, if they 54 | receive widespread use, become available for other developers to 55 | incorporate. Many developers of free software are heartened and 56 | encouraged by the resulting cooperation. However, in the case of 57 | software used on network servers, this result may fail to come about. 58 | The GNU General Public License permits making a modified version and 59 | letting the public access it on a server without ever releasing its 60 | source code to the public. 61 | 62 | The GNU Affero General Public License is designed specifically to 63 | ensure that, in such cases, the modified source code becomes available 64 | to the community. It requires the operator of a network server to 65 | provide the source code of the modified version running there to the 66 | users of that server. Therefore, public use of a modified version, on 67 | a publicly accessible server, gives the public access to the source 68 | code of the modified version. 69 | 70 | An older license, called the Affero General Public License and 71 | published by Affero, was designed to accomplish similar goals. This is 72 | a different license, not a version of the Affero GPL, but Affero has 73 | released a new version of the Affero GPL which permits relicensing under 74 | this license. 75 | 76 | The precise terms and conditions for copying, distribution and 77 | modification follow. 78 | 79 | TERMS AND CONDITIONS 80 | 81 | 0. Definitions. 82 | 83 | "This License" refers to version 3 of the GNU Affero General Public License. 84 | 85 | "Copyright" also means copyright-like laws that apply to other kinds of 86 | works, such as semiconductor masks. 87 | 88 | "The Program" refers to any copyrightable work licensed under this 89 | License. Each licensee is addressed as "you". "Licensees" and 90 | "recipients" may be individuals or organizations. 91 | 92 | To "modify" a work means to copy from or adapt all or part of the work 93 | in a fashion requiring copyright permission, other than the making of an 94 | exact copy. The resulting work is called a "modified version" of the 95 | earlier work or a work "based on" the earlier work. 96 | 97 | A "covered work" means either the unmodified Program or a work based 98 | on the Program. 99 | 100 | To "propagate" a work means to do anything with it that, without 101 | permission, would make you directly or secondarily liable for 102 | infringement under applicable copyright law, except executing it on a 103 | computer or modifying a private copy. Propagation includes copying, 104 | distribution (with or without modification), making available to the 105 | public, and in some countries other activities as well. 106 | 107 | To "convey" a work means any kind of propagation that enables other 108 | parties to make or receive copies. Mere interaction with a user through 109 | a computer network, with no transfer of a copy, is not conveying. 110 | 111 | An interactive user interface displays "Appropriate Legal Notices" 112 | to the extent that it includes a convenient and prominently visible 113 | feature that (1) displays an appropriate copyright notice, and (2) 114 | tells the user that there is no warranty for the work (except to the 115 | extent that warranties are provided), that licensees may convey the 116 | work under this License, and how to view a copy of this License. 
If 117 | the interface presents a list of user commands or options, such as a 118 | menu, a prominent item in the list meets this criterion. 119 | 120 | 1. Source Code. 121 | 122 | The "source code" for a work means the preferred form of the work 123 | for making modifications to it. "Object code" means any non-source 124 | form of a work. 125 | 126 | A "Standard Interface" means an interface that either is an official 127 | standard defined by a recognized standards body, or, in the case of 128 | interfaces specified for a particular programming language, one that 129 | is widely used among developers working in that language. 130 | 131 | The "System Libraries" of an executable work include anything, other 132 | than the work as a whole, that (a) is included in the normal form of 133 | packaging a Major Component, but which is not part of that Major 134 | Component, and (b) serves only to enable use of the work with that 135 | Major Component, or to implement a Standard Interface for which an 136 | implementation is available to the public in source code form. A 137 | "Major Component", in this context, means a major essential component 138 | (kernel, window system, and so on) of the specific operating system 139 | (if any) on which the executable work runs, or a compiler used to 140 | produce the work, or an object code interpreter used to run it. 141 | 142 | The "Corresponding Source" for a work in object code form means all 143 | the source code needed to generate, install, and (for an executable 144 | work) run the object code and to modify the work, including scripts to 145 | control those activities. However, it does not include the work's 146 | System Libraries, or general-purpose tools or generally available free 147 | programs which are used unmodified in performing those activities but 148 | which are not part of the work. For example, Corresponding Source 149 | includes interface definition files associated with source files for 150 | the work, and the source code for shared libraries and dynamically 151 | linked subprograms that the work is specifically designed to require, 152 | such as by intimate data communication or control flow between those 153 | subprograms and other parts of the work. 154 | 155 | The Corresponding Source need not include anything that users 156 | can regenerate automatically from other parts of the Corresponding 157 | Source. 158 | 159 | The Corresponding Source for a work in source code form is that 160 | same work. 161 | 162 | 2. Basic Permissions. 163 | 164 | All rights granted under this License are granted for the term of 165 | copyright on the Program, and are irrevocable provided the stated 166 | conditions are met. This License explicitly affirms your unlimited 167 | permission to run the unmodified Program. The output from running a 168 | covered work is covered by this License only if the output, given its 169 | content, constitutes a covered work. This License acknowledges your 170 | rights of fair use or other equivalent, as provided by copyright law. 171 | 172 | You may make, run and propagate covered works that you do not 173 | convey, without conditions so long as your license otherwise remains 174 | in force. You may convey covered works to others for the sole purpose 175 | of having them make modifications exclusively for you, or provide you 176 | with facilities for running those works, provided that you comply with 177 | the terms of this License in conveying all material for which you do 178 | not control copyright. 
Those thus making or running the covered works 179 | for you must do so exclusively on your behalf, under your direction 180 | and control, on terms that prohibit them from making any copies of 181 | your copyrighted material outside their relationship with you. 182 | 183 | Conveying under any other circumstances is permitted solely under 184 | the conditions stated below. Sublicensing is not allowed; section 10 185 | makes it unnecessary. 186 | 187 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 188 | 189 | No covered work shall be deemed part of an effective technological 190 | measure under any applicable law fulfilling obligations under article 191 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 192 | similar laws prohibiting or restricting circumvention of such 193 | measures. 194 | 195 | When you convey a covered work, you waive any legal power to forbid 196 | circumvention of technological measures to the extent such circumvention 197 | is effected by exercising rights under this License with respect to 198 | the covered work, and you disclaim any intention to limit operation or 199 | modification of the work as a means of enforcing, against the work's 200 | users, your or third parties' legal rights to forbid circumvention of 201 | technological measures. 202 | 203 | 4. Conveying Verbatim Copies. 204 | 205 | You may convey verbatim copies of the Program's source code as you 206 | receive it, in any medium, provided that you conspicuously and 207 | appropriately publish on each copy an appropriate copyright notice; 208 | keep intact all notices stating that this License and any 209 | non-permissive terms added in accord with section 7 apply to the code; 210 | keep intact all notices of the absence of any warranty; and give all 211 | recipients a copy of this License along with the Program. 212 | 213 | You may charge any price or no price for each copy that you convey, 214 | and you may offer support or warranty protection for a fee. 215 | 216 | 5. Conveying Modified Source Versions. 217 | 218 | You may convey a work based on the Program, or the modifications to 219 | produce it from the Program, in the form of source code under the 220 | terms of section 4, provided that you also meet all of these conditions: 221 | 222 | a) The work must carry prominent notices stating that you modified 223 | it, and giving a relevant date. 224 | 225 | b) The work must carry prominent notices stating that it is 226 | released under this License and any conditions added under section 227 | 7. This requirement modifies the requirement in section 4 to 228 | "keep intact all notices". 229 | 230 | c) You must license the entire work, as a whole, under this 231 | License to anyone who comes into possession of a copy. This 232 | License will therefore apply, along with any applicable section 7 233 | additional terms, to the whole of the work, and all its parts, 234 | regardless of how they are packaged. This License gives no 235 | permission to license the work in any other way, but it does not 236 | invalidate such permission if you have separately received it. 237 | 238 | d) If the work has interactive user interfaces, each must display 239 | Appropriate Legal Notices; however, if the Program has interactive 240 | interfaces that do not display Appropriate Legal Notices, your 241 | work need not make them do so. 
242 | 243 | A compilation of a covered work with other separate and independent 244 | works, which are not by their nature extensions of the covered work, 245 | and which are not combined with it such as to form a larger program, 246 | in or on a volume of a storage or distribution medium, is called an 247 | "aggregate" if the compilation and its resulting copyright are not 248 | used to limit the access or legal rights of the compilation's users 249 | beyond what the individual works permit. Inclusion of a covered work 250 | in an aggregate does not cause this License to apply to the other 251 | parts of the aggregate. 252 | 253 | 6. Conveying Non-Source Forms. 254 | 255 | You may convey a covered work in object code form under the terms 256 | of sections 4 and 5, provided that you also convey the 257 | machine-readable Corresponding Source under the terms of this License, 258 | in one of these ways: 259 | 260 | a) Convey the object code in, or embodied in, a physical product 261 | (including a physical distribution medium), accompanied by the 262 | Corresponding Source fixed on a durable physical medium 263 | customarily used for software interchange. 264 | 265 | b) Convey the object code in, or embodied in, a physical product 266 | (including a physical distribution medium), accompanied by a 267 | written offer, valid for at least three years and valid for as 268 | long as you offer spare parts or customer support for that product 269 | model, to give anyone who possesses the object code either (1) a 270 | copy of the Corresponding Source for all the software in the 271 | product that is covered by this License, on a durable physical 272 | medium customarily used for software interchange, for a price no 273 | more than your reasonable cost of physically performing this 274 | conveying of source, or (2) access to copy the 275 | Corresponding Source from a network server at no charge. 276 | 277 | c) Convey individual copies of the object code with a copy of the 278 | written offer to provide the Corresponding Source. This 279 | alternative is allowed only occasionally and noncommercially, and 280 | only if you received the object code with such an offer, in accord 281 | with subsection 6b. 282 | 283 | d) Convey the object code by offering access from a designated 284 | place (gratis or for a charge), and offer equivalent access to the 285 | Corresponding Source in the same way through the same place at no 286 | further charge. You need not require recipients to copy the 287 | Corresponding Source along with the object code. If the place to 288 | copy the object code is a network server, the Corresponding Source 289 | may be on a different server (operated by you or a third party) 290 | that supports equivalent copying facilities, provided you maintain 291 | clear directions next to the object code saying where to find the 292 | Corresponding Source. Regardless of what server hosts the 293 | Corresponding Source, you remain obligated to ensure that it is 294 | available for as long as needed to satisfy these requirements. 295 | 296 | e) Convey the object code using peer-to-peer transmission, provided 297 | you inform other peers where the object code and Corresponding 298 | Source of the work are being offered to the general public at no 299 | charge under subsection 6d. 300 | 301 | A separable portion of the object code, whose source code is excluded 302 | from the Corresponding Source as a System Library, need not be 303 | included in conveying the object code work. 
304 | 305 | A "User Product" is either (1) a "consumer product", which means any 306 | tangible personal property which is normally used for personal, family, 307 | or household purposes, or (2) anything designed or sold for incorporation 308 | into a dwelling. In determining whether a product is a consumer product, 309 | doubtful cases shall be resolved in favor of coverage. For a particular 310 | product received by a particular user, "normally used" refers to a 311 | typical or common use of that class of product, regardless of the status 312 | of the particular user or of the way in which the particular user 313 | actually uses, or expects or is expected to use, the product. A product 314 | is a consumer product regardless of whether the product has substantial 315 | commercial, industrial or non-consumer uses, unless such uses represent 316 | the only significant mode of use of the product. 317 | 318 | "Installation Information" for a User Product means any methods, 319 | procedures, authorization keys, or other information required to install 320 | and execute modified versions of a covered work in that User Product from 321 | a modified version of its Corresponding Source. The information must 322 | suffice to ensure that the continued functioning of the modified object 323 | code is in no case prevented or interfered with solely because 324 | modification has been made. 325 | 326 | If you convey an object code work under this section in, or with, or 327 | specifically for use in, a User Product, and the conveying occurs as 328 | part of a transaction in which the right of possession and use of the 329 | User Product is transferred to the recipient in perpetuity or for a 330 | fixed term (regardless of how the transaction is characterized), the 331 | Corresponding Source conveyed under this section must be accompanied 332 | by the Installation Information. But this requirement does not apply 333 | if neither you nor any third party retains the ability to install 334 | modified object code on the User Product (for example, the work has 335 | been installed in ROM). 336 | 337 | The requirement to provide Installation Information does not include a 338 | requirement to continue to provide support service, warranty, or updates 339 | for a work that has been modified or installed by the recipient, or for 340 | the User Product in which it has been modified or installed. Access to a 341 | network may be denied when the modification itself materially and 342 | adversely affects the operation of the network or violates the rules and 343 | protocols for communication across the network. 344 | 345 | Corresponding Source conveyed, and Installation Information provided, 346 | in accord with this section must be in a format that is publicly 347 | documented (and with an implementation available to the public in 348 | source code form), and must require no special password or key for 349 | unpacking, reading or copying. 350 | 351 | 7. Additional Terms. 352 | 353 | "Additional permissions" are terms that supplement the terms of this 354 | License by making exceptions from one or more of its conditions. 355 | Additional permissions that are applicable to the entire Program shall 356 | be treated as though they were included in this License, to the extent 357 | that they are valid under applicable law. 
If additional permissions 358 | apply only to part of the Program, that part may be used separately 359 | under those permissions, but the entire Program remains governed by 360 | this License without regard to the additional permissions. 361 | 362 | When you convey a copy of a covered work, you may at your option 363 | remove any additional permissions from that copy, or from any part of 364 | it. (Additional permissions may be written to require their own 365 | removal in certain cases when you modify the work.) You may place 366 | additional permissions on material, added by you to a covered work, 367 | for which you have or can give appropriate copyright permission. 368 | 369 | Notwithstanding any other provision of this License, for material you 370 | add to a covered work, you may (if authorized by the copyright holders of 371 | that material) supplement the terms of this License with terms: 372 | 373 | a) Disclaiming warranty or limiting liability differently from the 374 | terms of sections 15 and 16 of this License; or 375 | 376 | b) Requiring preservation of specified reasonable legal notices or 377 | author attributions in that material or in the Appropriate Legal 378 | Notices displayed by works containing it; or 379 | 380 | c) Prohibiting misrepresentation of the origin of that material, or 381 | requiring that modified versions of such material be marked in 382 | reasonable ways as different from the original version; or 383 | 384 | d) Limiting the use for publicity purposes of names of licensors or 385 | authors of the material; or 386 | 387 | e) Declining to grant rights under trademark law for use of some 388 | trade names, trademarks, or service marks; or 389 | 390 | f) Requiring indemnification of licensors and authors of that 391 | material by anyone who conveys the material (or modified versions of 392 | it) with contractual assumptions of liability to the recipient, for 393 | any liability that these contractual assumptions directly impose on 394 | those licensors and authors. 395 | 396 | All other non-permissive additional terms are considered "further 397 | restrictions" within the meaning of section 10. If the Program as you 398 | received it, or any part of it, contains a notice stating that it is 399 | governed by this License along with a term that is a further 400 | restriction, you may remove that term. If a license document contains 401 | a further restriction but permits relicensing or conveying under this 402 | License, you may add to a covered work material governed by the terms 403 | of that license document, provided that the further restriction does 404 | not survive such relicensing or conveying. 405 | 406 | If you add terms to a covered work in accord with this section, you 407 | must place, in the relevant source files, a statement of the 408 | additional terms that apply to those files, or a notice indicating 409 | where to find the applicable terms. 410 | 411 | Additional terms, permissive or non-permissive, may be stated in the 412 | form of a separately written license, or stated as exceptions; 413 | the above requirements apply either way. 414 | 415 | 8. Termination. 416 | 417 | You may not propagate or modify a covered work except as expressly 418 | provided under this License. Any attempt otherwise to propagate or 419 | modify it is void, and will automatically terminate your rights under 420 | this License (including any patent licenses granted under the third 421 | paragraph of section 11). 
422 | 423 | However, if you cease all violation of this License, then your 424 | license from a particular copyright holder is reinstated (a) 425 | provisionally, unless and until the copyright holder explicitly and 426 | finally terminates your license, and (b) permanently, if the copyright 427 | holder fails to notify you of the violation by some reasonable means 428 | prior to 60 days after the cessation. 429 | 430 | Moreover, your license from a particular copyright holder is 431 | reinstated permanently if the copyright holder notifies you of the 432 | violation by some reasonable means, this is the first time you have 433 | received notice of violation of this License (for any work) from that 434 | copyright holder, and you cure the violation prior to 30 days after 435 | your receipt of the notice. 436 | 437 | Termination of your rights under this section does not terminate the 438 | licenses of parties who have received copies or rights from you under 439 | this License. If your rights have been terminated and not permanently 440 | reinstated, you do not qualify to receive new licenses for the same 441 | material under section 10. 442 | 443 | 9. Acceptance Not Required for Having Copies. 444 | 445 | You are not required to accept this License in order to receive or 446 | run a copy of the Program. Ancillary propagation of a covered work 447 | occurring solely as a consequence of using peer-to-peer transmission 448 | to receive a copy likewise does not require acceptance. However, 449 | nothing other than this License grants you permission to propagate or 450 | modify any covered work. These actions infringe copyright if you do 451 | not accept this License. Therefore, by modifying or propagating a 452 | covered work, you indicate your acceptance of this License to do so. 453 | 454 | 10. Automatic Licensing of Downstream Recipients. 455 | 456 | Each time you convey a covered work, the recipient automatically 457 | receives a license from the original licensors, to run, modify and 458 | propagate that work, subject to this License. You are not responsible 459 | for enforcing compliance by third parties with this License. 460 | 461 | An "entity transaction" is a transaction transferring control of an 462 | organization, or substantially all assets of one, or subdividing an 463 | organization, or merging organizations. If propagation of a covered 464 | work results from an entity transaction, each party to that 465 | transaction who receives a copy of the work also receives whatever 466 | licenses to the work the party's predecessor in interest had or could 467 | give under the previous paragraph, plus a right to possession of the 468 | Corresponding Source of the work from the predecessor in interest, if 469 | the predecessor has it or can get it with reasonable efforts. 470 | 471 | You may not impose any further restrictions on the exercise of the 472 | rights granted or affirmed under this License. For example, you may 473 | not impose a license fee, royalty, or other charge for exercise of 474 | rights granted under this License, and you may not initiate litigation 475 | (including a cross-claim or counterclaim in a lawsuit) alleging that 476 | any patent claim is infringed by making, using, selling, offering for 477 | sale, or importing the Program or any portion of it. 478 | 479 | 11. Patents. 480 | 481 | A "contributor" is a copyright holder who authorizes use under this 482 | License of the Program or a work on which the Program is based. 
The 483 | work thus licensed is called the contributor's "contributor version". 484 | 485 | A contributor's "essential patent claims" are all patent claims 486 | owned or controlled by the contributor, whether already acquired or 487 | hereafter acquired, that would be infringed by some manner, permitted 488 | by this License, of making, using, or selling its contributor version, 489 | but do not include claims that would be infringed only as a 490 | consequence of further modification of the contributor version. For 491 | purposes of this definition, "control" includes the right to grant 492 | patent sublicenses in a manner consistent with the requirements of 493 | this License. 494 | 495 | Each contributor grants you a non-exclusive, worldwide, royalty-free 496 | patent license under the contributor's essential patent claims, to 497 | make, use, sell, offer for sale, import and otherwise run, modify and 498 | propagate the contents of its contributor version. 499 | 500 | In the following three paragraphs, a "patent license" is any express 501 | agreement or commitment, however denominated, not to enforce a patent 502 | (such as an express permission to practice a patent or covenant not to 503 | sue for patent infringement). To "grant" such a patent license to a 504 | party means to make such an agreement or commitment not to enforce a 505 | patent against the party. 506 | 507 | If you convey a covered work, knowingly relying on a patent license, 508 | and the Corresponding Source of the work is not available for anyone 509 | to copy, free of charge and under the terms of this License, through a 510 | publicly available network server or other readily accessible means, 511 | then you must either (1) cause the Corresponding Source to be so 512 | available, or (2) arrange to deprive yourself of the benefit of the 513 | patent license for this particular work, or (3) arrange, in a manner 514 | consistent with the requirements of this License, to extend the patent 515 | license to downstream recipients. "Knowingly relying" means you have 516 | actual knowledge that, but for the patent license, your conveying the 517 | covered work in a country, or your recipient's use of the covered work 518 | in a country, would infringe one or more identifiable patents in that 519 | country that you have reason to believe are valid. 520 | 521 | If, pursuant to or in connection with a single transaction or 522 | arrangement, you convey, or propagate by procuring conveyance of, a 523 | covered work, and grant a patent license to some of the parties 524 | receiving the covered work authorizing them to use, propagate, modify 525 | or convey a specific copy of the covered work, then the patent license 526 | you grant is automatically extended to all recipients of the covered 527 | work and works based on it. 528 | 529 | A patent license is "discriminatory" if it does not include within 530 | the scope of its coverage, prohibits the exercise of, or is 531 | conditioned on the non-exercise of one or more of the rights that are 532 | specifically granted under this License. 
You may not convey a covered 533 | work if you are a party to an arrangement with a third party that is 534 | in the business of distributing software, under which you make payment 535 | to the third party based on the extent of your activity of conveying 536 | the work, and under which the third party grants, to any of the 537 | parties who would receive the covered work from you, a discriminatory 538 | patent license (a) in connection with copies of the covered work 539 | conveyed by you (or copies made from those copies), or (b) primarily 540 | for and in connection with specific products or compilations that 541 | contain the covered work, unless you entered into that arrangement, 542 | or that patent license was granted, prior to 28 March 2007. 543 | 544 | Nothing in this License shall be construed as excluding or limiting 545 | any implied license or other defenses to infringement that may 546 | otherwise be available to you under applicable patent law. 547 | 548 | 12. No Surrender of Others' Freedom. 549 | 550 | If conditions are imposed on you (whether by court order, agreement or 551 | otherwise) that contradict the conditions of this License, they do not 552 | excuse you from the conditions of this License. If you cannot convey a 553 | covered work so as to satisfy simultaneously your obligations under this 554 | License and any other pertinent obligations, then as a consequence you may 555 | not convey it at all. For example, if you agree to terms that obligate you 556 | to collect a royalty for further conveying from those to whom you convey 557 | the Program, the only way you could satisfy both those terms and this 558 | License would be to refrain entirely from conveying the Program. 559 | 560 | 13. Remote Network Interaction; Use with the GNU General Public License. 561 | 562 | Notwithstanding any other provision of this License, if you modify the 563 | Program, your modified version must prominently offer all users 564 | interacting with it remotely through a computer network (if your version 565 | supports such interaction) an opportunity to receive the Corresponding 566 | Source of your version by providing access to the Corresponding Source 567 | from a network server at no charge, through some standard or customary 568 | means of facilitating copying of software. This Corresponding Source 569 | shall include the Corresponding Source for any work covered by version 3 570 | of the GNU General Public License that is incorporated pursuant to the 571 | following paragraph. 572 | 573 | Notwithstanding any other provision of this License, you have 574 | permission to link or combine any covered work with a work licensed 575 | under version 3 of the GNU General Public License into a single 576 | combined work, and to convey the resulting work. The terms of this 577 | License will continue to apply to the part which is the covered work, 578 | but the work with which it is combined will remain governed by version 579 | 3 of the GNU General Public License. 580 | 581 | 14. Revised Versions of this License. 582 | 583 | The Free Software Foundation may publish revised and/or new versions of 584 | the GNU Affero General Public License from time to time. Such new versions 585 | will be similar in spirit to the present version, but may differ in detail to 586 | address new problems or concerns. 587 | 588 | Each version is given a distinguishing version number. 
If the 589 | Program specifies that a certain numbered version of the GNU Affero General 590 | Public License "or any later version" applies to it, you have the 591 | option of following the terms and conditions either of that numbered 592 | version or of any later version published by the Free Software 593 | Foundation. If the Program does not specify a version number of the 594 | GNU Affero General Public License, you may choose any version ever published 595 | by the Free Software Foundation. 596 | 597 | If the Program specifies that a proxy can decide which future 598 | versions of the GNU Affero General Public License can be used, that proxy's 599 | public statement of acceptance of a version permanently authorizes you 600 | to choose that version for the Program. 601 | 602 | Later license versions may give you additional or different 603 | permissions. However, no additional obligations are imposed on any 604 | author or copyright holder as a result of your choosing to follow a 605 | later version. 606 | 607 | 15. Disclaimer of Warranty. 608 | 609 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 610 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 611 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 612 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 613 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 614 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 615 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 616 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 617 | 618 | 16. Limitation of Liability. 619 | 620 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 621 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 622 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 623 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 624 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 625 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 626 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 627 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 628 | SUCH DAMAGES. 629 | 630 | 17. Interpretation of Sections 15 and 16. 631 | 632 | If the disclaimer of warranty and limitation of liability provided 633 | above cannot be given local legal effect according to their terms, 634 | reviewing courts shall apply local law that most closely approximates 635 | an absolute waiver of all civil liability in connection with the 636 | Program, unless a warranty or assumption of liability accompanies a 637 | copy of the Program in return for a fee. 638 | 639 | END OF TERMS AND CONDITIONS 640 | 641 | How to Apply These Terms to Your New Programs 642 | 643 | If you develop a new program, and you want it to be of the greatest 644 | possible use to the public, the best way to achieve this is to make it 645 | free software which everyone can redistribute and change under these terms. 646 | 647 | To do so, attach the following notices to the program. It is safest 648 | to attach them to the start of each source file to most effectively 649 | state the exclusion of warranty; and each file should have at least 650 | the "copyright" line and a pointer to where the full notice is found. 
651 | 652 | <one line to give the program's name and a brief idea of what it does.> 653 | Copyright (C) <year> <name of author> 654 | 655 | This program is free software: you can redistribute it and/or modify 656 | it under the terms of the GNU Affero General Public License as published by 657 | the Free Software Foundation, either version 3 of the License, or 658 | (at your option) any later version. 659 | 660 | This program is distributed in the hope that it will be useful, 661 | but WITHOUT ANY WARRANTY; without even the implied warranty of 662 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 663 | GNU Affero General Public License for more details. 664 | 665 | You should have received a copy of the GNU Affero General Public License 666 | along with this program. If not, see <https://www.gnu.org/licenses/>. 667 | 668 | Also add information on how to contact you by electronic and paper mail. 669 | 670 | If your software can interact with users remotely through a computer 671 | network, you should also make sure that it provides a way for users to 672 | get its source. For example, if your program is a web application, its 673 | interface could display a "Source" link that leads users to an archive 674 | of the code. There are many ways you could offer source, and different 675 | solutions will be better for different programs; see section 13 for the 676 | specific requirements. 677 | 678 | You should also get your employer (if you work as a programmer) or school, 679 | if any, to sign a "copyright disclaimer" for the program, if necessary. 680 | For more information on this, and how to apply and follow the GNU AGPL, see 681 | <https://www.gnu.org/licenses/>. 682 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Databaker 2 | 3 | A Jupyter notebook tool for converting data that is laid out in a formatted Excel 4 | spreadsheet into a normalized form for use by databases. 5 | 6 | It depends on [okfn/messytables](https://github.com/okfn/messytables) and 7 | [sensiblecodeio/xypath](https://github.com/sensiblecodeio/xypath). 8 | 9 | Python 3.4+ is supported. 10 | 11 | ## Starting up 12 | 13 | ### For development 14 | 15 | To install for development, the easiest way is to create a virtualenv and 16 | activate it: 17 | 18 | `source bin/activate` 19 | 20 | and then type 21 | 22 | `pip install -e git+https://github.com/sensiblecodeio/databaker.git#egg=databaker` 23 | 24 | This will install the code into `src/databaker`, where you can edit and commit it. 25 | 26 | ### For normal use 27 | 28 | Install with `pip install databaker`. 29 | 30 | ## Usage 31 | 32 | Launch a Jupyter notebook: 33 | 34 | `jupyter notebook` 35 | 36 | and then follow the tutorials as described below. 37 | 38 | ## Documentation 39 | 40 | The current documentation is in the form of Jupyter notebooks located 41 | inside the [tutorial](databaker/tutorial) directory. 42 | 43 | You can access these directly by creating a new Jupyter notebook and 44 | running the following in a Jupyter cell: 45 | 46 | ``` 47 | from databaker.tutorial import tutorial 48 | tutorial() 49 | ``` 50 | 51 | which will copy the tutorials to your current directory and provide 52 | links to these copied notebooks. 53 | 54 | ## Authors 55 | 56 | Made by the [Sensible Code Company](http://sensiblecode.io) on behalf of the 57 | [Office for National Statistics](https://www.ons.gov.uk/) (UK).
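## Quick example

The general shape of a conversion recipe, using the functions exported by `databaker.framework`. This is a minimal sketch, not a definitive recipe: the file name, cell references and dimension names are illustrative, and the `ConversionSegment` argument order follows the tutorial notebooks, which remain the authoritative reference.

```
from databaker.framework import *

# load every sheet of the workbook as a list of xypath tables
tabs = loadxlstabs("example1.xls")
tab = tabs[0]

# the block of observation cells, and the headers that describe them
observations = tab.excel_ref('B4').expand(DOWN).expand(RIGHT).is_not_blank()
dimensions = [
    HDim(tab.excel_ref('B3').expand(RIGHT), "Year", DIRECTLY, ABOVE),
    HDim(tab.excel_ref('A4').expand(DOWN), "Category", DIRECTLY, LEFT),
    HDimConst("Units", "count"),    # a constant-valued dimension
]

cs = ConversionSegment(tab, dimensions, observations)
savepreviewhtml(cs)                  # colour-coded preview in the notebook
writetechnicalCSV("output.csv", cs)  # write the WDA-format CSV
```

`writetechnicalCSV` accepts either a single segment or a list of them, so several `ConversionSegment`s from different sheets can be written into one file.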
58 | -------------------------------------------------------------------------------- /databaker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cantabular/databaker/930b65e85038a3afd59574bbdc1d4cb79ce9b6ad/databaker/__init__.py -------------------------------------------------------------------------------- /databaker/constants.py: -------------------------------------------------------------------------------- 1 | from xypath import DOWN, UP, LEFT, RIGHT 2 | from hamcrest import * 3 | 4 | # If there's a custom template, use it; otherwise use the default. 5 | try: 6 | from structure_csv_user import * 7 | import structure_csv_user as template 8 | except ImportError: 9 | from .structure_csv_default import * 10 | from . import structure_csv_default as template 11 | 12 | 13 | ABOVE = UP 14 | BELOW = DOWN 15 | 16 | DIRECTLY = True 17 | CLOSEST = False 18 | 19 | 20 | -------------------------------------------------------------------------------- /databaker/databaker_nbconvert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import subprocess 4 | import sys 5 | 6 | 7 | def main(argv=sys.argv[1:]): 8 | if argv is None or len(argv) == 0 or len(argv) > 2: 9 | print("Usage: databaker_process.py <notebook file> <input file>") 10 | print() 11 | print("<input file> is optional; it replaces DATABAKER_INPUT_FILE") 12 | print("in the notebook.") 13 | print("The input file should also be in the same directory as the") 14 | print("notebook.") 15 | sys.exit(1) 16 | 17 | process_env = os.environ.copy() 18 | 19 | if len(argv) == 2: 20 | process_env['DATABAKER_INPUT_FILE'] = argv[1] 21 | 22 | # TODO get custom templates working; according to this: 23 | # https://github.com/jupyter/nbconvert/issues/391 24 | # they should work, but I get TemplateNotFound when using an absolute path 25 | # for the template.
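    # Illustrative invocation (the file names here are hypothetical):
    #
    #   python databaker_nbconvert.py my_recipe.ipynb my_input.xls
    #
    # This executes my_recipe.ipynb with DATABAKER_INPUT_FILE=my_input.xls in
    # its environment and, by nbconvert's default naming, writes my_recipe.html
    # alongside the notebook.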
26 | cmd_line = ['jupyter', 'nbconvert', '--to', 'html', '--execute', argv[0]] 27 | print("Running:", ' '.join(cmd_line)) 28 | subprocess.call(args=cmd_line, env=process_env) 29 | 30 | 31 | if __name__ == '__main__': 32 | main() 33 | -------------------------------------------------------------------------------- /databaker/framework.py: -------------------------------------------------------------------------------- 1 | import os, warnings 2 | import xypath 3 | import xypath.loader 4 | import databaker.constants 5 | from databaker.constants import * # also brings in template 6 | import databaker.overrides as overrides # warning: injects additional class functions into xypath and messytables 7 | 8 | # core classes and functionality 9 | from databaker.jupybakeutils import HDim, HDimConst, ConversionSegment, Ldatetimeunitloose, Ldatetimeunitforce, pdguessforceTIMEUNIT 10 | from databaker.jupybakecsv import writetechnicalCSV, readtechnicalCSV 11 | from databaker.jupybakehtml import savepreviewhtml 12 | 13 | # this lot should be deprecated 14 | from databaker.jupybakecsv import headersfromwdasegment, extraheaderscheck, checktheconstantdimensions, checksegmentobsvalues 15 | from databaker.jupybakecsv import wdamsgstrings, CompareConversionSegments 16 | 17 | def loadxlstabs(inputfile, sheetids="*", verbose=True): 18 | if verbose: 19 | print("Loading %s which has size %d bytes" % (inputfile, os.path.getsize(inputfile))) 20 | tableset = xypath.loader.table_set(inputfile, extension='xls') 21 | tabs = list(xypath.loader.get_sheets(tableset, sheetids)) 22 | tabnames = [ tab.name for tab in tabs ] 23 | if verbose: 24 | print("Table names: %s" % str(tabnames)) 25 | 26 | if sheetids != "*": 27 | if type(sheetids) == str: 28 | sheetids = [sheetids] 29 | assert type(sheetids) in [list, tuple], ("What type is this?", type(sheetids)) 30 | for sid in sheetids: 31 | assert sid in tabnames, (sid, "missing from found tables") 32 | assert len(sheetids) == len(tabnames), ("Number of selected tables disagrees", "len(sheetids) == len(tabnames)", len(sheetids), len(tabnames)) 33 | if len(set(tabnames)) != len(tabnames): 34 | warnings.warn("Duplicates found in table names list") 35 | return tabs 36 | 37 | DATABAKER_INPUT_FILE = None 38 | 39 | 40 | def getinputfilename(): 41 | """ Return DATABAKER_INPUT_FILE from os.environ or this module. 42 | 43 | This is so that DATABAKER_INPUT_FILE can be set in a notebook and then 44 | overridden by an environment variable when one is present. 45 | 46 | Environment variables are used because nbconvert doesn't allow you to 47 | easily pass arguments to the notebook. 48 | 49 | Use in a notebook is along the lines of: 50 | 51 | DATABAKER_INPUT_FILE = 'myfile.xls' 52 | f = getinputfilename() 53 | 54 | This way, we can set the filename in the notebook, or at the command line 55 | with environment variables. 56 | """ 57 | try: 58 | return os.environ['DATABAKER_INPUT_FILE'] 59 | except KeyError: 60 | return DATABAKER_INPUT_FILE
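# Typical use from a notebook, as a sketch (the filenames are illustrative):
#
#   DATABAKER_INPUT_FILE = 'myfile.xls'
#   tabs = loadxlstabs(getinputfilename())
#
# The same notebook can then be run against a different workbook without
# editing it, by overriding the name from the shell:
#
#   DATABAKER_INPUT_FILE=other.xls jupyter nbconvert --to html --execute recipe.ipynb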
61 | -------------------------------------------------------------------------------- /databaker/jupybakecsv.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # Reading and writing of the WDA technical CSV format 3 | 4 | import io, os, collections, re, warnings, csv, datetime 5 | import databaker.constants 6 | from databaker.jupybakeutils import ConversionSegment 7 | template = databaker.constants.template 8 | 9 | try: import pandas 10 | except ImportError: pandas = None # no pandas in pypy 11 | 12 | def HLDUPgenerate_header_row(numheaderadditionals): 13 | res = [ (k[0] if isinstance(k, tuple) else k) for k in template.headermeasurements ] 14 | for i in range(numheaderadditionals): 15 | for k in template.headeradditionals: 16 | if isinstance(k, tuple): 17 | sk = k[0] 18 | else: 19 | sk = k 20 | res.append("%s_%d" % (sk, i+1)) 21 | return res 22 | 23 | 24 | 25 | def Lyield_dimension_values(dval, isegmentnumber, Cheaderadditionals): 26 | for k in template.headermeasurements: 27 | if isinstance(k, tuple): 28 | yield dval.get(k[1], '') 29 | elif k == template.conversionsegmentnumbercolumn: 30 | yield isegmentnumber 31 | else: 32 | yield '' 33 | 34 | for dlab in Cheaderadditionals: 35 | for k in template.headeradditionals: 36 | if isinstance(k, tuple): 37 | if k[1] == "NAME": 38 | yield dlab 39 | else: 40 | assert k[1] == "VALUE" 41 | yield dval[dlab] 42 | else: 43 | yield '' 44 | 45 | 46 | def writetechnicalCSV(outputfile, conversionsegments): 47 | "Output the CSV in the bloated WDA format (takes a ConversionSegment or pandas DataFrame, or a list of them)" 48 | if not isinstance(conversionsegments, (list, tuple)): 49 | conversionsegments = [ conversionsegments ] 50 | 51 | if outputfile is not None: 52 | print("writing %d conversion segments into %s" % (len(conversionsegments), os.path.abspath(outputfile))) 53 | try: 54 | filehandle = open(outputfile, "w", newline='\n', encoding='utf-8') 55 | except TypeError: # this happens if you run in pypy2, because the newline parameter is not recognized 56 | filehandle = open(outputfile, "w") 57 | else: 58 | filehandle = io.StringIO() # to return as a string for print preview 59 | csv_writer = csv.writer(filehandle) 60 | row_count = 0 61 | 62 | for isegmentnumber, conversionsegment in enumerate(conversionsegments): 63 | if isegmentnumber == 0: # only the first segment gets a CSV header for the whole file (even if it is not consistent for the remaining segments) 64 | if isinstance(conversionsegment, ConversionSegment): 65 | Cheaderadditionals = [ dimension.label for dimension in conversionsegment.dimensions if dimension.label not in template.headermeasurementnamesSet ] 66 | assert len(Cheaderadditionals) == conversionsegment.numheaderadditionals 67 | elif pandas is not None: 68 | assert isinstance(conversionsegment, pandas.DataFrame), "function takes only ConversionSegments or pandas.DataFrames" 69 | if not isinstance(conversionsegment.index, pandas.RangeIndex): 70 | conversionsegment = conversionsegment.reset_index() # in case of playing around with indexes 71 | Cheaderadditionals = [colname for colname in conversionsegment.columns if colname not in template.headermeasurementnamesSet and colname[:2] != "__"] 72 | csv_writer.writerow(HLDUPgenerate_header_row(len(Cheaderadditionals))) 73 | 74 | if isinstance(conversionsegment, ConversionSegment): 75 | timeunitmessage = "" 76 | if conversionsegment.processedrows is None: 77 | timeunitmessage = conversionsegment.process() 78 | 79 | if outputfile is not None: 80 | print("conversionwrite segment size %d table '%s'; %s" % (len(conversionsegment.processedrows), conversionsegment.tab.name, timeunitmessage)) 81 | for row in conversionsegment.processedrows: 82 | csv_writer.writerow(Lyield_dimension_values(row, isegmentnumber, Cheaderadditionals)) 83 | row_count += 1 84 | 85 | else: # pandas.DataFrame case 86 | assert pandas is not None 87 | if outputfile is not None: 88 | print("pdconversionwrite segment size %d" % (len(conversionsegment))) 89 | for i in range(len(conversionsegment)): # quick and dirty reuse of the same dict-based function 90 | csv_writer.writerow(Lyield_dimension_values(dict(conversionsegment.iloc[i].dropna()), isegmentnumber, Cheaderadditionals)) 91 | row_count += 1 92 | 93 | csv_writer.writerow(["*"*9, row_count]) 94 | if outputfile is not None: 95 | filehandle.close() 96 | else: 97 | return filehandle.getvalue() 98 | 99 | 100 | 101 | def readtechnicalCSV(wdafile, bverbose=False, baspandas=True): 102 | "Read a WDA CSV back from its file into a lookup table from segment number to segments (each a list of dicts)" 103 | if baspandas and pandas is None: 104 | baspandas = False 105 | 106 | if isinstance(wdafile, str): 107 | if len(wdafile) > 200 and '\n' in wdafile: 108 | filehandle = io.StringIO(wdafile) 109 | else: 110 | filehandle = open(wdafile, "r", encoding='utf-8') 111 | else: 112 | assert isinstance(wdafile, io.StringIO) 113 | filehandle = wdafile 114 | 115 | wdain = csv.reader(filehandle) 116 | # First check that the headers are what we expect 117 | wdaheaders = wdain.__next__() 118 | numheaderadditionals = (len(wdaheaders) - len(template.headermeasurements))//len(template.headeradditionals) 119 | if not (wdaheaders == HLDUPgenerate_header_row(numheaderadditionals)): 120 | print("WDA headers don't match; 
nothing is likely to work now") 121 | 122 | wdasegments = { } # { segmentnumber: ( [ data_dicts ], [ordered_header_list] ) } 123 | previsegmentnumber = None 124 | segmentheaderssegmentL = [ ] # [ [ordered_header_list] ] 125 | 126 | for row in wdain: 127 | if row[0] == '*********': 128 | nrows = sum(len(wdasegment) for wdasegment, segmentheaders in wdasegments.values()) 129 | if int(row[1]) != nrows: 130 | warnings.warn("row number doesn't match %d should be %d" % (int(row[1]), nrows)) 131 | assert len(list(wdain)) == 0, "***** must be on last row" 132 | break 133 | 134 | dval = { } 135 | isegmentnumber = None 136 | for r, k in zip(row, template.headermeasurements): 137 | if isinstance(k, tuple): 138 | nk = k[1] 139 | if r: 140 | assert nk not in dval or dval[nk] == r 141 | dval[nk] = r 142 | else: 143 | assert not dval.get(nk) 144 | elif k == template.conversionsegmentnumbercolumn and r: 145 | isegmentnumber = int(r) 146 | else: 147 | assert not r 148 | 149 | lnumheaderadditionals = (len(row) - len(template.headermeasurements)) 150 | assert lnumheaderadditionals % len(template.headeradditionals) == 0 151 | numheaderadditionals = lnumheaderadditionals//len(template.headeradditionals) 152 | 153 | segmentheaderssegmentJ = [ ] 154 | for i in range(numheaderadditionals): 155 | rname, rvalue = None, None 156 | i0 = len(template.headermeasurements) + i*len(template.headeradditionals) 157 | for r, k in zip(row[i0:i0+len(template.headeradditionals)], template.headeradditionals): 158 | if isinstance(k, tuple): 159 | if k[1] == "NAME": 160 | assert rname is None or rname == r, (rname, r) 161 | rname = r 162 | else: 163 | assert k[1] == "VALUE" 164 | assert rvalue is None or rvalue == r 165 | rvalue = r 166 | else: 167 | assert not r 168 | assert rname, (rname, dval, row) 169 | dval[rname] = rvalue 170 | segmentheaderssegmentJ.append(rname) 171 | 172 | if isegmentnumber is None: 173 | if not segmentheaderssegmentL or segmentheaderssegmentL[-1] != segmentheaderssegmentJ: 174 | segmentheaderssegmentL.append(segmentheaderssegmentJ) 175 | isegmentnumber = len(segmentheaderssegmentL) - 1 176 | elif isegmentnumber in wdasegments: 177 | assert wdasegments[isegmentnumber][1] == segmentheaderssegmentJ 178 | 179 | if isegmentnumber not in wdasegments: 180 | if bverbose and previsegmentnumber is not None: 181 | print("segment %d loaded with %d rows" % (previsegmentnumber, len(wdasegments[previsegmentnumber][0]))) 182 | wdasegments[isegmentnumber] = ([ ], segmentheaderssegmentJ) 183 | 184 | wdasegments[isegmentnumber][0].append(dval) 185 | previsegmentnumber = isegmentnumber 186 | if bverbose and previsegmentnumber is not None: 187 | print("segment %d loaded with %d rows" % (previsegmentnumber, len(wdasegments[previsegmentnumber][0]))) 188 | filehandle.close() 189 | 190 | if not baspandas: 191 | return [ wdasegment for wdasegment, segmentheaders in wdasegments.values() ] 192 | 193 | res = [ ] 194 | for wdasegment, segmentheaders in wdasegments.values(): 195 | df = pandas.DataFrame.from_dict(wdasegment) 196 | 197 | # sort the columns (problem with using from_dict) 198 | dfcols = list(df.columns) 199 | newdfcols = [ ] 200 | for k in template.headermeasurements: 201 | if isinstance(k, tuple): 202 | if k[1] in dfcols: 203 | newdfcols.append(k[1]) 204 | dfcols.remove(k[1]) 205 | for segmentheader in segmentheaders: 206 | assert segmentheader in dfcols 207 | newdfcols.append(segmentheader) 208 | dfcols.remove(segmentheader) 209 | assert not dfcols, ("unexplained extra columns", dfcols) 210 | 211 | 
res.append(df[newdfcols]) # map the new column list in 212 | return res 213 | 214 | 215 | 216 | # code below should probably be deprecated, or at least upgraded to pandas comparison functionality 217 | 218 | # separated out so we can decide the severity of them before printing them out 219 | wdamsgstrings = { 220 | "WDAHEADERSINCONSISTENT": "Inconsistent extra headings in segment: %s", 221 | "WDAHEADERSMISSING": "Headings in segment not present in wda file: %s", 222 | "WDAHEADERSEXTRA": "Extra headings in wda file not in segment: %s", 223 | "WDACOLUMNNOTCONSTANT": "Constant column %s has multiple values: %s", 224 | "WDACOLUMNCONSTCHANGED": "Constant column %s is %s in segment but %s in wda file", 225 | "NEWVALUESINSEGMENT": "Unmatched new values in segment %s", 226 | "WDAEXTRAVALUES": "Unmatched extra values in wda file %s", 227 | "WDADUPLICATESMISMATCH": "Duplicates mismatch counts %s", 228 | "EXTRAWDACONVERSIONSEGMENTS": "Extra conversion segments in wda file %s", 229 | } 230 | 231 | def headersfromwdasegment(wdaseg, msglist): 232 | derivedheaders = [ databaker.constants.OBS ] + (template.SH_Create_ONS_time and [ databaker.constants.TIMEUNIT ] or []) + (databaker.constants.DATAMARKER and [databaker.constants.DATAMARKER] or []) 233 | headersunion = None 234 | headersintersection = None 235 | for wdarow in wdaseg: 236 | ahset = set(k for k in wdarow.keys() if k not in derivedheaders) 237 | if headersunion is None: 238 | headersintersection = set(ahset) 239 | headersunion = set(ahset) 240 | else: 241 | headersunion.update(ahset) 242 | headersintersection.intersection_update(ahset) 243 | if headersunion != headersintersection: 244 | msglist.append(("WDAHEADERSINCONSISTENT", headersunion.difference(headersintersection))) 245 | return headersintersection 246 | 247 | def extraheaderscheck(conversionsegment, wdaseg, msglist): 248 | wdaheaders = headersfromwdasegment(wdaseg, msglist) 249 | segmentheaders = set(c.label for c in conversionsegment.dimensions) 250 | extraheadersinsegment = segmentheaders.difference(wdaheaders) 251 | extraheadersinwdaseg = wdaheaders.difference(segmentheaders) 252 | if extraheadersinsegment: 253 | msglist.append(("WDAHEADERSMISSING", extraheadersinsegment)) 254 | if extraheadersinwdaseg: 255 | msglist.append(("WDAHEADERSEXTRA", extraheadersinwdaseg)) 256 | return wdaheaders.intersection(segmentheaders) 257 | 258 | def checktheconstantdimensions(conversionsegment, headers, wdaseg, msglist): 259 | for dimension in conversionsegment.dimensions: 260 | if dimension.label in headers: 261 | if dimension.hbagset is None: 262 | constval = dimension.cellvalueoverride.get(None) 263 | wdaconst = set(row.get(dimension.label) for row in wdaseg) 264 | if len(wdaconst) != 1: 265 | msglist.append(("WDACOLUMNNOTCONSTANT", (dimension.label, wdaconst))) 266 | elif constval not in wdaconst: 267 | msglist.append(("WDACOLUMNCONSTCHANGED", (dimension.label, constval, wdaconst.pop()))) 268 | headers.remove(dimension.label) 269 | return headers 270 | 271 | def checksegmentobsvalues(processedrows, headers, wdaseg, msglist): 272 | oheaders = [databaker.constants.OBS]+list(headers) 273 | 274 | # produce counts of each element in case there are duplicates (the order of the lists is not preserved) 275 | ccounts = collections.Counter(tuple(row.get(h) for h in oheaders) for row in processedrows) 276 | wcounts = collections.Counter(tuple(wrow.get(h) for h in oheaders) for wrow in wdaseg) 277 | cset = set(ccounts.keys()) 278 | wset = set(wcounts.keys()) 279 | 280 | cdiffextra = 
cset.difference(wset) 281 | sdiffextra = wset.difference(cset) 282 | if cdiffextra: 283 | msglist.append(("NEWVALUESINSEGMENT", cdiffextra)) 284 | if sdiffextra: 285 | msglist.append(("WDAEXTRAVALUES", sdiffextra)) 286 | 287 | dupmismatch = { } 288 | for s in cset.intersection(wset): 289 | if ccounts[s] != wcounts[s]: 290 | dupmismatch[s] = (ccounts[s], wcounts[s]) 291 | if dupmismatch: 292 | msglist.append(("WDADUPLICATESMISMATCH", dupmismatch)) 293 | 294 | 295 | def CompareConversionSegments(conversionsegments, wdafile, bprintwarnings): 296 | bverbose = True 297 | if type(conversionsegments) is ConversionSegment: 298 | conversionsegments = [conversionsegments] 299 | 300 | msglistperseg = { } 301 | wdasegs = readtechnicalCSV(wdafile, bverbose) 302 | extracsegs = [ c for c in wdasegs.keys() if not 0<=c\n') 46 | key.append('') 47 | for i, label, bag in tsubs: 48 | for h in bag: 49 | ixyheaderlookup[(h.x, h.y)] = i 50 | if blocalstylesheet: 51 | key.append('%s' % (i, label)) 52 | else: 53 | key.append('%s' % (i, colourlist.get(i,"white"), label)) 54 | key.append('') 55 | key.append('\n') 56 | 57 | 58 | sty = [ ] 59 | sty.append("\n\n") 77 | 78 | htm = [ ] 79 | htm.append('\n') 80 | htm.append('\n' % tab.name) 81 | for row in tab.rows(): 82 | htm.append("") 83 | assert len(row) == tab._max_x + 1 84 | rrow = sorted(row, key=lambda X: X.x) 85 | for c in rrow: 86 | ih = ixyheaderlookup.get((c.x, c.y)) 87 | if blocalstylesheet: 88 | cs = [ ] 89 | if ih is not None: cs.append("xc%s" % ih) 90 | if c.properties.cell.sheet.book.font_list: # overcome bug in messytables caused by https://www.communities-ni.gov.uk/sites/default/files/publications/communities/ni-housing-stats-15-16-tables1.xlsx 91 | if c.properties.get_bold(): cs.append("xb") 92 | if c.is_number(): cs.append("xn") 93 | htm.append('") 115 | htm.append("\n") 116 | htm.append("
%s
' % (" ".join(cs), c.x, c.y)) 94 | else: 95 | ls = [ ] 96 | if ih is not None: ls.append("background-color:%s" % colourlist.get(ih,"white")) 97 | if c.properties.cell.sheet.book.font_list: 98 | if c.properties.get_bold(): ls.append("font-weight:bold") 99 | lss = ' style="%s"' % ";".join(ls) if ls else '' 100 | htm.append('' % (lss, c.x, c.y)) 101 | 102 | if (c.x, c.y) in consolidatedcellvalueoverride: 103 | prevcellval = svalue(c) or "*blank*" # want to see empty cells that have been overwritten 104 | overridecellval = consolidatedcellvalueoverride[(c.x, c.y)] 105 | if blocalstylesheet: 106 | htm.append('%s%s' % (prevcellval, overridecellval)) 107 | else: 108 | htm.append('%s%s' % (prevcellval, overridecellval)) 109 | else: 110 | htm.append(svalue(c)) 111 | 112 | if (c.x, c.y) in consolidatedcellvalueoverride: 113 | consolidatedcellvalueoverride 114 | htm.append("
\n") 117 | 118 | jsty = "".join(sty) 119 | jkey = "".join(key) 120 | jhtm = "".join(htm) 121 | return "%s\n%s\n%s\n" % (jsty, jkey, jhtm) 122 | 123 | jscode = """ 124 | 152 | """ 153 | 154 | # generate the lookup table from titles to references 155 | def calcjslookup(conversionsegment): 156 | obslist = list(conversionsegment.segment.unordered_cells) # list(segment) otherwise gives bags of one element 157 | 158 | # this is where we could check/override the lookup values in some way 159 | dimvalues = [ [ hdim.cellvalobs(ob)[0] for hdim in conversionsegment.dimensions if hdim.hbagset is not None ] for ob in obslist ] 160 | jslookup = '{%s}' % ",".join('"%d %d":[%s]' % (k.x, k.y, ",".join("%d,%d" % (d.x, d.y) for d in tup if d)) \ 161 | for k, tup in zip(obslist, dimvalues)) 162 | return jslookup 163 | 164 | 165 | # could do this as a html-frame and reload 166 | def sidewindowhtmldisplay(): 167 | sjs = ''' 168 | 177 | ''' 178 | display(HTML(sjs % dividNUM)) 179 | 180 | 181 | def savepreviewhtml(conversionsegment, fname=None, verbose=True): 182 | "Preview a highlighted table, cellbag, dimension, list of bags or ConversionSegment inline or into a secondary html file" 183 | # wrap a singleton or list of bags, tables and HDims to a ConversionSegment 184 | if not isinstance(conversionsegment, ConversionSegment): 185 | param1 = conversionsegment 186 | if not isinstance(param1, (tuple, list)): 187 | param1 = [param1] 188 | 189 | tab = None 190 | dimensions = [ ] 191 | for i, p in enumerate(param1): 192 | if "Table" in str(type(p)): 193 | ltab = p 194 | lhdim = None 195 | elif isinstance(p, HDim): 196 | ltab = p.hbagset.table 197 | lhdim = p 198 | else: 199 | ltab = p.table 200 | lhdim = HDim(p, "item %d"%i, databaker.constants.DIRECTLY, databaker.constants.ABOVE) # (fake lookup) 201 | 202 | if not tab: 203 | tab = ltab 204 | else: 205 | assert ltab is tab, "must all be same table" 206 | 207 | if lhdim: 208 | dimensions.append(lhdim) 209 | 210 | conversionsegment = ConversionSegment(tab, dimensions, []) 211 | 212 | # now we have a ConversionSegment 213 | incrementdividNUM() 214 | if fname is None: 215 | fout = io.StringIO() 216 | blocalstylesheet = not (len(conversionsegment.tab) < 1500) 217 | else: 218 | fout = io.open(fname, "w", encoding='utf-8') 219 | fout.write("\n%s\n\n" % conversionsegment.tab.name) 220 | blocalstylesheet = True 221 | 222 | htmtable = tabletohtml(conversionsegment.tab, conversionsegment.dsubsets(), conversionsegment.consolidatedcellvalueoverride(), blocalstylesheet) 223 | fout.write('
\n' % (dividNUM)) 224 | fout.write(htmtable) 225 | fout.write('
\n') 226 | 227 | if fname is not None and verbose: 228 | print("tablepart '%s' written #%s" % (conversionsegment.tab.name, dividNUM)) 229 | if conversionsegment.dimensions and conversionsegment.segment: 230 | jslookup = calcjslookup(conversionsegment) 231 | if fname is not None and verbose: 232 | print("javascript calculated") 233 | fout.write(jscode % (jslookup, dividNUM)) 234 | 235 | if fname is None: 236 | display(HTML(fout.getvalue())) 237 | else: 238 | fout.write("\n") 239 | fout.close() 240 | local_file = FileLink(path=os.path.basename(fname), 241 | result_html_prefix="Written to file: ") 242 | display(local_file) 243 | -------------------------------------------------------------------------------- /databaker/jupybakeutils.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # HTML preview of the dimensions and table (will be moved to a function in databakersolo) 3 | 4 | import io, os, collections, re, warnings, csv, datetime 5 | import databaker.constants 6 | import xypath 7 | from databaker import richxlrd 8 | template = databaker.constants.template 9 | 10 | try: import pandas 11 | except ImportError: pandas = None # no pandas in pypy 12 | 13 | def svalue(cell): 14 | if not isinstance(cell.value, datetime.datetime): 15 | return str(cell.value) 16 | # the fmt string is some excel generated garbage format, like: '[$-809]dd\\ mmmm\\ yyyy;@' 17 | # the xlrd module does its best and creates a date tuple, which messytables constructs into a datetime using xldate_as_tuple() 18 | xls_format = cell.properties['formatting_string'].upper() 19 | quarter = int((cell.value.month -1 ) // 3) + 1 20 | if 'Q' in xls_format: py_format = "%Y Q{quarter}" # may be very rare 21 | elif 'D' in xls_format: py_format = "%Y-%m-%d" 22 | elif 'M' in xls_format: py_format = "%b %Y" 23 | elif 'Y' in xls_format: py_format = "%Y" 24 | else: py_format = "%Y-%m-%d" 25 | return cell.value.strftime(py_format).format(quarter=quarter) 26 | 27 | 28 | class HDim: 29 | "Dimension object which defines the lookup between an observation cell and a bag of header cells" 30 | def __init__(self, hbagset, label, strict=None, direction=None, cellvalueoverride=None): 31 | self.label = label 32 | self.name = label 33 | 34 | self.cellvalueoverride = cellvalueoverride or {} # do not put {} into default value otherwise there is only one static one for everything 35 | assert not isinstance(hbagset, str), "Use empty set and default value for single value dimension" 36 | self.hbagset = hbagset 37 | self.bhbagsetCopied = False 38 | 39 | if self.hbagset is None: # single value type 40 | assert direction is None and strict is None 41 | assert len(cellvalueoverride) == 1 and None in cellvalueoverride, "single value type should have cellvalueoverride={None:defaultvalue}" 42 | return 43 | 44 | assert isinstance(self.hbagset, xypath.xypath.Bag), "dimension should be made from xypath.Bag type, not %s" % type(self.hbagset) 45 | self.strict = strict 46 | self.direction = direction 47 | assert direction is not None and strict is not None 48 | 49 | self.bxtype = (self.direction[1] == 0) 50 | self.samerowlookup = None 51 | 52 | 53 | def celllookup(self, scell): 54 | "Lookup function from a given cell to the matching header cell" 55 | 56 | # caching that can be removed in AddCellValueOverride 57 | if self.strict and self.samerowlookup is None: 58 | self.samerowlookup = {} 59 | for hcell in self.hbagset.unordered_cells: 60 | k = hcell.y if self.bxtype else hcell.x 61 | if k not in 
self.samerowlookup: 62 | self.samerowlookup[k] = [] 63 | self.samerowlookup[k].append(hcell) 64 | 65 | def mult(cell): 66 | return cell.x * self.direction[0] + cell.y * self.direction[1] 67 | def dgap(cell, target_cell): 68 | if self.direction[1] == 0: 69 | return abs(cell.x - target_cell.x) 70 | return abs(cell.y - target_cell.y) 71 | 72 | def betweencells(scell, target_cell, best_cell): 73 | if mult(scell) <= mult(target_cell): 74 | if not best_cell or mult(target_cell) <= mult(best_cell): 75 | return True 76 | return False 77 | 78 | def same_row_col(a, b): 79 | return (a.x - b.x == 0 and self.direction[0] == 0) or (a.y - b.y == 0 and self.direction[1] == 0) 80 | 81 | if self.strict: 82 | hcells = self.samerowlookup.get(scell.y if self.bxtype else scell.x, []) 83 | else: 84 | hcells = self.hbagset.unordered_cells 85 | 86 | 87 | best_cell = None 88 | second_best_cell = None 89 | 90 | #if strict: print(len(list(hcells)), len(list(hbagset.unordered_cells))) 91 | for target_cell in hcells: 92 | if betweencells(scell, target_cell, best_cell): 93 | if not self.strict or same_row_col(scell, target_cell): 94 | second_best_cell = best_cell 95 | best_cell = target_cell 96 | if second_best_cell and mult(best_cell) == mult(second_best_cell): 97 | raise xypath.LookupConfusionError("{!r} is as good as {!r} for {!r}".format(best_cell, second_best_cell, scell)) 98 | if best_cell is None: 99 | return None 100 | return best_cell 101 | 102 | def headcellval(self, hcell): 103 | "Extract the string value of a member header cell (including any value overrides)" 104 | if hcell is not None: 105 | assert isinstance(hcell, xypath.xypath._XYCell), "celllookups should only go to an _XYCell" 106 | if hcell in self.cellvalueoverride: 107 | val = self.cellvalueoverride[hcell] 108 | assert isinstance(val, (str, float, int)), "Override from hcell value should go directly to a str,float,int,None-value (%s)" % type(val) 109 | return val 110 | val = svalue(hcell) 111 | #assert val is None or isinstance(val, (str, float, int)), "cell value should only be str,float,int,None (%s)" % type(val) 112 | else: 113 | val = None 114 | 115 | # It's allowed to have {None:defaultvalue} to set the NoLookupValue 116 | if val in self.cellvalueoverride: 117 | val = self.cellvalueoverride[val] 118 | assert val is None or isinstance(val, (str, float, int)), "Override from value should only be str,float,int,None (%s)" % type(val) 119 | 120 | # type call if no other things match 121 | elif type(val) in self.cellvalueoverride: 122 | val = self.cellvalueoverride[type(val)](val) 123 | 124 | return val 125 | 126 | 127 | def cellvalobs(self, ob): 128 | "Full lookup from an observation cell to its dimensional value (overrides can apply before the lookup)" 129 | if isinstance(ob, xypath.xypath.Bag): 130 | assert len(ob) == 1, "Can only lookupobs a single cell" 131 | ob = ob._cell 132 | assert isinstance(ob, xypath.xypath._XYCell), "Lookups only allowed on an obs cell" 133 | 134 | # we do two steps through cellvalueoverride in three places on mutually distinct sets (obs, heading, strings) 135 | # and not recursively as these are wholly different applications. 
the celllookup is itself like a cellvalueoverride 136 | if ob in self.cellvalueoverride: 137 | val = self.cellvalueoverride[ob] # knock out an individual obs for this cell 138 | assert isinstance(val, str), "Override from obs should go directly to a string-value" 139 | return None, val 140 | 141 | if self.hbagset is not None: 142 | hcell = self.celllookup(ob) 143 | else: 144 | hcell = None 145 | 146 | return hcell, self.headcellval(hcell) 147 | 148 | def AddCellValueOverride(self, overridecell, overridevalue): 149 | "Override the value of a header cell (and insert it if not present in the bag)" 150 | if isinstance(overridecell, str): 151 | self.cellvalueoverride[overridecell] = overridevalue 152 | return 153 | if overridecell is None: 154 | self.cellvalueoverride[overridecell] = overridevalue 155 | return 156 | if isinstance(overridecell, xypath.xypath.Bag): 157 | assert len(overridecell) == 1, "Can only override a single cell" 158 | overridecell = overridecell._cell 159 | assert isinstance(overridecell, xypath.xypath._XYCell), "Overrides only allowed on a single header cell" 160 | 161 | # add the cell into the base set of cells if it's new 162 | if overridecell not in self.hbagset.unordered_cells: 163 | if not self.bhbagsetCopied: 164 | self.hbagset = self.hbagset | (self.hbagset.by_index(1) if len(self.hbagset) else self.hbagset) # force copy by adding element from itself 165 | self.bhbagsetCopied = True # avoid inefficient copying every single time 166 | self.hbagset.add(overridecell) 167 | self.samerowlookup = None # abolish any caching 168 | else: 169 | if overridecell in self.cellvalueoverride: 170 | if self.cellvalueoverride[overridecell] != overridevalue: 171 | warnings.warn("Cell %s was already overridden by value %s; is this a mistake?" % (overridecell, self.cellvalueoverride[overridecell])) 172 | 173 | assert overridevalue is None or isinstance(overridevalue, (str, float, int)), "Override from value should only be str,float,int,None (%s)" % type(overridevalue) 174 | self.cellvalueoverride[overridecell] = overridevalue 175 | 176 | def discardcellsnotlookedup(self, obs): 177 | "Remove header cells that none of the observation cells look up to" 178 | hbagsetT = xypath.xypath.Bag(self.hbagset.table) 179 | for ob in obs.unordered_cells: 180 | hbagsetT.add(self.celllookup(ob)) 181 | self.hbagset = hbagsetT 182 | 183 | def valueslist(self): 184 | "List of all the header cell values" 185 | return [self.headcellval(cell) for cell in sorted(self.hbagset.unordered_cells, key=lambda cell: (cell.y, cell.x))] 186 | 187 | def checkvalues(self, vlist): 188 | "Check that the header cell values match" 189 | scells = sorted(self.hbagset.unordered_cells, key=lambda cell: (cell.y, cell.x)) 190 | if len(scells) != len(vlist): 191 | warnings.warn("checkvalues list length doesn't match") 192 | return False 193 | 194 | for cell, v in zip(scells, vlist): 195 | nv = self.headcellval(cell) 196 | if nv != v: 197 | warnings.warn("checkvalues mismatch in cell (%d,%d) cell value '%s' doesn't match '%s'" % (cell.x, cell.y, nv, v)) 198 | return False 199 | return True 200 | 201 | 202 | def HDimConst(name, val): 203 | "Define a constant value dimension across the whole segment" 204 | return HDim(None, name, cellvalueoverride={None:val}) 205 | 206 | 207 | def Ldatetimeunitloose(date): 208 | if not isinstance(date, str): 209 | if isinstance(date, (float, int)) and 1000<=date<=9999 and int(date)==date: 210 | return "Year" 211 | return '' 212 | d = date.strip() 213 | if re.match('\d{4}(?:\.0)?$', d): 214 | return 'Year' 
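# A rough sketch of the classification the remaining patterns perform (the
# example inputs are illustrative, not taken from any particular sheet):
#   >>> Ldatetimeunitloose("2014 Q3"), Ldatetimeunitloose("Q3 2014")
#   ('Quarter', 'Quarter')
#   >>> Ldatetimeunitloose("Jul-Sep 2014")
#   'Quarter'
#   >>> Ldatetimeunitloose("Jul 2014")
#   'Month'
# Anything unrecognized falls through to '' (no identifiable time unit).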
215 | if re.match('\d{4}(?:\.0)?\s*[Qq]\d$', d): 216 | return 'Quarter' 217 | if re.match('[Qq]\d\s*\d{4}(?:\.0)?$', d): 218 | return 'Quarter' 219 | if re.match('[A-Za-z]{3}-[A-Za-z]{3}\s*\d{4}(?:\.0)?$', d): 220 | return 'Quarter' 221 | if re.match('[A-Za-z]{3}\s*\d{4}(?:\.0)?$', d): 222 | return 'Month' 223 | return '' 224 | 225 | def Ldatetimeunitforce(st, timeunit): 226 | st = str(st).strip() 227 | if timeunit == 'Year': 228 | mst = re.match("(\d\d\d\d)(?:\.0)?$", st) 229 | if mst: 230 | return mst.group(1) 231 | 232 | elif timeunit == "Quarter": 233 | mq1 = re.match('(\d{4})(?:\.0)?\s*[Qq](\d)', st) 234 | mq2 = re.match('([A-Za-z]{3}-[A-Za-z]{3})\s*(\d{4})', st) 235 | mq3 = re.match('[Qq](\d)\s*(\d{4})', st) 236 | if mq1: 237 | return "%s Q%s" % (mq1.group(1), mq1.group(2)) 238 | if mq2: 239 | return "%s %s" % (mq2.group(1), mq2.group(2)) 240 | if mq3: 241 | return "%s Q%s" % (mq3.group(2), mq3.group(1)) 242 | 243 | elif timeunit == "Month": 244 | mm1 = re.match('\s*([A-Za-z]{3})\s*(\d{4})', st) 245 | if mm1: 246 | return "%s %s" % (mm1.group(1), mm1.group(2)) 247 | elif timeunit == "": 248 | return st 249 | else: 250 | timeunit = "unknown:%s" % timeunit 251 | warnings.warn("TIME %s disagrees with TIMEUNIT %s" % (st, timeunit)) 252 | return st 253 | 254 | 255 | def HLDUPgenerate_header_row(numheaderadditionals): 256 | res = [ (k[0] if isinstance(k, tuple) else k) for k in template.headermeasurements ] 257 | for i in range(numheaderadditionals): 258 | for k in template.headeradditionals: 259 | if isinstance(k, tuple): 260 | sk = k[0] 261 | else: 262 | sk = k 263 | res.append("%s_%d" % (sk, i+1)) 264 | return res 265 | 266 | 267 | class ConversionSegment: 268 | "Single output table object generated from a bag of observations that look up to a list of dimensions" 269 | def __init__(self, observations, dimensions, Lobservations=None, processTIMEUNIT=True, includecellxy=False): 270 | if Lobservations is None: # new format that drops the unnecessary table element 271 | tab = observations.table 272 | Lobservations = observations 273 | else: 274 | tab = observations # old function format 275 | 276 | self.tab = tab 277 | self.dimensions = dimensions 278 | self.segment = Lobservations # original name for observations list 279 | 280 | self.processtimeunit = processTIMEUNIT 281 | self.includecellxy = includecellxy 282 | 283 | for dimension in self.dimensions: 284 | assert isinstance(dimension, HDim), ("Dimensions must have type HDim()") 285 | assert dimension.hbagset is None or dimension.hbagset.table is tab, "dimension %s from different tab" % dimension.name 286 | 287 | self.numheaderadditionals = sum(1 for dimension in self.dimensions if dimension.label not in template.headermeasurementnamesSet) 288 | 289 | # generate the ordered obslist here (so it is fixed here and can be reordered before processing) 290 | if isinstance(self.segment, xypath.xypath.Bag): 291 | assert self.segment.table is tab, "segments from different tab" 292 | self.obslist = list(self.segment.unordered_cells) # list(segment) otherwise gives bags of one element 293 | self.obslist.sort(key=lambda cell: (cell.y, cell.x)) 294 | else: 295 | assert isinstance(self.segment, (tuple, list)), "segment needs to be a Bag or a list, not a %s" % type(self.segment) 296 | self.obslist = self.segment 297 | 298 | # holding place for output of processing. 
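# (A minimal usage sketch with illustrative names, not part of the class:
#    cs = ConversionSegment(observations, dimensions)
#    msg = cs.process()     # fills cs.processedrows, returns a TIMEUNIT note
#    df = cs.topandas()     # or convert the processed rows to a DataFrame
# )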
299 | # technically no reason we shouldn't process at this point either, on this constructor, 300 | # but doing it in stages allows for interventions along the way 301 | self.processedrows = None 302 | 303 | 304 | # used in tabletohtml for the subsets, and where we would find the mappings for over-ride values 305 | def dsubsets(self): 306 | tsubs = [ ] 307 | if self.segment: 308 | tsubs.append((0, "OBS", self.segment)) 309 | for i, dimension in enumerate(self.dimensions): 310 | if dimension.hbagset is not None: # filter out TempValue headers 311 | tsubs.append((i+1, dimension.name, dimension.hbagset)) 312 | return tsubs 313 | 314 | # also used in tabletohtml, to flag the header cells whose values have been overridden 315 | def consolidatedcellvalueoverride(self): 316 | res = { } 317 | for i, dimension in enumerate(self.dimensions): 318 | if dimension.hbagset is not None: # filter out TempValue headers 319 | for hcell in dimension.hbagset.unordered_cells: 320 | sval = svalue(hcell) 321 | val = hcell.value 322 | if hcell in dimension.cellvalueoverride: 323 | val = str(dimension.cellvalueoverride[hcell]) 324 | elif sval in dimension.cellvalueoverride: 325 | val = str(dimension.cellvalueoverride[sval]) 326 | elif type(hcell.value) in dimension.cellvalueoverride: 327 | val = str(dimension.cellvalueoverride[type(hcell.value)](hcell.value)) 328 | else: 329 | val = sval 330 | if val != sval: 331 | res[(hcell.x, hcell.y)] = val 332 | return res 333 | 334 | # individual lookup across the dimensions here 335 | def lookupobs(self, ob): 336 | if isinstance(ob, xypath.xypath.Bag): 337 | assert len(ob) == 1, "Can only lookupobs on a single cell" 338 | ob = ob._cell 339 | 340 | # force it to be float and split off anything not float into the datamarker 341 | if not isinstance(ob.value, float): 342 | if ob.properties['richtext']: # should this case be implemented into the svalue() function? 
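# richtext cells are read through richxlrd so that superscript note-markers
# are dropped from the value; plain cells go through svalue(). The
# SH_Split_OBS branch below then divides a non-numeric observation such as
# "12.3p" (an illustrative value) into OBS=12.3 and DATAMARKER="p" using the
# leading-number regex.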
343 | sval = richxlrd.RichCell(ob.properties.cell.sheet, ob.y, ob.x).fragments.not_script.value 344 | else: 345 | sval = svalue(ob) 346 | 347 | if template.SH_Split_OBS: 348 | assert template.SH_Split_OBS == databaker.constants.DATAMARKER, (template.SH_Split_OBS, databaker.constants.DATAMARKER) 349 | ob_value, dm_value = re.match(r"([-+]?[0-9]+\.?[0-9]*)?(.*)", sval).groups() 350 | dval = { } 351 | if dm_value: 352 | dval[template.SH_Split_OBS] = dm_value 353 | if ob_value: 354 | dval[databaker.constants.OBS] = float(ob_value) 355 | else: 356 | dval[databaker.constants.OBS] = "" 357 | else: 358 | dval = { databaker.constants.OBS:sval } 359 | else: 360 | dval = { databaker.constants.OBS:ob.value } 361 | 362 | for hdim in self.dimensions: 363 | hcell, val = hdim.cellvalobs(ob) 364 | dval[hdim.label] = val 365 | 366 | if self.includecellxy: 367 | dval["__x"] = ob.x 368 | dval["__y"] = ob.y 369 | dval["__tablename"] = self.tab.name 370 | return dval 371 | 372 | def guesstimeunit(self): 373 | for dval in self.processedrows: 374 | dval[template.TIMEUNIT] = Ldatetimeunitloose(dval[template.TIME]) 375 | ctu = collections.Counter(dval[template.TIMEUNIT] for dval in self.processedrows) 376 | if len(ctu) == 1: 377 | return "TIMEUNIT='%s'" % list(ctu.keys())[0] 378 | return "multiple TIMEUNITs: %s" % ", ".join("'%s'(%d)" % (k,v) for k,v in ctu.items()) 379 | 380 | def fixtimefromtimeunit(self): # this works individually and not across the whole segment homogeneously 381 | for dval in self.processedrows: 382 | dval[template.TIME] = Ldatetimeunitforce(dval[template.TIME], dval[template.TIMEUNIT]) 383 | 384 | def process(self): 385 | assert self.processedrows is None, "Conversion segment already processed" 386 | self.processedrows = [ self.lookupobs(ob) for ob in self.obslist ] 387 | 388 | kdim = dict((dimension.label, dimension) for dimension in self.dimensions) 389 | timeunitmessage = "" 390 | if self.processtimeunit: 391 | if template.SH_Create_ONS_time and ((template.TIMEUNIT not in kdim) and (template.TIME in kdim)): 392 | timeunitmessage = self.guesstimeunit() 393 | self.fixtimefromtimeunit() 394 | elif template.TIME in kdim and template.TIMEUNIT not in kdim: 395 | self.fixtimefromtimeunit() 396 | return timeunitmessage 397 | 398 | 399 | def topandas(self): 400 | if pandas is None: 401 | warnings.warn("Sorry, you do not have pandas installed in this environment") 402 | return None 403 | 404 | timeunitmessage = "" 405 | if self.processedrows is None: 406 | timeunitmessage = self.process() 407 | print(timeunitmessage) 408 | df = pandas.DataFrame.from_dict(self.processedrows) 409 | 410 | # sort the columns 411 | dfcols = list(df.columns) 412 | newdfcols = [ ] 413 | for k in template.headermeasurements: 414 | if isinstance(k, tuple): 415 | if k[1] in dfcols: 416 | newdfcols.append(k[1]) 417 | dfcols.remove(k[1]) 418 | for dimension in self.dimensions: 419 | if dimension.label not in template.headermeasurementnamesSet: 420 | assert dimension.label in dfcols 421 | newdfcols.append(dimension.label) 422 | dfcols.remove(dimension.label) 423 | 424 | for excol in ["__x", "__y", "__tablename"]: 425 | if excol in dfcols: 426 | if self.includecellxy: 427 | newdfcols.append(excol) 428 | dfcols.remove(excol) 429 | assert not dfcols, ("unexplained extra columns", dfcols) 430 | 431 | df = df[newdfcols] # map the new column list in 432 | return df 433 | 434 | def pdguessforceTIMEUNIT(df): 435 | df["TIMEUNIT"] = df.apply(lambda row: Ldatetimeunitloose(row.TIME), axis=1) 436 | df["TIME"] = df.apply(lambda row: 
Ldatetimeunitforce(row.TIME, row.TIMEUNIT), axis=1) 437 | 438 | 439 | -------------------------------------------------------------------------------- /databaker/overrides.py: -------------------------------------------------------------------------------- 1 | """ 2 | Patches xypath and messytables. 3 | """ 4 | 5 | import re 6 | import datetime 7 | import warnings 8 | 9 | import xypath 10 | import messytables 11 | 12 | class MatchNotFound(Exception): 13 | """failed to find match in bag.group""" 14 | pass 15 | 16 | # === Cell Overrides ====================================== 17 | 18 | def cell_repr(cell): 19 | column = xypath.contrib.excel.excel_column_label(cell.x+1) 20 | return "<{}{} {!r}>".format(column, cell.y+1, cell.value) 21 | xypath.xypath._XYCell.__repr__ = cell_repr 22 | 23 | # === TableSet Overrides ================================== 24 | 25 | @property 26 | def tabnames(tableset): 27 | return set(x.name for x in tableset.tables) 28 | messytables.TableSet.names = tabnames 29 | 30 | # === Table Overrides ===================================== 31 | 32 | def excel_ref(table, reference): 33 | if ':' not in reference: 34 | (col, row) = xypath.contrib.excel.excel_address_coordinate(reference, partial=True) 35 | return table.get_at(col, row) 36 | else: 37 | ((left, top), (right, bottom)) = xypath.contrib.excel.excel_range(reference) 38 | bag = xypath.Bag(table=table) 39 | if top is None and bottom is None: 40 | for col in range(left, right + 1): 41 | bag = bag | table.get_at(col, None) 42 | elif left is None and right is None: 43 | for row in range(top, bottom + 1): 44 | bag = bag | table.get_at(None, row) 45 | else: 46 | for row in range(top, bottom + 1): 47 | for col in range(left, right + 1): 48 | bag = bag | table.get_at(col, row) 49 | return bag 50 | xypath.Table.excel_ref = excel_ref 51 | 52 | # copied in just for one function to enable deletion of utils.py 53 | def Ddatematch(date, silent=False): 54 | """match mmm yyyy, mmm-mmm yyyy, yyyy Qn, yyyy""" 55 | if not isinstance(date, str): 56 | if (isinstance(date, float) or isinstance(date, int)) and date>=1000 and date<=9999 and int(date)==date: 57 | return "Year" 58 | if not silent: 59 | warnings.warn("Couldn't identify date {!r}".format(date)) 60 | return '' 61 | d = date.strip() 62 | if re.match('\d{4}$', d): 63 | return 'Year' 64 | if re.match('\d{4} [Qq]\d$', d): 65 | return 'Quarter' 66 | if re.match('[A-Za-z]{3}-[A-Za-z]{3} \d{4}$', d): 67 | return 'Quarter' 68 | if re.match('[A-Za-z]{3} \d{4}$', d): 69 | return 'Month' 70 | if not silent: 71 | warnings.warn("Couldn't identify date {!r}".format(date)) 72 | return '' 73 | 74 | # === Bag Overrides ======================================= 75 | 76 | xypath.Bag.regex = lambda self, x: self.filter(re.compile(x)) 77 | 78 | def is_date(bag): 79 | return bag.filter(lambda cell: Ddatematch(cell.value, silent=True)) 80 | xypath.Bag.is_date = is_date 81 | 82 | def is_number(bag): 83 | return bag.filter(lambda cell: isinstance(cell.value, (int, float, int))) 84 | xypath.Bag.is_number = is_number 85 | def is_not_number(bag): 86 | return bag.filter(lambda cell: not isinstance(cell.value, (int, float, int))) 87 | xypath.Bag.is_not_number = is_not_number 88 | 89 | def group(bag, regex): 90 | """get the text""" 91 | bag.assert_one() 92 | match = re.search(regex, bag.value) 93 | if not match: 94 | raise MatchNotFound("Can't find {!r} in {!r}".format(regex, bag.value)) 95 | matchtext = match.groups(0)[0] 96 | assert matchtext 97 | return matchtext 98 | xypath.Bag.group = group 99 | 100 | def 
one_of(bag, options): 101 | output = None 102 | for option in options: 103 | if output is None: 104 | output = bag.filter(option) 105 | else: 106 | output = output | bag.filter(option) 107 | return output 108 | xypath.Bag.one_of = one_of 109 | 110 | def parent(bag): 111 | """for cell, get its top-left cell""" 112 | output_bag = xypath.Bag(table = bag.table) 113 | for cell in bag.unordered: 114 | row, _, col, _ = cell.properties.raw_span(always=True) 115 | output_bag.add(cell.table.get_at(col, row)._cell) 116 | return output_bag 117 | xypath.Bag.parent = parent 118 | 119 | def children(bag): 120 | """for top-left cell, get all cells it spans""" 121 | outputbag = xypath.Bag(table=bag.table) 122 | for parent in bag: 123 | top, bottom, left, right = parent.properties.raw_span(always=True) 124 | for row in range(top, bottom + 1): 125 | for col in range(left, right + 1): 126 | outputbag = outputbag | bag.table.get_at(col, row) 127 | return outputbag 128 | xypath.Bag.children = children 129 | 130 | def rich_text(bag): 131 | r = bag.property.rich 132 | return r 133 | xypath.Bag.rich_text = rich_text 134 | 135 | def spaceprefix(bag, count): 136 | """filter: cells starting with exactly count whitespace: no more, no less""" 137 | return bag.filter(re.compile("^\s{%s}\S" % count)) 138 | xypath.Bag.spaceprefix = spaceprefix 139 | 140 | def is_whitespace(bag): 141 | """filter: cells which do not contain printable characters""" 142 | return bag.filter(lambda cell: not str(cell.value).strip()) 143 | xypath.Bag.is_whitespace = is_whitespace 144 | 145 | def is_not_whitespace(bag): 146 | """filter: cells which do contain printable characters""" 147 | return bag.filter(lambda cell: str(cell.value).strip()) 148 | xypath.Bag.is_not_whitespace = is_not_whitespace 149 | 150 | def by_index(bag, items): 151 | """filter: return numbered items from a bag. 152 | Note that this is 1-indexed! 153 | Items can be a list or a single number""" 154 | if isinstance(items, int): 155 | return bag.by_index([items]) 156 | new = xypath.Bag(table=bag.table) 157 | for i, cell in enumerate(bag): 158 | if i+1 in items: 159 | new.add(cell._cell) 160 | if i+1 == max(items): 161 | return new 162 | raise xypath.XYPathError("get_nth needed {} items, but bag only contained {}.\n{!r}".format(max(items), len(bag), bag)) 163 | xypath.Bag.by_index = by_index 164 | 165 | -------------------------------------------------------------------------------- /databaker/richxlrd/__init__.py: -------------------------------------------------------------------------------- 1 | from .richxlrd import RichCell, Fragments, Fragment 2 | -------------------------------------------------------------------------------- /databaker/richxlrd/rich.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cantabular/databaker/930b65e85038a3afd59574bbdc1d4cb79ce9b6ad/databaker/richxlrd/rich.xls -------------------------------------------------------------------------------- /databaker/richxlrd/richxlrd.py: -------------------------------------------------------------------------------- 1 | import xlrd 2 | 3 | "Horrid workaround! 
Can get 'no' every time from LibreOffice xls" 4 | @property 5 | def _bold(self): 6 | return self.weight > 500 7 | xlrd.formatting.Font._bold = _bold 8 | 9 | @property 10 | def _script(self): 11 | return self.escapement 12 | xlrd.formatting.Font.script = _script 13 | 14 | 15 | class RichCell(object): 16 | def __init__(self, sheet, y, x): 17 | self.sheet = sheet 18 | self.y = y 19 | self.x = x 20 | 21 | @property 22 | def cell(self): 23 | return self.sheet.cell(self.y, self.x) 24 | 25 | @property 26 | def raw_fontlist(self): 27 | """the position of a font change, and the new font code. 28 | note that it doesn't include the first font!""" 29 | return self.sheet.rich_text_runlist_map.get((self.y, self.x), []) 30 | 31 | @property 32 | def first_font(self): 33 | """the first font number""" 34 | xf = self.cell.xf_index 35 | return self.sheet.book.xf_list[xf].font_index 36 | 37 | @property 38 | def fontlist(self): 39 | full_fontlist = list(self.raw_fontlist) 40 | full_fontlist.insert(0, (0, self.first_font)) 41 | return list((pos, self.sheet.book.font_list[font]) for pos, font in full_fontlist) 42 | 43 | @property 44 | def fragments(self): 45 | fontlist = self.fontlist 46 | output = Fragments() 47 | for i, (start, font) in enumerate(fontlist): 48 | try: 49 | end = fontlist[i+1][0] 50 | except IndexError: 51 | end = None 52 | output.append(Fragment(self.cell.value[start:end], font)) 53 | start = end 54 | return output 55 | 56 | class Fragments(list): 57 | @classmethod 58 | def from_rich_text(self, richtext): 59 | return richtext.fragments 60 | 61 | @property 62 | def value(self): 63 | return ''.join(x.value for x in self) 64 | 65 | def __getattr__(self, v): 66 | if v.startswith('only_'): 67 | sense = True 68 | word = v[5:] 69 | elif v.startswith('not_'): 70 | sense = False 71 | word = v[4:] 72 | else: 73 | raise AttributeError("{!r} object has no attribute {!r}".format(self.__class__.__name__, v)) 74 | if word in ['bold']: 75 | word = '_' + word 76 | return Fragments(frag for frag in self if bool(getattr(frag.font, word)) == sense) 77 | 78 | 79 | class Fragment(object): 80 | def __init__(self, value, font): 81 | self.value = value 82 | self.font = font 83 | 84 | def __repr__(self): 85 | return "<{!r}:{!r}>".format(self.value, self.font) 86 | -------------------------------------------------------------------------------- /databaker/structure_csv_default.py: -------------------------------------------------------------------------------- 1 | """ 2 | Template/Options file for altering the structure of the .csv flatfile output. 
3 | 4 | """ 5 | 6 | import collections 7 | headermeasurements = [ 8 | ('observation', "OBS"), 9 | ('data_marking', "DATAMARKER"), 10 | ('statistical_unit_eng', "STATUNIT"), 'statistical_unit_cym', 11 | ('measure_type_eng', "MEASURETYPE"), 'measure_type_cym', 'observation_type', 'empty', 'obs_type_value', 12 | ('unit_multiplier', "UNITMULTIPLIER"), 13 | ('unit_of_measure_eng', "UNITOFMEASURE"), 'unit_of_measure_cym', 'confidentuality', 'empty1', 14 | ('geographic_area', "GEOG"), 'empty2', 'empty3', 15 | ('time_dim_item_id', "TIME"), 16 | ('time_dim_item_label_eng',"TIME"), 'time_dim_item_label_cym', 17 | ('time_type', "TIMEUNIT"), 'empty4', 18 | ('statistical_population_id', "STATPOP"), 19 | ('statistical_population_label_eng',"STATPOP"), 'statistical_population_label_cym', 20 | ('cdid', "CDID"), 'cdiddescrip', 'empty5', 'empty6', 'empty7', 'empty8', 'empty9', 'empty10', 'empty11', 'empty12' 21 | ] 22 | 23 | headeradditionals = [ 24 | ("dim_id", "NAME"), ("dimension_label_eng", "NAME"), "dimension_label_cym", 25 | ("dim_item_id", "VALUE"), ("dimension_item_label_eng", "VALUE"), "dimension_item_label_cym", 26 | "is_total", "is_sub_total" 27 | ] 28 | 29 | conversionsegmentnumbercolumn = "empty11" 30 | 31 | # Do we want to create a TIMEUNIT dimension using a TIME dimension - ONS specific 32 | SH_Create_ONS_time = True 33 | 34 | # Do you want to split the OBS, placing non float data into your next column. 35 | SH_Split_OBS = "DATAMARKER" # see value set to int value below 36 | 37 | 38 | #### Below this point is derived data (used in old code) from the above tables 39 | 40 | # derive the elements of the headernames above into the values below 41 | headermeasurementnames = list(collections.OrderedDict.fromkeys(k[1] for k in headermeasurements if isinstance(k, tuple))) 42 | headermeasurementnamesSet = set(headermeasurementnames) 43 | 44 | # Create variables (This is terrible!) 45 | # TODO: Do this more cleanly e.g. as in https://stackoverflow.com/q/4859217/ 46 | exec("%s = '%s'" % (", ".join(headermeasurementnames), "', '".join(map(str, headermeasurementnames)))) 47 | exec("SH_Split_OBS = %s" % SH_Split_OBS) 48 | 49 | __all__ = list(headermeasurementnames) # don't expose unnecessary items when using `from foo import *` 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /databaker/tutorial.py: -------------------------------------------------------------------------------- 1 | # Based on altair tutorial loader: 2 | # https://github.com/altair-viz/altair/blob/273a1fcf9cec1956474af755d5fe32f0e3f0aee8/altair/tutorial.py 3 | 4 | # Copyright (c) 2015, Brian E. Granger and Jake Vanderplas 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions are met: 9 | # 10 | # * Redistributions of source code must retain the above copyright notice, this 11 | # list of conditions and the following disclaimer. 12 | # 13 | # * Redistributions in binary form must reproduce the above copyright notice, 14 | # this list of conditions and the following disclaimer in the documentation 15 | # and/or other materials provided with the distribution. 16 | # 17 | # * Neither the name of altair nor the names of its 18 | # contributors may be used to endorse or promote products derived from 19 | # this software without specific prior written permission. 
20 | # 21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | import os 33 | import shutil 34 | 35 | SRC_PATH = os.path.join( 36 |     os.path.split(os.path.abspath(__file__))[0], 37 |     'tutorial') 38 | 39 | DEST_PATH = os.path.relpath('DatabakerTutorial') 40 | 41 | def copy_tutorial(overwrite=False): 42 | """Copy the Databaker tutorial notebooks into ./DatabakerTutorial.""" 43 | if os.path.isdir(DEST_PATH) and overwrite: 44 | print('Removing old tutorial directory: {}'.format(DEST_PATH)) 45 | shutil.rmtree(DEST_PATH, ignore_errors=True) 46 | if os.path.isdir(DEST_PATH): 47 | raise RuntimeError('{} already exists, run with overwrite=True to discard *all* existing files in tutorial directory'.format(DEST_PATH)) 48 | print('Copying notebooks into fresh tutorial directory: {}'.format(DEST_PATH)) 49 | shutil.copytree(SRC_PATH, DEST_PATH) 50 | 51 | 52 | def tutorial(overwrite=False): 53 | """Copy the Databaker tutorial notebooks into ./DatabakerTutorial and show a link in the notebook.""" 54 | copy_tutorial(overwrite=overwrite) 55 | print('Click on the following notebooks to explore the tutorial:') 56 | from IPython.display import FileLinks, display 57 | file_links = FileLinks(path=DEST_PATH, 58 | included_suffixes=['.ipynb'], 59 | recursive=False) 60 | display(file_links) 61 | -------------------------------------------------------------------------------- /databaker/tutorial/Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction\n", 8 | "\n", 9 | "[Databaker](https://github.com/sensiblecodeio/databaker) is an Open Source Python library for converting semi-structured spreadsheets into computer-friendly datatables. The resulting data can be stored in [Pandas data tables](http://pandas.pydata.org/) or the ONS-specific WDA format.\n", 10 | "\n", 11 | "The system is embedded in the interactive programming environment [Jupyter](http://jupyter.org/) for fast prototyping and development, and depends for its spreadsheet processing on [messytables](http://messytables.readthedocs.io/en/latest/) and [xypath](https://github.com/sensiblecodeio/xypath).\n", 12 | "\n", 13 | "Install it with the command:\n", 14 | "\n", 15 | "> `pip3 install databaker`\n", 16 | "\n", 17 | "Your main interaction with databaker is through the Jupyter notebook interface. There are many tutorials elsewhere online that show how to master this system. \n", 18 | "\n", 19 | "Once you have a working program that converts a particular spreadsheet style into the output you want, there are ways to rerun the notebook on other spreadsheets externally or from the command line. 
" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Example\n", 27 | "\n", 28 | "Although Databaker can handle spreadsheets of any size, here is a tiny example from the tutorials to illustrate what it does." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 49 | "\n", 50 | "\n", 51 | "\n", 52 | "\n", 53 | "\n", 54 | "\n", 55 | "\n", 56 | "\n", 57 | "\n", 58 | "\n", 59 | "\n", 60 | "\n", 61 | "
beatles
Date2014.0
CarsPlanesTrains
John2.02.01.0
Paul4.03.02.0
Ringo4.01.03.0
George2.05.05.0
\n", 62 | "\n", 63 | "
\n" 64 | ], 65 | "text/plain": [ 66 | "" 67 | ] 68 | }, 69 | "metadata": {}, 70 | "output_type": "display_data" 71 | } 72 | ], 73 | "source": [ 74 | "from databaker.framework import *\n", 75 | "\n", 76 | "tab = loadxlstabs(\"example1.xls\", \"beatles\", verbose=False)[0]\n", 77 | "savepreviewhtml(tab, verbose=False)\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "## Conversion segments\n", 85 | "Databaker gives you tools to help you write the code to navigate around the spreadsheet and select the cells and their correspondences. \n", 86 | "\n", 87 | "When you are done your code will look like the following. \n", 88 | "\n", 89 | "You can click on the OBS (observation) cells to see how they connect to the headings." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 2, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/html": [ 102 | "
\n", 103 | "\n", 110 | "\n", 111 | "\n", 112 | "\n", 113 | "
OBSTIMEVehiclesName
\n", 114 | "\n", 115 | "\n", 116 | "\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "\n", 121 | "\n", 122 | "\n", 123 | "\n", 124 | "
beatles
Date2014.0
CarsPlanesTrains
John2.02.01.0
Paul4.03.02.0
Ringo4.01.03.0
George2.05.05.0
\n", 125 | "\n", 126 | "
\n", 127 | "\n", 128 | "\n" 156 | ], 157 | "text/plain": [ 158 | "" 159 | ] 160 | }, 161 | "metadata": {}, 162 | "output_type": "display_data" 163 | } 164 | ], 165 | "source": [ 166 | "r1 = tab.excel_ref('B3').expand(RIGHT)\n", 167 | "r2 = tab.excel_ref('A3').fill(DOWN)\n", 168 | "dimensions = [ \n", 169 | " HDim(tab.excel_ref('B1'), TIME, CLOSEST, ABOVE), \n", 170 | " HDim(r1, \"Vehicles\", DIRECTLY, ABOVE), \n", 171 | " HDim(r2, \"Name\", DIRECTLY, LEFT), \n", 172 | " HDimConst(\"Category\", \"Beatles\")\n", 173 | "]\n", 174 | "observations = tab.excel_ref('B4').expand(DOWN).expand(RIGHT).is_not_blank().is_not_whitespace()\n", 175 | "c1 = ConversionSegment(observations, dimensions)\n", 176 | "savepreviewhtml(c1)\n" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## Output in pandas\n", 184 | "[Pandas data tables](http://pandas.pydata.org/) provides an enormous scope for further processing and cleaning of the data. \n", 185 | "\n", 186 | "To make full use of its power you should become familiar with its [Time series functionality](http://pandas.pydata.org/pandas-docs/stable/timeseries.html), which will allows you to plot, resample and align multple data sources at once.\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 3, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [ 196 | { 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "TIMEUNIT='Year'\n" 201 | ] 202 | }, 203 | { 204 | "data": { 205 | "text/html": [ 206 | "
\n", 207 | "\n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | "
OBSTIMETIMEUNITVehiclesNameCategory__x__y__tablename
02.02014YearCarsJohnBeatles13beatles
12.02014YearPlanesJohnBeatles23beatles
21.02014YearTrainsJohnBeatles33beatles
34.02014YearCarsPaulBeatles14beatles
43.02014YearPlanesPaulBeatles24beatles
52.02014YearTrainsPaulBeatles34beatles
64.02014YearCarsRingoBeatles15beatles
71.02014YearPlanesRingoBeatles25beatles
83.02014YearTrainsRingoBeatles35beatles
92.02014YearCarsGeorgeBeatles16beatles
105.02014YearPlanesGeorgeBeatles26beatles
115.02014YearTrainsGeorgeBeatles36beatles
\n", 369 | "
" 370 | ], 371 | "text/plain": [ 372 | " OBS TIME TIMEUNIT Vehicles Name Category __x __y __tablename\n", 373 | "0 2.0 2014 Year Cars John Beatles 1 3 beatles\n", 374 | "1 2.0 2014 Year Planes John Beatles 2 3 beatles\n", 375 | "2 1.0 2014 Year Trains John Beatles 3 3 beatles\n", 376 | "3 4.0 2014 Year Cars Paul Beatles 1 4 beatles\n", 377 | "4 3.0 2014 Year Planes Paul Beatles 2 4 beatles\n", 378 | "5 2.0 2014 Year Trains Paul Beatles 3 4 beatles\n", 379 | "6 4.0 2014 Year Cars Ringo Beatles 1 5 beatles\n", 380 | "7 1.0 2014 Year Planes Ringo Beatles 2 5 beatles\n", 381 | "8 3.0 2014 Year Trains Ringo Beatles 3 5 beatles\n", 382 | "9 2.0 2014 Year Cars George Beatles 1 6 beatles\n", 383 | "10 5.0 2014 Year Planes George Beatles 2 6 beatles\n", 384 | "11 5.0 2014 Year Trains George Beatles 3 6 beatles" 385 | ] 386 | }, 387 | "execution_count": 3, 388 | "metadata": {}, 389 | "output_type": "execute_result" 390 | } 391 | ], 392 | "source": [ 393 | "c1.topandas()" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "## Output in WDA Observation File\n", 401 | "The WDA system in the ONS has been the primary use for this library. If you need output into WDA the result would look like the following:" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 4, 407 | "metadata": { 408 | "collapsed": false 409 | }, 410 | "outputs": [ 411 | { 412 | "name": "stdout", 413 | "output_type": "stream", 414 | "text": [ 415 | "observation,data_marking,statistical_unit_eng,statistical_unit_cym,measure_type_eng,measure_type_cym,observation_type,empty,obs_type_value,unit_multiplier,unit_of_measure_eng,unit_of_measure_cym,confidentuality,empty1,geographic_area,empty2,empty3,time_dim_item_id,time_dim_item_label_eng,time_dim_item_label_cym,time_type,empty4,statistical_population_id,statistical_population_label_eng,statistical_population_label_cym,cdid,cdiddescrip,empty5,empty6,empty7,empty8,empty9,empty10,empty11,empty12,dim_id_1,dimension_label_eng_1,dimension_label_cym_1,dim_item_id_1,dimension_item_label_eng_1,dimension_item_label_cym_1,is_total_1,is_sub_total_1,dim_id_2,dimension_label_eng_2,dimension_label_cym_2,dim_item_id_2,dimension_item_label_eng_2,dimension_item_label_cym_2,is_total_2,is_sub_total_2,dim_id_3,dimension_label_eng_3,dimension_label_cym_3,dim_item_id_3,dimension_item_label_eng_3,dimension_item_label_cym_3,is_total_3,is_sub_total_3\r\n", 416 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Cars,Cars,,,,Name,Name,,John,John,,,,Category,Category,,Beatles,Beatles,,,\r\n", 417 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,John,John,,,,Category,Category,,Beatles,Beatles,,,\r\n", 418 | "1.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,John,John,,,,Category,Category,,Beatles,Beatles,,,\r\n", 419 | "4.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Cars,Cars,,,,Name,Name,,Paul,Paul,,,,Category,Category,,Beatles,Beatles,,,\r\n", 420 | "3.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,Paul,Paul,,,,Category,Category,,Beatles,Beatles,,,\r\n", 421 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,Paul,Paul,,,,Category,Category,,Beatles,Beatles,,,\r\n", 422 | 
"4.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Cars,Cars,,,,Name,Name,,Ringo,Ringo,,,,Category,Category,,Beatles,Beatles,,,\r\n", 423 | "1.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,Ringo,Ringo,,,,Category,Category,,Beatles,Beatles,,,\r\n", 424 | "3.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,Ringo,Ringo,,,,Category,Category,,Beatles,Beatles,,,\r\n", 425 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Cars,Cars,,,,Name,Name,,George,George,,,,Category,Category,,Beatles,Beatles,,,\r\n", 426 | "5.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,George,George,,,,Category,Category,,Beatles,Beatles,,,\r\n", 427 | "5.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,George,George,,,,Category,Category,,Beatles,Beatles,,,\r\n", 428 | "*********,12\r\n", 429 | "\n" 430 | ] 431 | } 432 | ], 433 | "source": [ 434 | "print(writetechnicalCSV(None, c1))" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "## Further notes\n", 442 | "Databaker has been developed by the [Sensible Code Company](http://sensiblecode.io/) on contract from the [Office of National Statistics](https://www.ons.gov.uk/).\n", 443 | "\n", 444 | "The first version was written in 2014 and ran only as a command line script where previews were made by via a coloured Excel spreadsheet. This version still exists under the [version 1.2.0](https://github.com/sensiblecodeio/databaker/tree/1.2.0) tag and the documentation is hosted [here](https://sensiblecodeio.github.io/quickcode-ons-docs/).\n", 445 | "\n", 446 | "This new version was developed at the end of 2015 to take advantage of the interactive programming capabilities of Jupyter and the freedom not to maintain backward compatibility.\n", 447 | "\n", 448 | "See the remaining tutorial notebooks for more details." 
449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "collapsed": true 456 | }, 457 | "outputs": [], 458 | "source": [] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "Python 3", 464 | "language": "python", 465 | "name": "python3" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": { 469 | "name": "ipython", 470 | "version": 3 471 | }, 472 | "file_extension": ".py", 473 | "mimetype": "text/x-python", 474 | "name": "python", 475 | "nbconvert_exporter": "python", 476 | "pygments_lexer": "ipython3", 477 | "version": "3.5.2" 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 1 482 | } 483 | -------------------------------------------------------------------------------- /databaker/tutorial/blank_template.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from databaker.framework import *" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Loading example1.xls which has size 7168 bytes\n", 26 | "Table names: ['beatles', 'stones']\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "# put your input-output files here\n", 32 | "inputfile = \"example1.xls\"\n", 33 | "outputfile = \"example1.csv\"\n", 34 | "previewfile = \"preview.html\"\n", 35 | "\n", 36 | "tabs = loadxlstabs(inputfile)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# recipe area\n", 48 | "conversionsegments = [ ]\n", 49 | "tab = tabs[0]\n", 50 | "\n", 51 | "obs = tab.excel_ref('B4').expand(DOWN).expand(RIGHT).is_not_blank().is_not_whitespace()\n", 52 | "dimensions = [ \n", 53 | " HDim(tab.excel_ref('B1'), TIME, CLOSEST, ABOVE), \n", 54 | " HDim(tab.excel_ref('B3').expand(RIGHT), \"Vehicles\", DIRECTLY, ABOVE, cellvalueoverride={\"Cars\":\"Car\"}), \n", 55 | " HDim(tab.excel_ref('A3').fill(DOWN), \"Name\", DIRECTLY, LEFT), \n", 56 | " HDimConst(\"All\", \"thing\")\n", 57 | "]\n", 58 | "\n", 59 | "conversionsegment = ConversionSegment(tab, dimensions, obs)\n", 60 | "conversionsegments.append(conversionsegment)\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "{, , , , , , , , , , , }" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "conversionsegment = conversionsegments[0]\n", 83 | "\n", 84 | "hdim = conversionsegment.dimensions[1]\n", 85 | "hdim.hbagset\n", 86 | "conversionsegment.segment" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "{'All': 'thing', 'Vehicles': 'Car', 'Name': 'John', -2: 2014.0, -9: 2.0}\n", 101 | "{'All': 'thing', 'Vehicles': 'Planes', 'Name': 'John', -2: 2014.0, -9: 2.0}\n", 102 | "{'All': 'thing', 'Vehicles': 'Trains', 'Name': 'John', -2: 2014.0, -9: 1.0}\n", 103 | "{'All': 'thing', 'Vehicles': 'Car', 
'Name': 'Paul', -2: 2014.0, -9: 3.0}\n", 104 | "{'All': 'thing', 'Vehicles': 'Planes', 'Name': 'Paul', -2: 2014.0, -9: 3.0}\n", 105 | "{'All': 'thing', 'Vehicles': 'Trains', 'Name': 'Paul', -2: 2014.0, -9: 2.0}\n", 106 | "{'All': 'thing', 'Vehicles': 'Car', 'Name': 'Ringo', -2: 2014.0, -9: 4.0}\n", 107 | "{'All': 'thing', 'Vehicles': 'Planes', 'Name': 'Ringo', -2: 2014.0, -9: 1.0}\n", 108 | "{'All': 'thing', 'Vehicles': 'Trains', 'Name': 'Ringo', -2: 2014.0, -9: 3.0}\n", 109 | "{'All': 'thing', 'Vehicles': 'Car', 'Name': 'George', -2: 2014.0, -9: 2.0}\n", 110 | "{'All': 'thing', 'Vehicles': 'Planes', 'Name': 'George', -2: 2014.0, -9: 5.0}\n", 111 | "{'All': 'thing', 'Vehicles': 'Trains', 'Name': 'George', -2: 2014.0, -9: 5.0}\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "for ob in list(conversionsegment.segment):\n", 117 | " print(conversionsegment.lookupobs(ob))" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "tablepart 'beatles' written #injblock1001\n", 132 | "javascript calculated\n" 133 | ] 134 | }, 135 | { 136 | "data": { 137 | "text/html": [ 138 | "Written to file /home/goatchurch/sensiblecode/quickcode-ons-recipes/helpnotes/preview.html" 139 | ], 140 | "text/plain": [ 141 | "" 142 | ] 143 | }, 144 | "metadata": {}, 145 | "output_type": "display_data" 146 | }, 147 | { 148 | "data": { 149 | "text/html": [ 150 | "
#injblock1002\n", 151 | "\n", 182 | "\n", 183 | "\n", 184 | "Table: beatles \n", 185 | "
OBSTIMEVehiclesName
\n", 186 | "\n", 187 | "\n", 188 | "\n", 189 | "\n", 190 | "\n", 191 | "\n", 192 | "\n", 193 | "\n", 194 | "\n", 195 | "
Date2014.0
CarsPlanesTrains
John2.02.01.0
Paul3.03.02.0
Ringo4.01.03.0
George2.05.05.0
\n", 196 | "\n", 197 | "
\n", 198 | "\n", 199 | "\n" 227 | ], 228 | "text/plain": [ 229 | "" 230 | ] 231 | }, 232 | "metadata": {}, 233 | "output_type": "display_data" 234 | } 235 | ], 236 | "source": [ 237 | "# this is the preview system\n", 238 | "conversionsegment = conversionsegments[0]\n", 239 | "savepreviewhtml(conversionsegment, previewfile)\n", 240 | "savepreviewhtml(conversionsegment)\n" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 7, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "writing 1 conversion segments into /home/goatchurch/sensiblecode/quickcode-ons-recipes/helpnotes/example1.csv\n", 255 | "conversionwrite segment size 12 table 'beatles; TIMEUNIT='Year'\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "writetechnicalCSV(outputfile, conversionsegments)\n" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 8, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/html": [ 273 | "
\n", 274 | "\n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | "
AllNameVehicles-2-9
0thingJohnCar2014.02.0
1thingJohnPlanes2014.02.0
2thingJohnTrains2014.01.0
3thingPaulCar2014.03.0
4thingPaulPlanes2014.03.0
5thingPaulTrains2014.02.0
6thingRingoCar2014.04.0
7thingRingoPlanes2014.01.0
8thingRingoTrains2014.03.0
9thingGeorgeCar2014.02.0
10thingGeorgePlanes2014.05.0
11thingGeorgeTrains2014.05.0
\n", 384 | "
" 385 | ], 386 | "text/plain": [ 387 | " All Name Vehicles -2 -9\n", 388 | "0 thing John Car 2014.0 2.0\n", 389 | "1 thing John Planes 2014.0 2.0\n", 390 | "2 thing John Trains 2014.0 1.0\n", 391 | "3 thing Paul Car 2014.0 3.0\n", 392 | "4 thing Paul Planes 2014.0 3.0\n", 393 | "5 thing Paul Trains 2014.0 2.0\n", 394 | "6 thing Ringo Car 2014.0 4.0\n", 395 | "7 thing Ringo Planes 2014.0 1.0\n", 396 | "8 thing Ringo Trains 2014.0 3.0\n", 397 | "9 thing George Car 2014.0 2.0\n", 398 | "10 thing George Planes 2014.0 5.0\n", 399 | "11 thing George Trains 2014.0 5.0" 400 | ] 401 | }, 402 | "execution_count": 8, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "topandas(conversionsegment)\n" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 9, 414 | "metadata": { 415 | "collapsed": false 416 | }, 417 | "outputs": [ 418 | { 419 | "name": "stdout", 420 | "output_type": "stream", 421 | "text": [ 422 | "observation,data_marking,statistical_unit_eng,statistical_unit_cym,measure_type_eng,measure_type_cym,observation_type,empty,obs_type_value,unit_multiplier,unit_of_measure_eng,unit_of_measure_cym,confidentuality,empty1,geographic_area,empty2,empty3,time_dim_item_id,time_dim_item_label_eng,time_dim_item_label_cym,time_type,empty4,statistical_population_id,statistical_population_label_eng,statistical_population_label_cym,cdid,cdiddescrip,empty5,empty6,empty7,empty8,empty9,empty10,empty11,empty12,dim_id_1,dimension_label_eng_1,dimension_label_cym_1,dim_item_id_1,dimension_item_label_eng_1,dimension_item_label_cym_1,is_total_1,is_sub_total_1,dim_id_2,dimension_label_eng_2,dimension_label_cym_2,dim_item_id_2,dimension_item_label_eng_2,dimension_item_label_cym_2,is_total_2,is_sub_total_2,dim_id_3,dimension_label_eng_3,dimension_label_cym_3,dim_item_id_3,dimension_item_label_eng_3,dimension_item_label_cym_3,is_total_3,is_sub_total_3\r\n", 423 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Car,Car,,,,Name,Name,,John,John,,,,All,All,,thing,thing,,,\r\n", 424 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,John,John,,,,All,All,,thing,thing,,,\r\n", 425 | "1.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,John,John,,,,All,All,,thing,thing,,,\r\n", 426 | "3.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Car,Car,,,,Name,Name,,Paul,Paul,,,,All,All,,thing,thing,,,\r\n", 427 | "3.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,Paul,Paul,,,,All,All,,thing,thing,,,\r\n", 428 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,Paul,Paul,,,,All,All,,thing,thing,,,\r\n", 429 | "4.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Car,Car,,,,Name,Name,,Ringo,Ringo,,,,All,All,,thing,thing,,,\r\n", 430 | "1.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,Ringo,Ringo,,,,All,All,,thing,thing,,,\r\n", 431 | "3.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,Ringo,Ringo,,,,All,All,,thing,thing,,,\r\n", 432 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Car,Car,,,,Name,Name,,George,George,,,,All,All,,thing,thing,,,\r\n", 433 | "5.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,George,George,,,,All,All,,thing,thing,,,\r\n", 434 | 
"5.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,George,George,,,,All,All,,thing,thing,,,\r\n", 435 | "*********,12\r\n", 436 | "\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "writetechnicalCSV(None, conversionsegments)\n" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": { 448 | "collapsed": true 449 | }, 450 | "outputs": [], 451 | "source": [] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": { 457 | "collapsed": true 458 | }, 459 | "outputs": [], 460 | "source": [] 461 | } 462 | ], 463 | "metadata": { 464 | "kernelspec": { 465 | "display_name": "Python 3", 466 | "language": "python", 467 | "name": "python3" 468 | }, 469 | "language_info": { 470 | "codemirror_mode": { 471 | "name": "ipython", 472 | "version": 3 473 | }, 474 | "file_extension": ".py", 475 | "mimetype": "text/x-python", 476 | "name": "python", 477 | "nbconvert_exporter": "python", 478 | "pygments_lexer": "ipython3", 479 | "version": "3.5.2" 480 | } 481 | }, 482 | "nbformat": 4, 483 | "nbformat_minor": 1 484 | } 485 | -------------------------------------------------------------------------------- /databaker/tutorial/example1.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cantabular/databaker/930b65e85038a3afd59574bbdc1d4cb79ce9b6ad/databaker/tutorial/example1.xls -------------------------------------------------------------------------------- /databaker/tutorial/nbconvert_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `nbconvert`\n", 8 | "\n", 9 | "`nbconvert` is a Jupyter command line tool that can convert Jupyter notebooks to other output formats, and also execute them before converting. It's very useful for logging \"blind\" processing of a notebook." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "You can use it like:\n", 17 | "\n", 18 | "```sh\n", 19 | "jupyter nbconvert --to html --execute my_notebook.ipynb\n", 20 | "```\n", 21 | "\n", 22 | "which will execute the input cells in `my_notebook.ipynb` and save the entire output as HTML.\n", 23 | "\n", 24 | "[Full documentation](https://nbconvert.readthedocs.io) is available from the developers." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Input filename wrangling\n", 32 | "\n", 33 | "Unfortunately, `nbconvert` is a little limited to implement the `bake.py` usage we used to have in Databaker where you could specify filenames as it does not support passing in arguments to the notebook, e.g. so that you can change a variable, such as filename.\n", 34 | "\n", 35 | "So, we've written a wrapper, `databaker_nbconvert` around this that allows you to specify a notebook filename and an input filename. **The notebook and the input file should be in the same directory.** The notebook filename you specify can be an absolute path, but the input file should be just the filename without any path. Simplest way is to just stick everything in one directory and run `databaker_nbconvert` from there; it should work as a standalone command.\n", 36 | "\n", 37 | "Here's a very simple demo that shows this in action. 
We're not doing any processing of the spreadsheets here; the demo is only designed to show how you could switch a filename at the command line, while still being able to specify the filename within the notebook for development." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "import databaker.framework" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "`databaker.framework.DATABAKER_INPUT_FILE` is just a string of the filename to use. Here we specify the input filename that we're using within this notebook. By default, this is the file that will get used." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "databaker.framework.DATABAKER_INPUT_FILE = 'example1.xls'" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "`getinputfilename()` is a function that gives you back the spreadsheet filename that we've passed to `databaker_nbconvert` or, if that's not the case, it gives us back the `DATABAKER_INPUT_FILE` value specified above.\n", 74 | "\n", 75 | "This way, we can leave `f` unspecified, which allows us to do the following:\n", 76 | "\n", 77 | "* if we process the notebook here, then we will process `example1.xls`.\n", 78 | "\n", 79 | "* if we process with `databaker_nbconvert` with a specified spreadsheet filename, then we override the `example1.xls` here with whichever filename we specified to `databaker_nbconvert`.\n", 80 | "\n", 81 | "(This is actually a little bit of a hack that uses operating system environment variables to pass the values in, and we wrap this in another Python script, so this is transparent to the user, and also simplifies how this works across Windows and Linux.) " 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "f = databaker.framework.getinputfilename()\n", 93 | "print(f)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Below, you'll see the loaded XLS details. If you process this notebook with `databaker_nbconvert` and enter `ott.xls` as a spreadsheet filename, e.g.\n", 101 | "\n", 102 | "```sh\n", 103 | "databaker_nbconvert \"nbconvert_demo.ipynb\" \"ott.xls\"\n", 104 | "```\n", 105 | "\n", 106 | "you'll see that's what gets loaded, not the `example1.xls` we specified above (which is ignored)."
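To picture what the wrapper is doing, here is a minimal sketch of the environment-variable mechanism described above; the variable name `DATABAKER_INPUT_FILE_OVERRIDE` and the helper function are illustrative assumptions, not necessarily what `databaker_nbconvert` actually uses:

```python
import os

def getinputfilename_sketch(default_file):
    # databaker_nbconvert would set an environment variable before invoking
    # nbconvert; inside the notebook we prefer it over the in-notebook default
    return os.environ.get("DATABAKER_INPUT_FILE_OVERRIDE", default_file)

f = getinputfilename_sketch("example1.xls")  # falls back to example1.xls when unset
```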
107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "databaker.framework.loadxlstabs(f)" 118 | ] 119 | } 120 | ], 121 | "metadata": { 122 | "kernelspec": { 123 | "display_name": "Python 3", 124 | "language": "python", 125 | "name": "python3" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 3 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython3", 137 | "version": "3.5.2" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 1 142 | } 143 | -------------------------------------------------------------------------------- /databaker/tutorial/ott.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cantabular/databaker/930b65e85038a3afd59574bbdc1d4cb79ce9b6ad/databaker/tutorial/ott.xls -------------------------------------------------------------------------------- /databaker/tutorial/tutorial_reference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Function reference\n", 8 | "\n", 9 | "This is a one-page comprehensive run-down of all the functions and features in the system to be kept for reference. \n", 10 | "\n", 11 | "By executing `tutorial()` in the notebook you are taking a copy of all the tutorial notebooks, including this one. \n", 12 | "\n", 13 | "So, if a function looks useful but you don't quite understand the description, you should experiment with its input and outputs within this interactive programming environment. 
\n", 14 | "\n", 15 | "However, you need to execute at least the first cell in every section for it to work as it imports the libraries.\n", 16 | "\n", 17 | "## Table of contents\n", 18 | "\n", 19 | "* [Loading and saving](#Loading-and-saving) - The input (excel and WDA files) and output methods (html and WDA files)\n", 20 | "* [Cell bag selection](#Cell-bag-selection) - Selecting and transforming sets of cells in the spreadsheet\n", 21 | "* [Dimensions](#Dimensions) - Turning a mere set of cells into a dimension with look up instructions\n", 22 | "* [Conversion segments](#Conversion-Segments) - The batch of observations and list of dimensions that creates the output\n", 23 | "* [Downloading and unzipping files](#Downloading-excel-and-unzipping-files) - Further routes to full automation\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "# Loading and saving\n", 31 | "\n", 32 | "### tabs = loadxlstabs(inputfile, sheetids=\"*\", verbose=True)\n", 33 | "Load xls file into a list of tables, which act as bags of cells\n", 34 | " \n", 35 | " \n", 36 | "### savepreviewhtml(tab, htmlfilename=None, verbose=True)\n", 37 | "\n", 38 | "Previews a table -- or list of cellbags or conversion segments with the same table -- either inline, or into a separate file.\n", 39 | " \n", 40 | " \n", 41 | "### writetechnicalCSV(outputfile, conversionsegments) \n", 42 | "\n", 43 | "Outputs a WDA format CSV file from a list of conversion segments or pandas dataframes\n", 44 | "\n", 45 | "\n", 46 | "### readtechnicalCSV(wdafile, bverbose=False, baspandas=True)\n", 47 | "\n", 48 | "Reads in an old WDA file into a list of pandas tables, one for each segment\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 7, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "from databaker.framework import *\n", 60 | "\n", 61 | "# put your input-output files here\n", 62 | "inputfile = \"example1.xls\"\n", 63 | "outputfile = \"example1.csv\"\n", 64 | "previewfile = \"preview.html\"\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# Cell bag selection\n", 72 | "These functions generally apply to a table as well as a cell bag, but they always output a cell bag.\n", 73 | "\n", 74 | "A cell bag `bag` always has a pointer to its original table `bag.table`. Howwever, you can access the underlying unordered set of cells of a bag as `bag.unordered_cells`.\n", 75 | "\n", 76 | "**Note** in the examples below, please use `savepreviewhtml(cellbag)` or `savepreviewhtml([cellbagA, cellbagB, ...])` to see what the selections look like in the contents of the table. These have been left out of unused notebook only to save clutter." 
77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Loading example1.xls which has size 8192 bytes\n", 91 | "Table names: ['stones']\n", 92 | "{, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , }\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "from databaker.framework import *\n", 98 | "tab = loadxlstabs(\"example1.xls\", sheetids=\"stones\", verbose=True)[0]\n", 99 | "print(tab)\n" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "### cellbag.is_XXX()\n", 107 | "### cellbag.is_not_XXX()\n", 108 | "\n", 109 | "Returns cells which are or are not a XXX thing.\n", 110 | " \n", 111 | "Allowable functions: \n", 112 | "\n", 113 | "> bold, italic, underline, number, date, whitespace, strikeout, any_border, all_border, richtext\n", 114 | "\n", 115 | "These functions can be chained, eg cellbag.is_not_number().is_not_whitespace()." 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 3, 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "Numbered cells only: {, , , , , , , }\n", 130 | "\n", 131 | "Not numbers: {, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , }\n", 132 | "\n", 133 | "Not numbers and not whitespace: {, , , , , , , , , , , , , , , , , , , , , , , }\n", 134 | "\n", 135 | "Cells that seem to be a date: {, }\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "cellbag = tab\n", 141 | "print(\"Numbered cells only:\", cellbag.is_number())\n", 142 | "print()\n", 143 | "print(\"Not numbers:\", cellbag.is_not_number())\n", 144 | "print()\n", 145 | "print(\"Not numbers and not whitespace:\", cellbag.is_not_number().is_not_whitespace())\n", 146 | "print()\n", 147 | "print(\"Cells that seem to be a date:\", cellbag.is_date())\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### cellbag.filter(word)\n", 155 | "\n", 156 | "Only cells matching this word exactly\n", 157 | "\n", 158 | "### cellbag.filter(function(cell))\n", 159 | "\n", 160 | "Only cells where function(cell) == True\n", 161 | "\n", 162 | "\n", 163 | "### cellbag.one_of([word1, word2])\n", 164 | "\n", 165 | "Only cells matching one of the words\n", 166 | "\n", 167 | "\n", 168 | "### cellbag.regex(regexp)\n", 169 | "\n", 170 | "Only cells matching the regular expression\n", 171 | "\n", 172 | "\n", 173 | "### cellbag.excel_ref(ref)\n", 174 | "\n", 175 | "Selects a cell by its excel Column-Row/Letter-Number format where 'A1' is the top left hand corner.\n", 176 | "\n", 177 | "This also works for single columns or rows (eg 'C', or '3') and ranges (eg 'A2:B3'). \n", 178 | "\n", 179 | "This way of accessing is not recommended unless you know that the spreadsheet you are working with won't have extra rows or columns inserted or deleted from it. \n", 180 | "\n", 181 | "### cellbag.by_index(n)\n", 182 | "\n", 183 | "Selects a single cell from the cell bag of index n, where n=1 is the first element. 
(n can also be a list of integers.)\n", 184 | "\n", 185 | "\n", 186 | "### cellbag.assert_one()\n", 187 | "\n", 188 | "Throws an exception if there is not exactly one cell in this bag (useful for validation if your filter above was supposed to return only one cell)\n", 189 | "\n", 190 | "### cellbag.value\n", 191 | "\n", 192 | "If `len(cellbag) == 1` then cellbag.value gives the original value within that cell, otherwise it throws an exception.\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 20, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "Loading example1.xls which has size 8192 bytes\n", 207 | "Table names: ['stones']\n", 208 | "Get some matching cells {, }\n", 209 | "A3 is {}\n", 210 | "A2:B4 is {, , , , , }\n", 211 | "\n", 212 | "The second cell in the whole table is {}\n", 213 | "Numbers greater than 20 {, , , , }\n", 214 | "Numbers less than 20 {, , }\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "from databaker.framework import * # restated import so you can run from this cell\n", 220 | "cellbag = tab = loadxlstabs(\"example1.xls\", sheetids=\"stones\", verbose=True)[0]\n", 221 | "\n", 222 | "print(\"Get some matching cells\", cellbag.one_of([\"Rocks\", \"ice\", \"mud\"]))\n", 223 | "print(\"A3 is\", cellbag.excel_ref(\"A3\"))\n", 224 | "print(\"A2:B4 is\", cellbag.excel_ref(\"A2:B4\"))\n", 225 | "print()\n", 226 | "print(\"The second cell in the whole table is\", tab.by_index(2))\n", 227 | "\n", 228 | "ngreater20 = cellbag.is_number().filter(lambda c: c.value>20)\n", 229 | "nlessthan20 = cellbag.is_number().filter(lambda c: c.value<20)\n", 230 | "print(\"Numbers greater than 20\", ngreater20)\n", 231 | "print(\"Numbers less than 20\", nlessthan20)\n", 232 | "\n", 233 | "# Uncomment this line to see these selections in contents\n", 234 | "# savepreviewhtml([ngreater20, nlessthan20])\n" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "### cellbag1.union(cellbag2)\n", 242 | "\n", 243 | "Union of two bags. Can also be expressed as `cellbag1 | cellbag2`\n", 244 | "\n", 245 | "### cellbag1.difference(cellbag2)\n", 246 | "\n", 247 | "Difference of two bags. Can also be expressed as `cellbag1 - cellbag2`\n", 248 | "\n", 249 | "### cellbag1.intersection(cellbag2)\n", 250 | "\n", 251 | "Intersection of two bags. 
Can also be expressed as `cellbag1 & cellbag2`" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 121, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "colC {, , }\n", 266 | "rowC {, , , }\n", 267 | "\n", 268 | "Union is {, , , , , }\n", 269 | "Difference is {, }\n", 270 | "Intersection is {}\n", 271 | "\n", 272 | "Union is {, , , , , }\n", 273 | "Difference is {, }\n", 274 | "Intersection is {}\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "colC = tab.excel_ref(\"D3:D5\")\n", 280 | "rowC = tab.excel_ref(\"A4:D4\")\n", 281 | "print(\"colC\", colC)\n", 282 | "print(\"rowC\", rowC)\n", 283 | "print()\n", 284 | "print(\"Union is\", colC.union(rowC))\n", 285 | "print(\"Difference is\", colC.difference(rowC))\n", 286 | "print(\"Intersection is\", colC.intersection(rowC))\n", 287 | "print()\n", 288 | "print(\"Union is\", (colC | rowC))\n", 289 | "print(\"Difference is\", (colC - rowC))\n", 290 | "print(\"Intersection is\", (colC & rowC))\n" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "### cellbag1.waffle(cellbag2)\n", 298 | "\n", 299 | "Get all cells which have a cell from one bag above them, and the other bag to the side. Note that the two bags are interchangeable without changing the output. You can change the direction from its default (DOWN) by specifying direction=LEFT or similar.\n", 300 | "\n", 301 | "### cellbag1.junction(cellbag2)\n", 302 | "\n", 303 | "Enumerates the output of waffle in triplets\n", 304 | "\n", 305 | "\n", 306 | "### cellbag1.same_row(cellbag2)\n", 307 | "\n", 308 | "Get cells in this bag which are in the same row as a cell in the second.\n", 309 | "\n", 310 | "### cellbag1.same_col(cellbag2)\n", 311 | "\n", 312 | "Get cells in this bag which are in the same column as a cell in the second." 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 7, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "Waffle:\n" 327 | ] 328 | }, 329 | { 330 | "data": { 331 | "text/html": [ 332 | "
\n", 333 | "\n", 340 | "\n", 341 | "\n", 342 | "\n", 343 | "
item 0item 1item 2
\n", 344 | "\n", 345 | "\n", 346 | "\n", 347 | "\n", 348 | "\n", 349 | "\n", 350 | "\n", 351 | "\n", 352 | "\n", 353 | "\n", 354 | "\n", 355 | "\n", 356 | "
stones
Date
YearMonthpresentRockscost
1972.0Janyeschalk10.0
Augnogranite30.0
1989.0Febyeslimestone2.0
Marnoshale88.0
Junyesbasalt96.0
Decyesice8.0
\n", 357 | "\n", 358 | "
\n" 359 | ], 360 | "text/plain": [ 361 | "" 362 | ] 363 | }, 364 | "metadata": {}, 365 | "output_type": "display_data" 366 | } 367 | ], 368 | "source": [ 369 | "c = tab.excel_ref(\"D3\") | tab.excel_ref(\"E4\")\n", 370 | "d = tab.excel_ref(\"A6:A7\")\n", 371 | "print(\"Waffle:\")\n", 372 | "savepreviewhtml([c,d, c.waffle(d)])" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 123, 378 | "metadata": { 379 | "collapsed": false 380 | }, 381 | "outputs": [ 382 | { 383 | "name": "stdout", 384 | "output_type": "stream", 385 | "text": [ 386 | "Junction output:\n", 387 | " ({}, {}, {})\n", 388 | " ({}, {}, {})\n", 389 | " ({}, {}, {})\n", 390 | " ({}, {}, {})\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "print(\"Junction output:\")\n", 396 | "for s in c.junction(d):\n", 397 | " print(\" \", s)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 128, 403 | "metadata": { 404 | "collapsed": false 405 | }, 406 | "outputs": [ 407 | { 408 | "name": "stdout", 409 | "output_type": "stream", 410 | "text": [ 411 | "Cells column A that are in same row as {, } are {, }\n", 412 | "Cells column 7 that are in same column as {, } are {, }\n" 413 | ] 414 | } 415 | ], 416 | "source": [ 417 | "print(\"Cells column A that are in same row as\", c, \"are\", tab.excel_ref(\"A\").same_row(c))\n", 418 | "print(\"Cells column 7 that are in same column as\", c, \"are\", tab.excel_ref(\"7\").same_col(c))" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "### cellbag.shift(direction)\n", 426 | "\n", 427 | "Move the selected cells UP, DOWN, LEFT or Right by one cell\n", 428 | "\n", 429 | "### cellbag.shift((dx, dy))\n", 430 | "\n", 431 | "Move the selected cells dx cells to RIGHT and dy cells DOWN (can have negative values)\n", 432 | "\n", 433 | "\n", 434 | "### cellbag.fill(direction)\n", 435 | "\n", 436 | "Take all the cells in one direction from the given cellbag\n", 437 | "\n", 438 | "### cellbag.expand(direction)\n", 439 | "\n", 440 | "All the cells in one direction, including itself.\n", 441 | "\n", 442 | "### cellbag.extrude(dx, dy)\n", 443 | "\n", 444 | "Step and include this many cells between 0 and dx and dy.\n" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 120, 450 | "metadata": { 451 | "collapsed": false 452 | }, 453 | "outputs": [ 454 | { 455 | "name": "stdout", 456 | "output_type": "stream", 457 | "text": [ 458 | "Shift RIGHT from {} is {}\n", 459 | "Shift (-1,-2) from {} is {}\n", 460 | "Fill UP from {} is {, , }\n", 461 | "Expand UP from {} is {, , , }\n", 462 | "\n", 463 | "How it works: UP= (0, -1) DOWN= (0, 1) LEFT= (-1, 0) RIGHT= (1, 0)\n", 464 | "\n", 465 | "Extrude two cells rightwards {, , }\n" 466 | ] 467 | } 468 | ], 469 | "source": [ 470 | "c = tab.excel_ref(\"B4\")\n", 471 | "print(\"Shift RIGHT from\", c, \"is\", c.shift(RIGHT))\n", 472 | "print(\"Shift (-1,-2) from\", c, \"is\", c.shift((-1, -2)))\n", 473 | "print(\"Fill UP from\", c, \"is\", c.fill(UP))\n", 474 | "print(\"Expand UP from\", c, \"is\", c.expand(UP))\n", 475 | "print()\n", 476 | "print(\"How it works: UP=\", UP, \" DOWN=\", DOWN, \" LEFT=\", LEFT, \" RIGHT=\", RIGHT)\n", 477 | "print()\n", 478 | "print(\"Extrude two cells rightwards\", c.extrude(2,0))" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "# Dimensions\n", 486 | "A dimension is simply a cellbag with a label and a lookup direction applied to it. 
\n", 487 | "\n", 488 | "Each dimension represents a column in the output table and basically contains the instructions for how to look up to the corresponding value given a particular cell in the set of observations.\n", 489 | "\n", 490 | "\n", 491 | "### hdim = HDim(cellbag, label, strict=[DIRECTLY|CLOSEST], direction=[ABOVE|BELOW|LEFT|RIGHT])\n", 492 | "\n", 493 | "The main constructor, taking a set of cells, a string name (label), look up condition and lookup directions. \n", 494 | "\n", 495 | "The lookup conditions are:\n", 496 | "* CLOSEST (gets the *first* cell in the same column or row as the observation in a specified direction);\n", 497 | "* DIRECTLY (gets the *closest* cell in the same column or row as the observation in a specified direction).\n", 498 | "\n", 499 | "\n", 500 | "### hdim.cellvalobs(cell)\n", 501 | "\n", 502 | "This function looks up the value of an individual cell in `hdim.hbagset` (defined in the constructor) according to the lookup condition and direction, and returns the pair `(cell, value)` The `value` will always be `cell.value`, unless it has been overridden by some member of `hdim.cellvalueoverride`.\n", 503 | "\n", 504 | "\n", 505 | "### hdim.AddCellValueOverride(overridecell, overridevalue)\n", 506 | "\n", 507 | "This function is an interface to changing the return values alters the `hdim.cellvalueoverride`. It can be used to change the spellings of particular dimension values or to insert new heading cells in place of blank ones. \n", 508 | "\n", 509 | "Inserting header cells is sometimes necessary when a heading is centred and you can't look it up with a single `(strict=CLOSEST, direction=LEFT|RIGHT)` command. (The `direction=NEAREST` feature proved unreliable in the real world.) \n", 510 | "\n", 511 | "\n", 512 | "### hdim.discardcellsnotlookedup(observationcells)\n", 513 | "\n", 514 | "This function uses a set of observation cells to thin out the list of dimension cells `hdim.hbagset` to only those which can be looked up. Can be used to quickly trim out footnote in the bottom of a column that don't make any difference to the final output while making validation easier (see `hdim.checkvalues` below).\n", 515 | "\n", 516 | "\n", 517 | "### hdim.valueslist()\n", 518 | "\n", 519 | "Use this function to print the final heading cells values (the values in `hdim.hbagset` after they are overridden by `hdim.cellvalueoverride` for use in making the validation checks.\n", 520 | "\n", 521 | "\n", 522 | "### hdim.checkvalues(valueslist)\n", 523 | "\n", 524 | "This validates the dimension values against a hard-coded values list that has been generated earlier by `hdim.valueslist()` and throws an exception with an explanation if they are different. 
\n", 525 | "\n", 526 | "Use this function if you need to run your code against different spreadsheets and need to check that the outputs are going to be consistent.\n", 527 | "\n", 528 | "\n", 529 | "### hdimc = HDimConst(label, value)\n", 530 | "\n", 531 | "Create a constant dimension that will give the same value no matter what the observation is looked up.\n", 532 | "\n" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 24, 538 | "metadata": { 539 | "collapsed": false 540 | }, 541 | "outputs": [ 542 | { 543 | "name": "stdout", 544 | "output_type": "stream", 545 | "text": [ 546 | "{, , , , , }\n" 547 | ] 548 | } 549 | ], 550 | "source": [ 551 | "from databaker.framework import *\n", 552 | "tab = loadxlstabs(\"example1.xls\", sheetids=\"stones\", verbose=False)[0]\n", 553 | "\n", 554 | "rocks = tab.filter(\"Rocks\").fill(DOWN)\n", 555 | "years = tab.filter(\"Year\").fill(DOWN).is_not_whitespace()\n", 556 | "cost = tab.filter(\"cost\").fill(DOWN)\n", 557 | "print(rocks)\n", 558 | "\n", 559 | "# savepreviewhtml([rocks, years, cost]) # <-- uncomment this line to see the table" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 31, 565 | "metadata": { 566 | "collapsed": false 567 | }, 568 | "outputs": [ 569 | { 570 | "name": "stdout", 571 | "output_type": "stream", 572 | "text": [ 573 | "{} \t (, '1972.0') \t (, 'chalk')\n", 574 | "{} \t (, '1972.0') \t (, 'gneiss')\n", 575 | "{} \t (, '1989.0') \t (, 'limestone')\n", 576 | "{} \t (, '1989.0') \t (, 'shale')\n", 577 | "{} \t (, '1989.0') \t (, 'basalt')\n", 578 | "{} \t (, '1989.0') \t (, 'ice')\n" 579 | ] 580 | } 581 | ], 582 | "source": [ 583 | "hrocks = HDim(rocks, \"ROCKS!\", DIRECTLY, LEFT)\n", 584 | "hrocks.AddCellValueOverride(\"granite\", \"gneiss\")\n", 585 | "hyears = HDim(years, \"yyyy\", CLOSEST, UP)\n", 586 | "\n", 587 | "for ob in cost:\n", 588 | " print(ob, \"\\t\", hyears.cellvalobs(ob), \"\\t\", hrocks.cellvalobs(ob))\n", 589 | "\n", 590 | "# savepreviewhtml([hrocks, hyears, cost]) # <-- uncomment to see as a coloured table" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "# Conversion Segments\n", 598 | "\n", 599 | "A ConversionSegment is a set of observations together with a list of Dimensions\n", 600 | "\n", 601 | "\n", 602 | "### ConversionSegment(observations, dimensions)\n", 603 | "\n", 604 | "Constructor for the ConversionSegment, where `observations` is a bag of cells and `dimensions` is a list of `HDim` and `HDimConst` dimension objects. You can construct the dimensions at the same time as defining the list at the point when you call this function. \n", 605 | "\n", 606 | "\n", 607 | "### ConversionSegment(observations, dimensions, processTIMEUNIT=True, includecellxy=False)\n", 608 | "\n", 609 | "Two default parameters in the ConversionSegment constructor. `processTIMEUNIT` controls whether a dimension called TIME should be used to automatically set the dimension known as TIMEUNIT. This is required by the WDA output, however its operation can be implemented in pandas. \n", 610 | "\n", 611 | "`includecellxy` causes the output to include three extra columns, `[__x, __y, __tablename]` which can be used for debugging purposes. \n", 612 | "\n", 613 | "### conversionsegment.topandas()\n", 614 | "\n", 615 | "Turns a ConversionSegment into a [pandas.DataFrame](http://pandas.pydata.org/), which is an extremely powerful, efficient and widely used data manipulation library. 
\n", 616 | "\n", 617 | "This marks the place where you depart cleanly from the Databaker library and can go on to further analysis, or it's a temporary entry into a system where the data can be fixed up before outputting it to the WDA format. \n", 618 | "\n", 619 | "\n", 620 | "### savepreviewhtml(conversionsegment, htmlfilename=None, verbose=True)\n", 621 | "\n", 622 | "This function is restated from the Loading-and-saving section to remind you that when you use it on a ConversionSegment the Observation cells are interactive -- click on one to highlight the dimension cells it is looking up to.\n", 623 | "\n", 624 | "Also, overridden values are illustrated by strike-throughs.\n", 625 | "\n", 626 | "\n", 627 | "### writetechnicalCSV(outputfile, conversionsegments) \n", 628 | "\n", 629 | "This function is also restated from the Loading-and-saving section for saving a WDA output file. The argument can be a single ConversionSegment, a list of ConversionSegments or a list of pandas.DataFrames (which have been cleaned up). \n", 630 | "\n", 631 | "\n", 632 | "### Special WDA dimensions\n", 633 | "\n", 634 | "The WDA format contains the following special dimension columns that are output at the front of every row. They are identified by their dimension labels.\n", 635 | "\n", 636 | "For convenience, the variable names have been set to their string names, ie \n", 637 | "> `STATUNIT = \"STATUNIT\"`\n", 638 | "\n", 639 | "See the WDA documentation for their specific uses. \n", 640 | "\n", 641 | "* OBS - This is not a dimension; it's the observation column. Do not name a dimension as \"OBS\"\n", 642 | "* DATAMARKER - If OBS is not a number, then the non-numeric part is stripped off and put into the DATAMARKER column\n", 643 | "* STATUNIT \n", 644 | "* MEASURETYPE \n", 645 | "* UNITMULTIPLIER \n", 646 | "* UNITOFMEASURE\n", 647 | "* GEOG\n", 648 | "* TIME - Of the form \"2010\", \"2010 Q1\" or \"Jan 2010\"\n", 649 | "* TIMEUNIT - \"Year\", \"Quarter\", \"Month\" respectively\n", 650 | "* STATPOP\n", 651 | "\n", 652 | "\n", 653 | "### pdguessforceTIMEUNIT(dataframe)\n", 654 | "\n", 655 | "Find and set the TIMEUNIT column from the TIME column in a pandas.DataFrame. This function has two lines. 
The first line matches the unit from the TIME value:\n", 656 | "```python\n", 657 | "df[\"TIMEUNIT\"] = df.apply(lambda row: Ldatetimeunitloose(row.TIME), axis=1)\n", 658 | "```\n", 659 | "The second line forces the TIME value to conform to the exact format required by the WDA file, given the TIMEUNIT:\n", 660 | "```python\n", 661 | "df[\"TIME\"] = df.apply(lambda row: Ldatetimeunitforce(row.TIME, row.TIMEUNIT), axis=1)\n```" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 39, 667 | "metadata": { 668 | "collapsed": false 669 | }, 670 | "outputs": [ 671 | { 672 | "name": "stdout", 673 | "output_type": "stream", 674 | "text": [ 675 | "2017.0 is\t Year corrected to\t 2017\n", 676 | "Q32017 is\t Quarter corrected to\t 2017 Q3\n", 677 | "Mar 2017 is\t Month corrected to\t Mar 2017\n" 678 | ] 679 | } 680 | ], 681 | "source": [ 682 | "from databaker.framework import *\n", 683 | "\n", 684 | "times = [2017.0, \"Q32017\", \"Mar 2017\"]\n", 685 | "for t in times:\n", 686 | " print(t, \"is\\t\", Ldatetimeunitloose(t), \"corrected to\\t\", Ldatetimeunitforce(t, Ldatetimeunitloose(t)))\n" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": 9, 692 | "metadata": { 693 | "collapsed": false 694 | }, 695 | "outputs": [ 696 | { 697 | "name": "stdout", 698 | "output_type": "stream", 699 | "text": [ 700 | "\n", 701 | "observation,data_marking,statistical_unit_eng,statistical_unit_cym,measure_type_eng,measure_type_cym,observation_type,empty,obs_type_value,unit_multiplier,unit_of_measure_eng,unit_of_measure_cym,confidentuality,empty1,geographic_area,empty2,empty3,time_dim_item_id,time_dim_item_label_eng,time_dim_item_label_cym,time_type,empty4,statistical_population_id,statistical_population_label_eng,statistical_population_label_cym,cdid,cdiddescrip,empty5,empty6,empty7,empty8,empty9,empty10,empty11,empty12\r\n", 702 | "10.0,,,,,,,,,,,,,,,,,Jan 1972,Jan 1972,,Month,,,,,,,,,,,,,0,\r\n", 703 | "30.0,,,,,,,,,,,,,,,,,Aug 1972,Aug 1972,,Month,,,,,,,,,,,,,0,\r\n", 704 | "2.0,,,,,,,,,,,,,,,,,Feb 1989,Feb 1989,,Month,,,,,,,,,,,,,0,\r\n", 705 | "88.0,,,,,,,,,,,,,,,,,Mar 1989,Mar 1989,,Month,,,,,,,,,,,,,0,\r\n", 706 | "96.0,,,,,,,,,,,,,,,,,Jun 1989,Jun 1989,,Month,,,,,,,,,,,,,0,\r\n", 707 | "8.0,,,,,,,,,,,,,,,,,Dec 1989,Dec 1989,,Month,,,,,,,,,,,,,0,\r\n", 708 | "*********,6\r\n", 709 | "\n" 710 | ] 711 | } 712 | ], 713 | "source": [ 714 | "from databaker.framework import *\n", 715 | "tab = loadxlstabs(\"example1.xls\", sheetids=\"stones\", verbose=False)[0]\n", 716 | "\n", 717 | "cs = ConversionSegment(tab.filter(\"cost\").fill(DOWN), [\n", 718 | " HDim(tab.filter(\"Year\").fill(DOWN).is_not_whitespace(), \"year\", CLOSEST, UP),\n", 719 | " HDim(tab.filter(\"Month\").fill(DOWN).is_not_whitespace(), \"month\", DIRECTLY, LEFT)\n", 720 | " ])\n", 721 | "\n", 722 | "\n", 723 | "###################\n", 724 | "# savepreviewhtml(cs) # <-- uncomment this to see the interactive table\n", 725 | "\n", 726 | "dcs = cs.topandas()\n", 727 | "# print(dcs) # uncomment to see the table\n", 728 | "\n", 729 | "# concatenate the month and year into a time\n", 730 | "dcs[\"TIME\"] = dcs.month + \" \" + dcs.year\n", 731 | "pdguessforceTIMEUNIT(dcs) # <-- fixes the date format (removing the '.0's on the years)\n", 732 | "# print(dcs) # uncomment to see the table at this point\n", 733 | "\n", 734 | "# delete the now redundant columns \n", 735 | "dcs.drop(['year', \"month\"], axis=1, inplace=True)\n", 736 | "#print(dcs) # uncomment to see pandas table\n", 737 | "\n", 738 | "# Output the finished 
WDA file where the dates should all work!\n", 739 | "print(writetechnicalCSV(None, dcs))" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "# Downloading excel and unzipping files\n", 747 | "\n", 748 | "If you are doing work on a computer that can actually be done by the computer, then you are not doing real work. \n", 749 | "\n", 750 | "Please automate the webscraping and unzipping of files. \n", 751 | "\n", 752 | "Here are some quick methods for downloading multiple excel spreadsheets linked to from [this page](https://www.ons.gov.uk/businessindustryandtrade/constructionindustry/datasets/outputintheconstructionindustry/current).\n" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 11, 758 | "metadata": { 759 | "collapsed": false 760 | }, 761 | "outputs": [ 762 | { 763 | "name": "stdout", 764 | "output_type": "stream", 765 | "text": [ 766 | "Downloaded a webpage with 31071 bytes\n" 767 | ] 768 | } 769 | ], 770 | "source": [ 771 | "import urllib, re, os\n", 772 | "\n", 773 | "# url containing the index of a set of spreadsheets\n", 774 | "ddurl = \"https://www.ons.gov.uk/businessindustryandtrade/constructionindustry/datasets/outputintheconstructionindustry/current\"\n", 775 | "req1 = urllib.request.Request(ddurl, headers={'User-Agent' : \"Sensible code\"}) \n", 776 | "dhtml = urllib.request.urlopen(req1).read().decode(\"utf8\")\n", 777 | "print(\"Downloaded a webpage with\", len(dhtml), \"bytes\")" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": 20, 783 | "metadata": { 784 | "collapsed": false 785 | }, 786 | "outputs": [], 787 | "source": [ 788 | "# make the download directory\n", 789 | "dfiles = \"downloaddir\"\n", 790 | "if not os.path.isdir(dfiles):\n", 791 | " print(\"making directory\", dfiles)\n", 792 | " os.mkdir(dfiles)" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": 30, 798 | "metadata": { 799 | "collapsed": false 800 | }, 801 | "outputs": [], 802 | "source": [ 803 | "# quick and dirty regular expression for pulling out the links to relevant xls spreadsheets\n", 804 | "xllinklist = re.findall('href=\"(/file\\?uri=/businessindustryandtrade.*?/([^/\"]*\\.xls))\"', dhtml)\n", 805 | " \n", 806 | "for xl, xln in xllinklist:\n", 807 | " lxln = os.path.join(dfiles, xln)\n", 808 | " if os.path.exists(lxln):\n", 809 | " continue # <-- we avoid downloading the same file a second time, in this case\n", 810 | " furl = urllib.parse.urljoin(ddurl, xl)\n", 811 | " req = urllib.request.Request(furl, headers={'User-Agent' : \"Sensible code\"}) \n", 812 | " xp = urllib.request.urlopen(req).read()\n", 813 | " print(\"Downloading\", xln, len(xp), \"bytes\")\n", 814 | " fout = open(lxln, \"wb\")\n", 815 | " fout.write(xp)\n", 816 | " fout.close()\n" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 31, 822 | "metadata": { 823 | "collapsed": false 824 | }, 825 | "outputs": [], 826 | "source": [ 827 | "fnames = [ os.path.join(dfiles, f) for f in os.listdir(dfiles) if f[-4:] == '.xls' ]\n", 828 | "\n", 829 | "print(\"Your list of xls files is:\\n\", \"\\n \".join(fnames))\n" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "## What to do when you have zip files\n", 837 | "\n", 838 | "If you find yourself downloading zipfiles and manually instructing the computer to unzip each file, you should think about making the computer do the work itself.\n", 839 | "\n", 840 | "An example of zipfiles 
containing excel spreadsheets can be found on [this page](https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/workplacepensions/datasets/annualsurveyofhoursandearningspensiontablespensiontypebyagegroupandbygrossweeklyearningsbandsp1).\n", 841 | "\n", 842 | "First job is to download one of these files, as we did above:" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": 36, 848 | "metadata": { 849 | "collapsed": false 850 | }, 851 | "outputs": [ 852 | { 853 | "name": "stdout", 854 | "output_type": "stream", 855 | "text": [ 856 | "We are about to download the file:\n", 857 | " https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/workplacepensions/datasets/annualsurveyofhoursandearningspensiontablespensiontypebyagegroupandbygrossweeklyearningsbandsp1/2015/2015provisionaltablep1.zip\n", 858 | "downloaded.zip is 44560 bytes long.\n" 859 | ] 860 | } 861 | ], 862 | "source": [ 863 | "import urllib, re\n", 864 | "\n", 865 | "# fetch the front page and find the link to the zip file we want\n", 866 | "iurl = \"https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/workplacepensions/datasets/annualsurveyofhoursandearningspensiontablespensiontypebyagegroupandbygrossweeklyearningsbandsp1\"\n", 867 | "req = urllib.request.Request(iurl, headers={'User-Agent' : \"Sensible Code\"}) \n", 868 | "ipage = urllib.request.urlopen(req).read()\n", 869 | "\n", 870 | "# search the link to the zip file and \"join\" against the baseurl to get the full url (there's a space -> %20 bug problem)\n", 871 | "zyears = [ urllib.parse.urljoin(iurl, z.replace(\" \", \"%20\")) for z in re.findall(' 8 | 9 | Where there are multiple sets of dimensions one set must come first 10 | 11 | e.g. 12 | 13 | 4,x=5,y=6 14 | 7,x=12,y=25 15 | 0,x=6,z=0 16 | 12,x=9,z=99 17 | 18 | note all x/y before any x/z 19 | 20 | Likewise if some obs are counts, and some are percentages 21 | 22 | Likewise keep together obs with the same number of dimensions 23 | 24 | 25 | We only care about: 26 | 27 | 1 obs 28 | 2 datamarking -- if obs isn't a number, make it a datamarking 29 | ? geography -- if not E-code, stick it in anyway. 
30 | 18 time value -- will receive a msg 31 | 19 copy 18 32 | 21 time type -- eg "year" 33 | 36+8n dim name -- eg "gender" 34 | 37+8n copy 36 35 | 39+8n dim value -- eg "male" 36 | 40+8n copy 39 37 | 38 | -- discussion with Rob 2015-01-20 39 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | nose==1.3.7 2 | docopt==0.6.2 3 | xypath==1.1.1 4 | xlutils==2.0.0 5 | pyhamcrest==1.9.0 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | long_desc = """ 4 | Transform Excel spreadsheets 5 | """ 6 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers for classifiers 7 | 8 | conf = dict( 9 | name='databaker', 10 | version='2.0.0', 11 | description="DataBaker, part of QuickCode for ONS", 12 | long_description=long_desc, 13 | classifiers=[ 14 | "Development Status :: 3 - Alpha", 15 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", 16 | "Operating System :: POSIX :: Linux", 17 | "Operating System :: Microsoft :: Windows", 18 | "Programming Language :: Python :: 3.4", 19 | "Programming Language :: Python :: 3.5", 20 | "Programming Language :: Python :: 3.6", 21 | ], 22 | keywords='', 23 | author='The Sensible Code Company Ltd', 24 | author_email='feedback@sensiblecode.io', 25 | url='https://github.com/sensiblecodeio/databaker', 26 | license='AGPL', 27 | packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), 28 | namespace_packages=[], 29 | include_package_data=False, 30 | zip_safe=False, 31 | install_requires=['docopt', 'xypath>=1.1.0', 'xlutils', 'pyhamcrest'], 32 | tests_require=[], 33 | entry_points={ 34 | 'console_scripts': [ 35 | 'databaker_nbconvert = databaker.databaker_nbconvert:main', 36 | ] 37 | }, 38 | ) 39 | 40 | 41 | if __name__ == '__main__': 42 | setup(**conf) 43 | --------------------------------------------------------------------------------