├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── codeql-analysis.yml
├── .gitignore
├── LICENCE
├── README.md
├── databaker
│   ├── __init__.py
│   ├── constants.py
│   ├── databaker_nbconvert.py
│   ├── framework.py
│   ├── jupybakecsv.py
│   ├── jupybakehtml.py
│   ├── jupybakeutils.py
│   ├── overrides.py
│   ├── richxlrd
│   │   ├── __init__.py
│   │   ├── rich.xls
│   │   └── richxlrd.py
│   ├── structure_csv_default.py
│   ├── tutorial.py
│   └── tutorial
│       ├── Finding_your_way.ipynb
│       ├── Introduction.ipynb
│       ├── Real_world_example.ipynb
│       ├── blank_template.ipynb
│       ├── construction_output_tables.ipynb
│       ├── example1.xls
│       ├── nbconvert_demo.ipynb
│       ├── ott.xls
│       └── tutorial_reference.ipynb
├── docwdaspecs
│   ├── Interface Specification for Generic Load.doc
│   ├── wda.txt
│   └── wda_notes.txt
├── requirements.txt
└── setup.py
/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | name: "CodeQL" 7 | 8 | on: 9 | push: 10 | branches: [master] 11 | pull_request: 12 | # The branches below must be a subset of the branches above 13 | branches: [master] 14 | schedule: 15 | - cron: '0 3 * * 2' 16 | 17 | jobs: 18 | analyze: 19 | name: Analyze 20 | runs-on: ubuntu-latest 21 | 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | # Override automatic language detection by changing the below list 26 | # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python'] 27 | language: ['python'] 28 | # Learn more... 29 | # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection 30 | 31 | steps: 32 | - name: Checkout repository 33 | uses: actions/checkout@v3 34 | with: 35 | # We must fetch at least the immediate parents so that if this is 36 | # a pull request then we can checkout the head. 37 | fetch-depth: 2 38 | 39 | # If this run was triggered by a pull request event, then checkout 40 | # the head of the pull request instead of the merge commit. 41 | - run: git checkout HEAD^2 42 | if: ${{ github.event_name == 'pull_request' }} 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v2 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v2 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 
60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v2 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.swp 3 | *.pyc 4 | venv 5 | xypath 6 | .~lock* 7 | *.egg-info 8 | 9 | /test/t_out.xls 10 | /test/t_rich.xls 11 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | This software is Copyright (c) 2016 The Sensible Code Company Limited. 2 | 3 | Unless otherwise stated in particular files or directories, this 4 | software is free software; you can redistribute it and/or modify it 5 | under the terms of the GNU Affero General Public License as published 6 | by the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This software is distributed in the hope that it will be useful, but 10 | WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | Affero General Public License for more details. 13 | 14 | Information about the GNU Affero GPL: 15 | http://www.fsf.org/licensing/licenses/agpl-3.0.html 16 | 17 | A copy of the GNU Affero General Public License follows. 18 | 19 | ----------------------------------------------------------------------- 20 | 21 | GNU AFFERO GENERAL PUBLIC LICENSE 22 | Version 3, 19 November 2007 23 | 24 | Copyright (C) 2007 Free Software Foundation, Inc. 25 | Everyone is permitted to copy and distribute verbatim copies 26 | of this license document, but changing it is not allowed. 27 | 28 | Preamble 29 | 30 | The GNU Affero General Public License is a free, copyleft license for 31 | software and other kinds of works, specifically designed to ensure 32 | cooperation with the community in the case of network server software. 33 | 34 | The licenses for most software and other practical works are designed 35 | to take away your freedom to share and change the works. By contrast, 36 | our General Public Licenses are intended to guarantee your freedom to 37 | share and change all versions of a program--to make sure it remains free 38 | software for all its users. 39 | 40 | When we speak of free software, we are referring to freedom, not 41 | price. Our General Public Licenses are designed to make sure that you 42 | have the freedom to distribute copies of free software (and charge for 43 | them if you wish), that you receive source code or can get it if you 44 | want it, that you can change the software or use pieces of it in new 45 | free programs, and that you know you can do these things. 46 | 47 | Developers that use our General Public Licenses protect your rights 48 | with two steps: (1) assert copyright on the software, and (2) offer 49 | you this License which gives you legal permission to copy, distribute 50 | and/or modify the software. 
51 | 52 | A secondary benefit of defending all users' freedom is that 53 | improvements made in alternate versions of the program, if they 54 | receive widespread use, become available for other developers to 55 | incorporate. Many developers of free software are heartened and 56 | encouraged by the resulting cooperation. However, in the case of 57 | software used on network servers, this result may fail to come about. 58 | The GNU General Public License permits making a modified version and 59 | letting the public access it on a server without ever releasing its 60 | source code to the public. 61 | 62 | The GNU Affero General Public License is designed specifically to 63 | ensure that, in such cases, the modified source code becomes available 64 | to the community. It requires the operator of a network server to 65 | provide the source code of the modified version running there to the 66 | users of that server. Therefore, public use of a modified version, on 67 | a publicly accessible server, gives the public access to the source 68 | code of the modified version. 69 | 70 | An older license, called the Affero General Public License and 71 | published by Affero, was designed to accomplish similar goals. This is 72 | a different license, not a version of the Affero GPL, but Affero has 73 | released a new version of the Affero GPL which permits relicensing under 74 | this license. 75 | 76 | The precise terms and conditions for copying, distribution and 77 | modification follow. 78 | 79 | TERMS AND CONDITIONS 80 | 81 | 0. Definitions. 82 | 83 | "This License" refers to version 3 of the GNU Affero General Public License. 84 | 85 | "Copyright" also means copyright-like laws that apply to other kinds of 86 | works, such as semiconductor masks. 87 | 88 | "The Program" refers to any copyrightable work licensed under this 89 | License. Each licensee is addressed as "you". "Licensees" and 90 | "recipients" may be individuals or organizations. 91 | 92 | To "modify" a work means to copy from or adapt all or part of the work 93 | in a fashion requiring copyright permission, other than the making of an 94 | exact copy. The resulting work is called a "modified version" of the 95 | earlier work or a work "based on" the earlier work. 96 | 97 | A "covered work" means either the unmodified Program or a work based 98 | on the Program. 99 | 100 | To "propagate" a work means to do anything with it that, without 101 | permission, would make you directly or secondarily liable for 102 | infringement under applicable copyright law, except executing it on a 103 | computer or modifying a private copy. Propagation includes copying, 104 | distribution (with or without modification), making available to the 105 | public, and in some countries other activities as well. 106 | 107 | To "convey" a work means any kind of propagation that enables other 108 | parties to make or receive copies. Mere interaction with a user through 109 | a computer network, with no transfer of a copy, is not conveying. 110 | 111 | An interactive user interface displays "Appropriate Legal Notices" 112 | to the extent that it includes a convenient and prominently visible 113 | feature that (1) displays an appropriate copyright notice, and (2) 114 | tells the user that there is no warranty for the work (except to the 115 | extent that warranties are provided), that licensees may convey the 116 | work under this License, and how to view a copy of this License. 
If 117 | the interface presents a list of user commands or options, such as a 118 | menu, a prominent item in the list meets this criterion. 119 | 120 | 1. Source Code. 121 | 122 | The "source code" for a work means the preferred form of the work 123 | for making modifications to it. "Object code" means any non-source 124 | form of a work. 125 | 126 | A "Standard Interface" means an interface that either is an official 127 | standard defined by a recognized standards body, or, in the case of 128 | interfaces specified for a particular programming language, one that 129 | is widely used among developers working in that language. 130 | 131 | The "System Libraries" of an executable work include anything, other 132 | than the work as a whole, that (a) is included in the normal form of 133 | packaging a Major Component, but which is not part of that Major 134 | Component, and (b) serves only to enable use of the work with that 135 | Major Component, or to implement a Standard Interface for which an 136 | implementation is available to the public in source code form. A 137 | "Major Component", in this context, means a major essential component 138 | (kernel, window system, and so on) of the specific operating system 139 | (if any) on which the executable work runs, or a compiler used to 140 | produce the work, or an object code interpreter used to run it. 141 | 142 | The "Corresponding Source" for a work in object code form means all 143 | the source code needed to generate, install, and (for an executable 144 | work) run the object code and to modify the work, including scripts to 145 | control those activities. However, it does not include the work's 146 | System Libraries, or general-purpose tools or generally available free 147 | programs which are used unmodified in performing those activities but 148 | which are not part of the work. For example, Corresponding Source 149 | includes interface definition files associated with source files for 150 | the work, and the source code for shared libraries and dynamically 151 | linked subprograms that the work is specifically designed to require, 152 | such as by intimate data communication or control flow between those 153 | subprograms and other parts of the work. 154 | 155 | The Corresponding Source need not include anything that users 156 | can regenerate automatically from other parts of the Corresponding 157 | Source. 158 | 159 | The Corresponding Source for a work in source code form is that 160 | same work. 161 | 162 | 2. Basic Permissions. 163 | 164 | All rights granted under this License are granted for the term of 165 | copyright on the Program, and are irrevocable provided the stated 166 | conditions are met. This License explicitly affirms your unlimited 167 | permission to run the unmodified Program. The output from running a 168 | covered work is covered by this License only if the output, given its 169 | content, constitutes a covered work. This License acknowledges your 170 | rights of fair use or other equivalent, as provided by copyright law. 171 | 172 | You may make, run and propagate covered works that you do not 173 | convey, without conditions so long as your license otherwise remains 174 | in force. You may convey covered works to others for the sole purpose 175 | of having them make modifications exclusively for you, or provide you 176 | with facilities for running those works, provided that you comply with 177 | the terms of this License in conveying all material for which you do 178 | not control copyright. 
Those thus making or running the covered works 179 | for you must do so exclusively on your behalf, under your direction 180 | and control, on terms that prohibit them from making any copies of 181 | your copyrighted material outside their relationship with you. 182 | 183 | Conveying under any other circumstances is permitted solely under 184 | the conditions stated below. Sublicensing is not allowed; section 10 185 | makes it unnecessary. 186 | 187 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 188 | 189 | No covered work shall be deemed part of an effective technological 190 | measure under any applicable law fulfilling obligations under article 191 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 192 | similar laws prohibiting or restricting circumvention of such 193 | measures. 194 | 195 | When you convey a covered work, you waive any legal power to forbid 196 | circumvention of technological measures to the extent such circumvention 197 | is effected by exercising rights under this License with respect to 198 | the covered work, and you disclaim any intention to limit operation or 199 | modification of the work as a means of enforcing, against the work's 200 | users, your or third parties' legal rights to forbid circumvention of 201 | technological measures. 202 | 203 | 4. Conveying Verbatim Copies. 204 | 205 | You may convey verbatim copies of the Program's source code as you 206 | receive it, in any medium, provided that you conspicuously and 207 | appropriately publish on each copy an appropriate copyright notice; 208 | keep intact all notices stating that this License and any 209 | non-permissive terms added in accord with section 7 apply to the code; 210 | keep intact all notices of the absence of any warranty; and give all 211 | recipients a copy of this License along with the Program. 212 | 213 | You may charge any price or no price for each copy that you convey, 214 | and you may offer support or warranty protection for a fee. 215 | 216 | 5. Conveying Modified Source Versions. 217 | 218 | You may convey a work based on the Program, or the modifications to 219 | produce it from the Program, in the form of source code under the 220 | terms of section 4, provided that you also meet all of these conditions: 221 | 222 | a) The work must carry prominent notices stating that you modified 223 | it, and giving a relevant date. 224 | 225 | b) The work must carry prominent notices stating that it is 226 | released under this License and any conditions added under section 227 | 7. This requirement modifies the requirement in section 4 to 228 | "keep intact all notices". 229 | 230 | c) You must license the entire work, as a whole, under this 231 | License to anyone who comes into possession of a copy. This 232 | License will therefore apply, along with any applicable section 7 233 | additional terms, to the whole of the work, and all its parts, 234 | regardless of how they are packaged. This License gives no 235 | permission to license the work in any other way, but it does not 236 | invalidate such permission if you have separately received it. 237 | 238 | d) If the work has interactive user interfaces, each must display 239 | Appropriate Legal Notices; however, if the Program has interactive 240 | interfaces that do not display Appropriate Legal Notices, your 241 | work need not make them do so. 
242 | 243 | A compilation of a covered work with other separate and independent 244 | works, which are not by their nature extensions of the covered work, 245 | and which are not combined with it such as to form a larger program, 246 | in or on a volume of a storage or distribution medium, is called an 247 | "aggregate" if the compilation and its resulting copyright are not 248 | used to limit the access or legal rights of the compilation's users 249 | beyond what the individual works permit. Inclusion of a covered work 250 | in an aggregate does not cause this License to apply to the other 251 | parts of the aggregate. 252 | 253 | 6. Conveying Non-Source Forms. 254 | 255 | You may convey a covered work in object code form under the terms 256 | of sections 4 and 5, provided that you also convey the 257 | machine-readable Corresponding Source under the terms of this License, 258 | in one of these ways: 259 | 260 | a) Convey the object code in, or embodied in, a physical product 261 | (including a physical distribution medium), accompanied by the 262 | Corresponding Source fixed on a durable physical medium 263 | customarily used for software interchange. 264 | 265 | b) Convey the object code in, or embodied in, a physical product 266 | (including a physical distribution medium), accompanied by a 267 | written offer, valid for at least three years and valid for as 268 | long as you offer spare parts or customer support for that product 269 | model, to give anyone who possesses the object code either (1) a 270 | copy of the Corresponding Source for all the software in the 271 | product that is covered by this License, on a durable physical 272 | medium customarily used for software interchange, for a price no 273 | more than your reasonable cost of physically performing this 274 | conveying of source, or (2) access to copy the 275 | Corresponding Source from a network server at no charge. 276 | 277 | c) Convey individual copies of the object code with a copy of the 278 | written offer to provide the Corresponding Source. This 279 | alternative is allowed only occasionally and noncommercially, and 280 | only if you received the object code with such an offer, in accord 281 | with subsection 6b. 282 | 283 | d) Convey the object code by offering access from a designated 284 | place (gratis or for a charge), and offer equivalent access to the 285 | Corresponding Source in the same way through the same place at no 286 | further charge. You need not require recipients to copy the 287 | Corresponding Source along with the object code. If the place to 288 | copy the object code is a network server, the Corresponding Source 289 | may be on a different server (operated by you or a third party) 290 | that supports equivalent copying facilities, provided you maintain 291 | clear directions next to the object code saying where to find the 292 | Corresponding Source. Regardless of what server hosts the 293 | Corresponding Source, you remain obligated to ensure that it is 294 | available for as long as needed to satisfy these requirements. 295 | 296 | e) Convey the object code using peer-to-peer transmission, provided 297 | you inform other peers where the object code and Corresponding 298 | Source of the work are being offered to the general public at no 299 | charge under subsection 6d. 300 | 301 | A separable portion of the object code, whose source code is excluded 302 | from the Corresponding Source as a System Library, need not be 303 | included in conveying the object code work. 
304 | 305 | A "User Product" is either (1) a "consumer product", which means any 306 | tangible personal property which is normally used for personal, family, 307 | or household purposes, or (2) anything designed or sold for incorporation 308 | into a dwelling. In determining whether a product is a consumer product, 309 | doubtful cases shall be resolved in favor of coverage. For a particular 310 | product received by a particular user, "normally used" refers to a 311 | typical or common use of that class of product, regardless of the status 312 | of the particular user or of the way in which the particular user 313 | actually uses, or expects or is expected to use, the product. A product 314 | is a consumer product regardless of whether the product has substantial 315 | commercial, industrial or non-consumer uses, unless such uses represent 316 | the only significant mode of use of the product. 317 | 318 | "Installation Information" for a User Product means any methods, 319 | procedures, authorization keys, or other information required to install 320 | and execute modified versions of a covered work in that User Product from 321 | a modified version of its Corresponding Source. The information must 322 | suffice to ensure that the continued functioning of the modified object 323 | code is in no case prevented or interfered with solely because 324 | modification has been made. 325 | 326 | If you convey an object code work under this section in, or with, or 327 | specifically for use in, a User Product, and the conveying occurs as 328 | part of a transaction in which the right of possession and use of the 329 | User Product is transferred to the recipient in perpetuity or for a 330 | fixed term (regardless of how the transaction is characterized), the 331 | Corresponding Source conveyed under this section must be accompanied 332 | by the Installation Information. But this requirement does not apply 333 | if neither you nor any third party retains the ability to install 334 | modified object code on the User Product (for example, the work has 335 | been installed in ROM). 336 | 337 | The requirement to provide Installation Information does not include a 338 | requirement to continue to provide support service, warranty, or updates 339 | for a work that has been modified or installed by the recipient, or for 340 | the User Product in which it has been modified or installed. Access to a 341 | network may be denied when the modification itself materially and 342 | adversely affects the operation of the network or violates the rules and 343 | protocols for communication across the network. 344 | 345 | Corresponding Source conveyed, and Installation Information provided, 346 | in accord with this section must be in a format that is publicly 347 | documented (and with an implementation available to the public in 348 | source code form), and must require no special password or key for 349 | unpacking, reading or copying. 350 | 351 | 7. Additional Terms. 352 | 353 | "Additional permissions" are terms that supplement the terms of this 354 | License by making exceptions from one or more of its conditions. 355 | Additional permissions that are applicable to the entire Program shall 356 | be treated as though they were included in this License, to the extent 357 | that they are valid under applicable law. 
If additional permissions 358 | apply only to part of the Program, that part may be used separately 359 | under those permissions, but the entire Program remains governed by 360 | this License without regard to the additional permissions. 361 | 362 | When you convey a copy of a covered work, you may at your option 363 | remove any additional permissions from that copy, or from any part of 364 | it. (Additional permissions may be written to require their own 365 | removal in certain cases when you modify the work.) You may place 366 | additional permissions on material, added by you to a covered work, 367 | for which you have or can give appropriate copyright permission. 368 | 369 | Notwithstanding any other provision of this License, for material you 370 | add to a covered work, you may (if authorized by the copyright holders of 371 | that material) supplement the terms of this License with terms: 372 | 373 | a) Disclaiming warranty or limiting liability differently from the 374 | terms of sections 15 and 16 of this License; or 375 | 376 | b) Requiring preservation of specified reasonable legal notices or 377 | author attributions in that material or in the Appropriate Legal 378 | Notices displayed by works containing it; or 379 | 380 | c) Prohibiting misrepresentation of the origin of that material, or 381 | requiring that modified versions of such material be marked in 382 | reasonable ways as different from the original version; or 383 | 384 | d) Limiting the use for publicity purposes of names of licensors or 385 | authors of the material; or 386 | 387 | e) Declining to grant rights under trademark law for use of some 388 | trade names, trademarks, or service marks; or 389 | 390 | f) Requiring indemnification of licensors and authors of that 391 | material by anyone who conveys the material (or modified versions of 392 | it) with contractual assumptions of liability to the recipient, for 393 | any liability that these contractual assumptions directly impose on 394 | those licensors and authors. 395 | 396 | All other non-permissive additional terms are considered "further 397 | restrictions" within the meaning of section 10. If the Program as you 398 | received it, or any part of it, contains a notice stating that it is 399 | governed by this License along with a term that is a further 400 | restriction, you may remove that term. If a license document contains 401 | a further restriction but permits relicensing or conveying under this 402 | License, you may add to a covered work material governed by the terms 403 | of that license document, provided that the further restriction does 404 | not survive such relicensing or conveying. 405 | 406 | If you add terms to a covered work in accord with this section, you 407 | must place, in the relevant source files, a statement of the 408 | additional terms that apply to those files, or a notice indicating 409 | where to find the applicable terms. 410 | 411 | Additional terms, permissive or non-permissive, may be stated in the 412 | form of a separately written license, or stated as exceptions; 413 | the above requirements apply either way. 414 | 415 | 8. Termination. 416 | 417 | You may not propagate or modify a covered work except as expressly 418 | provided under this License. Any attempt otherwise to propagate or 419 | modify it is void, and will automatically terminate your rights under 420 | this License (including any patent licenses granted under the third 421 | paragraph of section 11). 
422 | 423 | However, if you cease all violation of this License, then your 424 | license from a particular copyright holder is reinstated (a) 425 | provisionally, unless and until the copyright holder explicitly and 426 | finally terminates your license, and (b) permanently, if the copyright 427 | holder fails to notify you of the violation by some reasonable means 428 | prior to 60 days after the cessation. 429 | 430 | Moreover, your license from a particular copyright holder is 431 | reinstated permanently if the copyright holder notifies you of the 432 | violation by some reasonable means, this is the first time you have 433 | received notice of violation of this License (for any work) from that 434 | copyright holder, and you cure the violation prior to 30 days after 435 | your receipt of the notice. 436 | 437 | Termination of your rights under this section does not terminate the 438 | licenses of parties who have received copies or rights from you under 439 | this License. If your rights have been terminated and not permanently 440 | reinstated, you do not qualify to receive new licenses for the same 441 | material under section 10. 442 | 443 | 9. Acceptance Not Required for Having Copies. 444 | 445 | You are not required to accept this License in order to receive or 446 | run a copy of the Program. Ancillary propagation of a covered work 447 | occurring solely as a consequence of using peer-to-peer transmission 448 | to receive a copy likewise does not require acceptance. However, 449 | nothing other than this License grants you permission to propagate or 450 | modify any covered work. These actions infringe copyright if you do 451 | not accept this License. Therefore, by modifying or propagating a 452 | covered work, you indicate your acceptance of this License to do so. 453 | 454 | 10. Automatic Licensing of Downstream Recipients. 455 | 456 | Each time you convey a covered work, the recipient automatically 457 | receives a license from the original licensors, to run, modify and 458 | propagate that work, subject to this License. You are not responsible 459 | for enforcing compliance by third parties with this License. 460 | 461 | An "entity transaction" is a transaction transferring control of an 462 | organization, or substantially all assets of one, or subdividing an 463 | organization, or merging organizations. If propagation of a covered 464 | work results from an entity transaction, each party to that 465 | transaction who receives a copy of the work also receives whatever 466 | licenses to the work the party's predecessor in interest had or could 467 | give under the previous paragraph, plus a right to possession of the 468 | Corresponding Source of the work from the predecessor in interest, if 469 | the predecessor has it or can get it with reasonable efforts. 470 | 471 | You may not impose any further restrictions on the exercise of the 472 | rights granted or affirmed under this License. For example, you may 473 | not impose a license fee, royalty, or other charge for exercise of 474 | rights granted under this License, and you may not initiate litigation 475 | (including a cross-claim or counterclaim in a lawsuit) alleging that 476 | any patent claim is infringed by making, using, selling, offering for 477 | sale, or importing the Program or any portion of it. 478 | 479 | 11. Patents. 480 | 481 | A "contributor" is a copyright holder who authorizes use under this 482 | License of the Program or a work on which the Program is based. 
The 483 | work thus licensed is called the contributor's "contributor version". 484 | 485 | A contributor's "essential patent claims" are all patent claims 486 | owned or controlled by the contributor, whether already acquired or 487 | hereafter acquired, that would be infringed by some manner, permitted 488 | by this License, of making, using, or selling its contributor version, 489 | but do not include claims that would be infringed only as a 490 | consequence of further modification of the contributor version. For 491 | purposes of this definition, "control" includes the right to grant 492 | patent sublicenses in a manner consistent with the requirements of 493 | this License. 494 | 495 | Each contributor grants you a non-exclusive, worldwide, royalty-free 496 | patent license under the contributor's essential patent claims, to 497 | make, use, sell, offer for sale, import and otherwise run, modify and 498 | propagate the contents of its contributor version. 499 | 500 | In the following three paragraphs, a "patent license" is any express 501 | agreement or commitment, however denominated, not to enforce a patent 502 | (such as an express permission to practice a patent or covenant not to 503 | sue for patent infringement). To "grant" such a patent license to a 504 | party means to make such an agreement or commitment not to enforce a 505 | patent against the party. 506 | 507 | If you convey a covered work, knowingly relying on a patent license, 508 | and the Corresponding Source of the work is not available for anyone 509 | to copy, free of charge and under the terms of this License, through a 510 | publicly available network server or other readily accessible means, 511 | then you must either (1) cause the Corresponding Source to be so 512 | available, or (2) arrange to deprive yourself of the benefit of the 513 | patent license for this particular work, or (3) arrange, in a manner 514 | consistent with the requirements of this License, to extend the patent 515 | license to downstream recipients. "Knowingly relying" means you have 516 | actual knowledge that, but for the patent license, your conveying the 517 | covered work in a country, or your recipient's use of the covered work 518 | in a country, would infringe one or more identifiable patents in that 519 | country that you have reason to believe are valid. 520 | 521 | If, pursuant to or in connection with a single transaction or 522 | arrangement, you convey, or propagate by procuring conveyance of, a 523 | covered work, and grant a patent license to some of the parties 524 | receiving the covered work authorizing them to use, propagate, modify 525 | or convey a specific copy of the covered work, then the patent license 526 | you grant is automatically extended to all recipients of the covered 527 | work and works based on it. 528 | 529 | A patent license is "discriminatory" if it does not include within 530 | the scope of its coverage, prohibits the exercise of, or is 531 | conditioned on the non-exercise of one or more of the rights that are 532 | specifically granted under this License. 
You may not convey a covered 533 | work if you are a party to an arrangement with a third party that is 534 | in the business of distributing software, under which you make payment 535 | to the third party based on the extent of your activity of conveying 536 | the work, and under which the third party grants, to any of the 537 | parties who would receive the covered work from you, a discriminatory 538 | patent license (a) in connection with copies of the covered work 539 | conveyed by you (or copies made from those copies), or (b) primarily 540 | for and in connection with specific products or compilations that 541 | contain the covered work, unless you entered into that arrangement, 542 | or that patent license was granted, prior to 28 March 2007. 543 | 544 | Nothing in this License shall be construed as excluding or limiting 545 | any implied license or other defenses to infringement that may 546 | otherwise be available to you under applicable patent law. 547 | 548 | 12. No Surrender of Others' Freedom. 549 | 550 | If conditions are imposed on you (whether by court order, agreement or 551 | otherwise) that contradict the conditions of this License, they do not 552 | excuse you from the conditions of this License. If you cannot convey a 553 | covered work so as to satisfy simultaneously your obligations under this 554 | License and any other pertinent obligations, then as a consequence you may 555 | not convey it at all. For example, if you agree to terms that obligate you 556 | to collect a royalty for further conveying from those to whom you convey 557 | the Program, the only way you could satisfy both those terms and this 558 | License would be to refrain entirely from conveying the Program. 559 | 560 | 13. Remote Network Interaction; Use with the GNU General Public License. 561 | 562 | Notwithstanding any other provision of this License, if you modify the 563 | Program, your modified version must prominently offer all users 564 | interacting with it remotely through a computer network (if your version 565 | supports such interaction) an opportunity to receive the Corresponding 566 | Source of your version by providing access to the Corresponding Source 567 | from a network server at no charge, through some standard or customary 568 | means of facilitating copying of software. This Corresponding Source 569 | shall include the Corresponding Source for any work covered by version 3 570 | of the GNU General Public License that is incorporated pursuant to the 571 | following paragraph. 572 | 573 | Notwithstanding any other provision of this License, you have 574 | permission to link or combine any covered work with a work licensed 575 | under version 3 of the GNU General Public License into a single 576 | combined work, and to convey the resulting work. The terms of this 577 | License will continue to apply to the part which is the covered work, 578 | but the work with which it is combined will remain governed by version 579 | 3 of the GNU General Public License. 580 | 581 | 14. Revised Versions of this License. 582 | 583 | The Free Software Foundation may publish revised and/or new versions of 584 | the GNU Affero General Public License from time to time. Such new versions 585 | will be similar in spirit to the present version, but may differ in detail to 586 | address new problems or concerns. 587 | 588 | Each version is given a distinguishing version number. 
If the 589 | Program specifies that a certain numbered version of the GNU Affero General 590 | Public License "or any later version" applies to it, you have the 591 | option of following the terms and conditions either of that numbered 592 | version or of any later version published by the Free Software 593 | Foundation. If the Program does not specify a version number of the 594 | GNU Affero General Public License, you may choose any version ever published 595 | by the Free Software Foundation. 596 | 597 | If the Program specifies that a proxy can decide which future 598 | versions of the GNU Affero General Public License can be used, that proxy's 599 | public statement of acceptance of a version permanently authorizes you 600 | to choose that version for the Program. 601 | 602 | Later license versions may give you additional or different 603 | permissions. However, no additional obligations are imposed on any 604 | author or copyright holder as a result of your choosing to follow a 605 | later version. 606 | 607 | 15. Disclaimer of Warranty. 608 | 609 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 610 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 611 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 612 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 613 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 614 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 615 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 616 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 617 | 618 | 16. Limitation of Liability. 619 | 620 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 621 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 622 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 623 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 624 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 625 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 626 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 627 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 628 | SUCH DAMAGES. 629 | 630 | 17. Interpretation of Sections 15 and 16. 631 | 632 | If the disclaimer of warranty and limitation of liability provided 633 | above cannot be given local legal effect according to their terms, 634 | reviewing courts shall apply local law that most closely approximates 635 | an absolute waiver of all civil liability in connection with the 636 | Program, unless a warranty or assumption of liability accompanies a 637 | copy of the Program in return for a fee. 638 | 639 | END OF TERMS AND CONDITIONS 640 | 641 | How to Apply These Terms to Your New Programs 642 | 643 | If you develop a new program, and you want it to be of the greatest 644 | possible use to the public, the best way to achieve this is to make it 645 | free software which everyone can redistribute and change under these terms. 646 | 647 | To do so, attach the following notices to the program. It is safest 648 | to attach them to the start of each source file to most effectively 649 | state the exclusion of warranty; and each file should have at least 650 | the "copyright" line and a pointer to where the full notice is found. 
651 | 652 | <one line to give the program's name and a brief idea of what it does.> 653 | Copyright (C) <year> <name of author> 654 | 655 | This program is free software: you can redistribute it and/or modify 656 | it under the terms of the GNU Affero General Public License as published by 657 | the Free Software Foundation, either version 3 of the License, or 658 | (at your option) any later version. 659 | 660 | This program is distributed in the hope that it will be useful, 661 | but WITHOUT ANY WARRANTY; without even the implied warranty of 662 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 663 | GNU Affero General Public License for more details. 664 | 665 | You should have received a copy of the GNU Affero General Public License 666 | along with this program. If not, see <https://www.gnu.org/licenses/>. 667 | 668 | Also add information on how to contact you by electronic and paper mail. 669 | 670 | If your software can interact with users remotely through a computer 671 | network, you should also make sure that it provides a way for users to 672 | get its source. For example, if your program is a web application, its 673 | interface could display a "Source" link that leads users to an archive 674 | of the code. There are many ways you could offer source, and different 675 | solutions will be better for different programs; see section 13 for the 676 | specific requirements. 677 | 678 | You should also get your employer (if you work as a programmer) or school, 679 | if any, to sign a "copyright disclaimer" for the program, if necessary. 680 | For more information on this, and how to apply and follow the GNU AGPL, see 681 | <https://www.gnu.org/licenses/>. 682 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Databaker 2 | 3 | A Jupyter notebook tool for converting data that is laid out in a formatted Excel 4 | spreadsheet into a normalized form for use by databases. 5 | 6 | It depends on [okfn/messytables](https://github.com/okfn/messytables) and 7 | [sensiblecodeio/xypath](https://github.com/sensiblecodeio/xypath). 8 | 9 | Python 3.4+ is supported. 10 | 11 | ## Starting up 12 | 13 | ### For development 14 | 15 | To install for development, the easiest way is to create a virtualenv and 16 | activate it: 17 | 18 | `source bin/activate` 19 | 20 | and then type 21 | 22 | `pip install -e git+https://github.com/sensiblecodeio/databaker.git#egg=databaker` 23 | 24 | This will install the code into `src/databaker`, where you can edit and commit it. 25 | 26 | ### For normal use 27 | 28 | Install with `pip install databaker`. 29 | 30 | ## Usage 31 | 32 | Launch a Jupyter notebook: 33 | 34 | `jupyter notebook` 35 | 36 | and then follow the tutorials as described below. 37 | 38 | ## Documentation 39 | 40 | The current documentation is in the form of Jupyter notebooks located 41 | inside the [tutorial](databaker/tutorial) directory. 42 | 43 | You can access these directly by creating a new Jupyter notebook and 44 | running the following in a Jupyter cell: 45 | 46 | ``` 47 | from databaker.tutorial import tutorial 48 | tutorial() 49 | ``` 50 | 51 | which will copy the tutorials to your current directory and provide 52 | links to these copied notebooks. 53 | 54 | ## Authors 55 | 56 | Made by the [Sensible Code Company](http://sensiblecode.io) on behalf of the 57 | [Office for National Statistics](https://www.ons.gov.uk/) (UK).
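## Quick example

The general shape of a conversion recipe, using the functions exported by `databaker.framework`. This is a minimal sketch, not a definitive recipe: the file name, cell references and dimension names are illustrative, and the `ConversionSegment` argument order follows the tutorial notebooks, which remain the authoritative reference.

```
from databaker.framework import *

# load every sheet of the workbook as a list of xypath tables
tabs = loadxlstabs("example1.xls")
tab = tabs[0]

# the block of observation cells, and the headers that describe them
observations = tab.excel_ref('B4').expand(DOWN).expand(RIGHT).is_not_blank()
dimensions = [
    HDim(tab.excel_ref('B3').expand(RIGHT), "Year", DIRECTLY, ABOVE),
    HDim(tab.excel_ref('A4').expand(DOWN), "Category", DIRECTLY, LEFT),
    HDimConst("Units", "count"),    # a constant-valued dimension
]

cs = ConversionSegment(tab, dimensions, observations)
savepreviewhtml(cs)                  # colour-coded preview in the notebook
writetechnicalCSV("output.csv", cs)  # write the WDA-format CSV
```

`writetechnicalCSV` accepts either a single segment or a list of them, so several `ConversionSegment`s from different sheets can be written into one file.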
58 | -------------------------------------------------------------------------------- /databaker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cantabular/databaker/930b65e85038a3afd59574bbdc1d4cb79ce9b6ad/databaker/__init__.py -------------------------------------------------------------------------------- /databaker/constants.py: -------------------------------------------------------------------------------- 1 | from xypath import DOWN, UP, LEFT, RIGHT 2 | from hamcrest import * 3 | 4 | # If there's a custom template, use it; otherwise use the default. 5 | try: 6 | from structure_csv_user import * 7 | import structure_csv_user as template 8 | except ImportError: 9 | from .structure_csv_default import * 10 | from . import structure_csv_default as template 11 | 12 | 13 | ABOVE = UP 14 | BELOW = DOWN 15 | 16 | DIRECTLY = True 17 | CLOSEST = False 18 | 19 | 20 | -------------------------------------------------------------------------------- /databaker/databaker_nbconvert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import subprocess 4 | import sys 5 | 6 | 7 | def main(argv=sys.argv[1:]): 8 | if argv is None or len(argv) == 0 or len(argv) > 2: 9 | print("Usage: databaker_process.py <notebook file> <input file>") 10 | print() 11 | print("<input file> is optional; it replaces DATABAKER_INPUT_FILE") 12 | print("in the notebook.") 13 | print("The input file should also be in the same directory as the") 14 | print("notebook.") 15 | sys.exit(1) 16 | 17 | process_env = os.environ.copy() 18 | 19 | if len(argv) == 2: 20 | process_env['DATABAKER_INPUT_FILE'] = argv[1] 21 | 22 | # TODO get custom templates working; according to this: 23 | # https://github.com/jupyter/nbconvert/issues/391 24 | # they should work, but I get TemplateNotFound when using an absolute path 25 | # for the template.
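    # Illustrative invocation (the file names here are hypothetical):
    #
    #   python databaker_nbconvert.py my_recipe.ipynb my_input.xls
    #
    # This executes my_recipe.ipynb with DATABAKER_INPUT_FILE=my_input.xls in
    # its environment and, by nbconvert's default naming, writes my_recipe.html
    # alongside the notebook.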
26 | cmd_line = ['jupyter', 'nbconvert', '--to', 'html', '--execute', argv[0]] 27 | print("Running:", ' '.join(cmd_line)) 28 | subprocess.call(args=cmd_line, env=process_env) 29 | 30 | 31 | if __name__ == '__main__': 32 | main() 33 | -------------------------------------------------------------------------------- /databaker/framework.py: -------------------------------------------------------------------------------- 1 | import os, warnings 2 | import xypath 3 | import xypath.loader 4 | import databaker.constants 5 | from databaker.constants import * # also brings in template 6 | import databaker.overrides as overrides # warning: injects additional class functions into xypath and messytables 7 | 8 | # core classes and functionality 9 | from databaker.jupybakeutils import HDim, HDimConst, ConversionSegment, Ldatetimeunitloose, Ldatetimeunitforce, pdguessforceTIMEUNIT 10 | from databaker.jupybakecsv import writetechnicalCSV, readtechnicalCSV 11 | from databaker.jupybakehtml import savepreviewhtml 12 | 13 | # this lot should be deprecated 14 | from databaker.jupybakecsv import headersfromwdasegment, extraheaderscheck, checktheconstantdimensions, checksegmentobsvalues 15 | from databaker.jupybakecsv import wdamsgstrings, CompareConversionSegments 16 | 17 | def loadxlstabs(inputfile, sheetids="*", verbose=True): 18 | if verbose: 19 | print("Loading %s which has size %d bytes" % (inputfile, os.path.getsize(inputfile))) 20 | tableset = xypath.loader.table_set(inputfile, extension='xls') 21 | tabs = list(xypath.loader.get_sheets(tableset, sheetids)) 22 | tabnames = [ tab.name for tab in tabs ] 23 | if verbose: 24 | print("Table names: %s" % str(tabnames)) 25 | 26 | if sheetids != "*": 27 | if type(sheetids) == str: 28 | sheetids = [sheetids] 29 | assert type(sheetids) in [list, tuple], ("What type is this?", type(sheetids)) 30 | for sid in sheetids: 31 | assert sid in tabnames, (sid, "missing from found tables") 32 | assert len(sheetids) == len(tabnames), ("Number of selected tables disagrees", "len(sheetids) == len(tabnames)", len(sheetids), len(tabnames)) 33 | if len(set(tabnames)) != len(tabnames): 34 | warnings.warn("Duplicates found in table names list") 35 | return tabs 36 | 37 | DATABAKER_INPUT_FILE = None 38 | 39 | 40 | def getinputfilename(): 41 | """ Return DATABAKER_INPUT_FILE from os.environ or this module. 42 | 43 | This is so that DATABAKER_INPUT_FILE can be set in a notebook and then 44 | overridden by an environment variable when one is present. 45 | 46 | Environment variables are used because nbconvert doesn't allow you to 47 | easily pass arguments to the notebook. 48 | 49 | Use in a notebook is along the lines of: 50 | 51 | DATABAKER_INPUT_FILE = 'myfile.xls' 52 | f = getinputfilename() 53 | 54 | This way, we can set the filename in the notebook, or at the command line 55 | with environment variables. 56 | """ 57 | try: 58 | return os.environ['DATABAKER_INPUT_FILE'] 59 | except KeyError: 60 | return DATABAKER_INPUT_FILE
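# Typical use from a notebook, as a sketch (the filenames are illustrative):
#
#   DATABAKER_INPUT_FILE = 'myfile.xls'
#   tabs = loadxlstabs(getinputfilename())
#
# The same notebook can then be run against a different workbook without
# editing it, by overriding the name from the shell:
#
#   DATABAKER_INPUT_FILE=other.xls jupyter nbconvert --to html --execute recipe.ipynb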
61 | -------------------------------------------------------------------------------- /databaker/jupybakecsv.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # Reading and writing of the WDA technical CSV format 3 | 4 | import io, os, collections, re, warnings, csv, datetime 5 | import databaker.constants 6 | from databaker.jupybakeutils import ConversionSegment 7 | template = databaker.constants.template 8 | 9 | try: import pandas 10 | except ImportError: pandas = None # no pandas in pypy 11 | 12 | def HLDUPgenerate_header_row(numheaderadditionals): 13 | res = [ (k[0] if isinstance(k, tuple) else k) for k in template.headermeasurements ] 14 | for i in range(numheaderadditionals): 15 | for k in template.headeradditionals: 16 | if isinstance(k, tuple): 17 | sk = k[0] 18 | else: 19 | sk = k 20 | res.append("%s_%d" % (sk, i+1)) 21 | return res 22 | 23 | 24 | 25 | def Lyield_dimension_values(dval, isegmentnumber, Cheaderadditionals): 26 | for k in template.headermeasurements: 27 | if isinstance(k, tuple): 28 | yield dval.get(k[1], '') 29 | elif k == template.conversionsegmentnumbercolumn: 30 | yield isegmentnumber 31 | else: 32 | yield '' 33 | 34 | for dlab in Cheaderadditionals: 35 | for k in template.headeradditionals: 36 | if isinstance(k, tuple): 37 | if k[1] == "NAME": 38 | yield dlab 39 | else: 40 | assert k[1] == "VALUE" 41 | yield dval[dlab] 42 | else: 43 | yield '' 44 | 45 | 46 | def writetechnicalCSV(outputfile, conversionsegments): 47 | "Output the CSV in the bloated WDA format (takes a ConversionSegment or pandas DataFrame, or a list of them)" 48 | if not isinstance(conversionsegments, (list, tuple)): 49 | conversionsegments = [ conversionsegments ] 50 | 51 | if outputfile is not None: 52 | print("writing %d conversion segments into %s" % (len(conversionsegments), os.path.abspath(outputfile))) 53 | try: 54 | filehandle = open(outputfile, "w", newline='\n', encoding='utf-8') 55 | except TypeError: # this happens if you run in pypy2, because the newline parameter is not recognized 56 | filehandle = open(outputfile, "w") 57 | else: 58 | filehandle = io.StringIO() # to return as a string for print preview 59 | csv_writer = csv.writer(filehandle) 60 | row_count = 0 61 | 62 | for isegmentnumber, conversionsegment in enumerate(conversionsegments): 63 | if isegmentnumber == 0: # only the first segment gets a CSV header for the whole file (even if it is not consistent for the remaining segments) 64 | if isinstance(conversionsegment, ConversionSegment): 65 | Cheaderadditionals = [ dimension.label for dimension in conversionsegment.dimensions if dimension.label not in template.headermeasurementnamesSet ] 66 | assert len(Cheaderadditionals) == conversionsegment.numheaderadditionals 67 | elif pandas is not None: 68 | assert isinstance(conversionsegment, pandas.DataFrame), "function takes only ConversionSegments or pandas.DataFrames" 69 | if not isinstance(conversionsegment.index, pandas.RangeIndex): 70 | conversionsegment = conversionsegment.reset_index() # in case of playing around with indexes 71 | Cheaderadditionals = [colname for colname in conversionsegment.columns if colname not in template.headermeasurementnamesSet and colname[:2] != "__"] 72 | csv_writer.writerow(HLDUPgenerate_header_row(len(Cheaderadditionals))) 73 | 74 | if isinstance(conversionsegment, ConversionSegment): 75 | timeunitmessage = "" 76 | if conversionsegment.processedrows is None: 77 | timeunitmessage = conversionsegment.process() 78 | 79 | if outputfile is not None: 80 | print("conversionwrite segment size %d table '%s'; %s" % (len(conversionsegment.processedrows), conversionsegment.tab.name, timeunitmessage)) 81 | for row in conversionsegment.processedrows: 82 | csv_writer.writerow(Lyield_dimension_values(row, isegmentnumber, Cheaderadditionals)) 83 | row_count += 1 84 | 85 | else: # pandas.DataFrame case 86 | assert pandas is not None 87 | if outputfile is not None: 88 | print("pdconversionwrite segment size %d" % (len(conversionsegment))) 89 | for i in range(len(conversionsegment)): # quick and dirty reuse of the same dict-based function 90 | csv_writer.writerow(Lyield_dimension_values(dict(conversionsegment.iloc[i].dropna()), isegmentnumber, Cheaderadditionals)) 91 | row_count += 1 92 | 93 | csv_writer.writerow(["*"*9, row_count]) 94 | if outputfile is not None: 95 | filehandle.close() 96 | else: 97 | return filehandle.getvalue() 98 | 99 | 100 | 101 | def readtechnicalCSV(wdafile, bverbose=False, baspandas=True): 102 | "Read a WDA CSV back from its file into a lookup table from segment number to segments (each a list of dicts)" 103 | if baspandas and pandas is None: 104 | baspandas = False 105 | 106 | if isinstance(wdafile, str): 107 | if len(wdafile) > 200 and '\n' in wdafile: 108 | filehandle = io.StringIO(wdafile) 109 | else: 110 | filehandle = open(wdafile, "r", encoding='utf-8') 111 | else: 112 | assert isinstance(wdafile, io.StringIO) 113 | filehandle = wdafile 114 | 115 | wdain = csv.reader(filehandle) 116 | # First check that the headers are what we expect 117 | wdaheaders = wdain.__next__() 118 | numheaderadditionals = (len(wdaheaders) - len(template.headermeasurements))//len(template.headeradditionals) 119 | if not (wdaheaders == HLDUPgenerate_header_row(numheaderadditionals)): 120 | print("WDA headers don't match; 
nothing is likely to work now") 121 | 122 | wdasegments = { } # { segmentnumber: ( [ data_dicts ], [ordered_header_list] ) } 123 | previsegmentnumber = None 124 | segmentheaderssegmentL = [ ] # [ [ordered_header_list] ] 125 | 126 | for row in wdain: 127 | if row[0] == '*********': 128 | nrows = sum(len(wdasegment) for wdasegment, segmentheaders in wdasegments.values()) 129 | if int(row[1]) != nrows: 130 | warnings.warn("row number doesn't match %d should be %d" % (int(row[1]), nrows)) 131 | assert len(list(wdain)) == 0, "***** must be on last row" 132 | break 133 | 134 | dval = { } 135 | isegmentnumber = None 136 | for r, k in zip(row, template.headermeasurements): 137 | if isinstance(k, tuple): 138 | nk = k[1] 139 | if r: 140 | assert nk not in dval or dval[nk] == r 141 | dval[nk] = r 142 | else: 143 | assert not dval.get(nk) 144 | elif k == template.conversionsegmentnumbercolumn and r: 145 | isegmentnumber = int(r) 146 | else: 147 | assert not r 148 | 149 | lnumheaderadditionals = (len(row) - len(template.headermeasurements)) 150 | assert lnumheaderadditionals % len(template.headeradditionals) == 0 151 | numheaderadditionals = lnumheaderadditionals//len(template.headeradditionals) 152 | 153 | segmentheaderssegmentJ = [ ] 154 | for i in range(numheaderadditionals): 155 | rname, rvalue = None, None 156 | i0 = len(template.headermeasurements) + i*len(template.headeradditionals) 157 | for r, k in zip(row[i0:i0+len(template.headeradditionals)], template.headeradditionals): 158 | if isinstance(k, tuple): 159 | if k[1] == "NAME": 160 | assert rname is None or rname == r, (rname, r) 161 | rname = r 162 | else: 163 | assert k[1] == "VALUE" 164 | assert rvalue is None or rvalue == r 165 | rvalue = r 166 | else: 167 | assert not r 168 | assert rname, (rname, dval, row) 169 | dval[rname] = rvalue 170 | segmentheaderssegmentJ.append(rname) 171 | 172 | if isegmentnumber is None: 173 | if not segmentheaderssegmentL or segmentheaderssegmentL[-1] != segmentheaderssegmentJ: 174 | segmentheaderssegmentL.append(segmentheaderssegmentJ) 175 | isegmentnumber = len(segmentheaderssegmentL) - 1 176 | elif isegmentnumber in wdasegments: 177 | assert wdasegments[isegmentnumber][1] == segmentheaderssegmentJ 178 | 179 | if isegmentnumber not in wdasegments: 180 | if bverbose and previsegmentnumber is not None: 181 | print("segment %d loaded with %d rows" % (previsegmentnumber, len(wdasegments[previsegmentnumber][0]))) 182 | wdasegments[isegmentnumber] = ([ ], segmentheaderssegmentJ) 183 | 184 | wdasegments[isegmentnumber][0].append(dval) 185 | previsegmentnumber = isegmentnumber 186 | if bverbose and previsegmentnumber is not None: 187 | print("segment %d loaded with %d rows" % (previsegmentnumber, len(wdasegments[previsegmentnumber][0]))) 188 | filehandle.close() 189 | 190 | if not baspandas: 191 | return [ wdasegment for wdasegment, segmentheaders in wdasegments.values() ] 192 | 193 | res = [ ] 194 | for wdasegment, segmentheaders in wdasegments.values(): 195 | df = pandas.DataFrame.from_dict(wdasegment) 196 | 197 | # sort the columns (problem with using from_dict) 198 | dfcols = list(df.columns) 199 | newdfcols = [ ] 200 | for k in template.headermeasurements: 201 | if isinstance(k, tuple): 202 | if k[1] in dfcols: 203 | newdfcols.append(k[1]) 204 | dfcols.remove(k[1]) 205 | for segmentheader in segmentheaders: 206 | assert segmentheader in dfcols 207 | newdfcols.append(segmentheader) 208 | dfcols.remove(segmentheader) 209 | assert not dfcols, ("unexplained extra columns", dfcols) 210 | 211 | 
res.append(df[newdfcols]) # map the new column list in 212 | return res 213 | 214 | 215 | 216 | # code below should probably be deprecated, or at least upgraded to pandas comparison functionality 217 | 218 | # separated out so we can decide the severity of them before printing them out 219 | wdamsgstrings = { 220 | "WDAHEADERSINCONSISTENT": "Inconsistent extra headings in segment: %s", 221 | "WDAHEADERSMISSING": "Headings in segment not present in wda file: %s", 222 | "WDAHEADERSEXTRA": "Extra headings in wda file not in segment: %s", 223 | "WDACOLUMNNOTCONSTANT": "Constant column %s has multiple values: %s", 224 | "WDACOLUMNCONSTCHANGED": "Constant column %s is %s in segment but %s in wda file", 225 | "NEWVALUESINSEGMENT": "Unmatched new values in segment %s", 226 | "WDAEXTRAVALUES": "Unmatched extra values in wda file %s", 227 | "WDADUPLICATESMISMATCH": "Duplicates mismatch counts %s", 228 | "EXTRAWDACONVERSIONSEGMENTS": "Extra conversion segments in wda file %s", 229 | } 230 | 231 | def headersfromwdasegment(wdaseg, msglist): 232 | derivedheaders = [ databaker.constants.OBS ] + (template.SH_Create_ONS_time and [ databaker.constants.TIMEUNIT ] or []) + (databaker.constants.DATAMARKER and [databaker.constants.DATAMARKER] or []) 233 | headersunion = None 234 | headersintersection = None 235 | for wdarow in wdaseg: 236 | ahset = set(k for k in wdarow.keys() if k not in derivedheaders) 237 | if headersunion is None: 238 | headersintersection = set(ahset) 239 | headersunion = set(ahset) 240 | else: 241 | headersunion.update(ahset) 242 | headersintersection.intersection_update(ahset) 243 | if headersunion != headersintersection: 244 | msglist.append(("WDAHEADERSINCONSISTENT", headersunion.difference(headersintersection))) 245 | return headersintersection 246 | 247 | def extraheaderscheck(conversionsegment, wdaseg, msglist): 248 | wdaheaders = headersfromwdasegment(wdaseg, msglist) 249 | segmentheaders = set(c.label for c in conversionsegment.dimensions) 250 | extraheadersinsegment = segmentheaders.difference(wdaheaders) 251 | extraheadersinwdaseg = wdaheaders.difference(segmentheaders) 252 | if extraheadersinsegment: 253 | msglist.append(("WDAHEADERSMISSING", extraheadersinsegment)) 254 | if extraheadersinwdaseg: 255 | msglist.append(("WDAHEADERSEXTRA", extraheadersinwdaseg)) 256 | return wdaheaders.intersection(segmentheaders) 257 | 258 | def checktheconstantdimensions(conversionsegment, headers, wdaseg, msglist): 259 | for dimension in conversionsegment.dimensions: 260 | if dimension.label in headers: 261 | if dimension.hbagset is None: 262 | constval = dimension.cellvalueoverride.get(None) 263 | wdaconst = set(row.get(dimension.label) for row in wdaseg) 264 | if len(wdaconst) != 1: 265 | msglist.append(("WDACOLUMNNOTCONSTANT", (dimension.label, wdaconst))) 266 | elif constval not in wdaconst: 267 | msglist.append(("WDACOLUMNCONSTCHANGED", (dimension.label, constval, wdaconst.pop()))) 268 | headers.remove(dimension.label) 269 | return headers 270 | 271 | def checksegmentobsvalues(processedrows, headers, wdaseg, msglist): 272 | oheaders = [databaker.constants.OBS]+list(headers) 273 | 274 | # produce counts of each element in case there are duplicates (the order of the lists is not preserved) 275 | ccounts = collections.Counter(tuple(row.get(h) for h in oheaders) for row in processedrows) 276 | wcounts = collections.Counter(tuple(wrow.get(h) for h in oheaders) for wrow in wdaseg) 277 | cset = set(ccounts.keys()) 278 | wset = set(wcounts.keys()) 279 | 280 | cdiffextra = 
cset.difference(wset) 281 | sdiffextra = wset.difference(cset) 282 | if cdiffextra: 283 | msglist.append(("NEWVALUESINSEGMENT", cdiffextra)) 284 | if sdiffextra: 285 | msglist.append(("WDAEXTRAVALUES", sdiffextra)) 286 | 287 | dupmismatch = { } 288 | for s in cset.intersection(wset): 289 | if ccounts[s] != wcounts[s]: 290 | dupmismatch[s] = (ccounts[s], wcounts[s]) 291 | if dupmismatch: 292 | msglist.append(("WDADUPLICATESMISMATCH", dupmismatch)) 293 | 294 | 295 | def CompareConversionSegments(conversionsegments, wdafile, bprintwarnings): 296 | bverbose = True 297 | if type(conversionsegments) is ConversionSegment: 298 | conversionsegments = [conversionsegments] 299 | 300 | msglistperseg = { } 301 | wdasegs = readtechnicalCSV(wdafile, bverbose) 302 | extracsegs = [ c for c in wdasegs.keys() if not 0<=c\n') 46 | key.append('') 47 | for i, label, bag in tsubs: 48 | for h in bag: 49 | ixyheaderlookup[(h.x, h.y)] = i 50 | if blocalstylesheet: 51 | key.append('%s' % (i, label)) 52 | else: 53 | key.append('%s' % (i, colourlist.get(i,"white"), label)) 54 | key.append('') 55 | key.append('\n') 56 | 57 | 58 | sty = [ ] 59 | sty.append("\n\n") 77 | 78 | htm = [ ] 79 | htm.append('\n') 80 | htm.append('\n' % tab.name) 81 | for row in tab.rows(): 82 | htm.append("") 83 | assert len(row) == tab._max_x + 1 84 | rrow = sorted(row, key=lambda X: X.x) 85 | for c in rrow: 86 | ih = ixyheaderlookup.get((c.x, c.y)) 87 | if blocalstylesheet: 88 | cs = [ ] 89 | if ih is not None: cs.append("xc%s" % ih) 90 | if c.properties.cell.sheet.book.font_list: # overcome bug in messytables caused by https://www.communities-ni.gov.uk/sites/default/files/publications/communities/ni-housing-stats-15-16-tables1.xlsx 91 | if c.properties.get_bold(): cs.append("xb") 92 | if c.is_number(): cs.append("xn") 93 | htm.append('") 115 | htm.append("\n") 116 | htm.append("
%s
' % (" ".join(cs), c.x, c.y)) 94 | else: 95 | ls = [ ] 96 | if ih is not None: ls.append("background-color:%s" % colourlist.get(ih,"white")) 97 | if c.properties.cell.sheet.book.font_list: 98 | if c.properties.get_bold(): ls.append("font-weight:bold") 99 | lss = ' style="%s"' % ";".join(ls) if ls else '' 100 | htm.append('' % (lss, c.x, c.y)) 101 | 102 | if (c.x, c.y) in consolidatedcellvalueoverride: 103 | prevcellval = svalue(c) or "*blank*" # want to see empty cells that have been overwritten 104 | overridecellval = consolidatedcellvalueoverride[(c.x, c.y)] 105 | if blocalstylesheet: 106 | htm.append('%s%s' % (prevcellval, overridecellval)) 107 | else: 108 | htm.append('%s%s' % (prevcellval, overridecellval)) 109 | else: 110 | htm.append(svalue(c)) 111 | 112 | if (c.x, c.y) in consolidatedcellvalueoverride: 113 | consolidatedcellvalueoverride 114 | htm.append("
\n") 117 | 118 | jsty = "".join(sty) 119 | jkey = "".join(key) 120 | jhtm = "".join(htm) 121 | return "%s\n%s\n%s\n" % (jsty, jkey, jhtm) 122 | 123 | jscode = """ 124 | 152 | """ 153 | 154 | # generate the lookup table from titles to references 155 | def calcjslookup(conversionsegment): 156 | obslist = list(conversionsegment.segment.unordered_cells) # list(segment) otherwise gives bags of one element 157 | 158 | # this is where we could check/override the lookup values in some way 159 | dimvalues = [ [ hdim.cellvalobs(ob)[0] for hdim in conversionsegment.dimensions if hdim.hbagset is not None ] for ob in obslist ] 160 | jslookup = '{%s}' % ",".join('"%d %d":[%s]' % (k.x, k.y, ",".join("%d,%d" % (d.x, d.y) for d in tup if d)) \ 161 | for k, tup in zip(obslist, dimvalues)) 162 | return jslookup 163 | 164 | 165 | # could do this as a html-frame and reload 166 | def sidewindowhtmldisplay(): 167 | sjs = ''' 168 | 177 | ''' 178 | display(HTML(sjs % dividNUM)) 179 | 180 | 181 | def savepreviewhtml(conversionsegment, fname=None, verbose=True): 182 | "Preview a highlighted table, cellbag, dimension, list of bags or ConversionSegment inline or into a secondary html file" 183 | # wrap a singleton or list of bags, tables and HDims to a ConversionSegment 184 | if not isinstance(conversionsegment, ConversionSegment): 185 | param1 = conversionsegment 186 | if not isinstance(param1, (tuple, list)): 187 | param1 = [param1] 188 | 189 | tab = None 190 | dimensions = [ ] 191 | for i, p in enumerate(param1): 192 | if "Table" in str(type(p)): 193 | ltab = p 194 | lhdim = None 195 | elif isinstance(p, HDim): 196 | ltab = p.hbagset.table 197 | lhdim = p 198 | else: 199 | ltab = p.table 200 | lhdim = HDim(p, "item %d"%i, databaker.constants.DIRECTLY, databaker.constants.ABOVE) # (fake lookup) 201 | 202 | if not tab: 203 | tab = ltab 204 | else: 205 | assert ltab is tab, "must all be same table" 206 | 207 | if lhdim: 208 | dimensions.append(lhdim) 209 | 210 | conversionsegment = ConversionSegment(tab, dimensions, []) 211 | 212 | # now we have a ConversionSegment 213 | incrementdividNUM() 214 | if fname is None: 215 | fout = io.StringIO() 216 | blocalstylesheet = not (len(conversionsegment.tab) < 1500) 217 | else: 218 | fout = io.open(fname, "w", encoding='utf-8') 219 | fout.write("\n%s\n\n" % conversionsegment.tab.name) 220 | blocalstylesheet = True 221 | 222 | htmtable = tabletohtml(conversionsegment.tab, conversionsegment.dsubsets(), conversionsegment.consolidatedcellvalueoverride(), blocalstylesheet) 223 | fout.write('
\n' % (dividNUM)) 224 | fout.write(htmtable) 225 | fout.write('
\n') 226 | 227 | if fname is not None and verbose: 228 | print("tablepart '%s' written #%s" % (conversionsegment.tab.name, dividNUM)) 229 | if conversionsegment.dimensions and conversionsegment.segment: 230 | jslookup = calcjslookup(conversionsegment) 231 | if fname is not None and verbose: 232 | print("javascript calculated") 233 | fout.write(jscode % (jslookup, dividNUM)) 234 | 235 | if fname is None: 236 | display(HTML(fout.getvalue())) 237 | else: 238 | fout.write("\n") 239 | fout.close() 240 | local_file = FileLink(path=os.path.basename(fname), 241 | result_html_prefix="Written to file: ") 242 | display(local_file) 243 | -------------------------------------------------------------------------------- /databaker/jupybakeutils.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # HTML preview of the dimensions and table (will be moved to a function in databakersolo) 3 | 4 | import io, os, collections, re, warnings, csv, datetime 5 | import databaker.constants 6 | import xypath 7 | from databaker import richxlrd 8 | template = databaker.constants.template 9 | 10 | try: import pandas 11 | except ImportError: pandas = None # no pandas in pypy 12 | 13 | def svalue(cell): 14 | if not isinstance(cell.value, datetime.datetime): 15 | return str(cell.value) 16 | # the fmt string is some excel generated garbage format, like: '[$-809]dd\\ mmmm\\ yyyy;@' 17 | # the xlrd module does its best and creates a date tuple, which messytables constructs into a datetime using xldate_as_tuple() 18 | xls_format = cell.properties['formatting_string'].upper() 19 | quarter = int((cell.value.month -1 ) // 3) + 1 20 | if 'Q' in xls_format: py_format = "%Y Q{quarter}" # may be very rare 21 | elif 'D' in xls_format: py_format = "%Y-%m-%d" 22 | elif 'M' in xls_format: py_format = "%b %Y" 23 | elif 'Y' in xls_format: py_format = "%Y" 24 | else: py_format = "%Y-%m-%d" 25 | return cell.value.strftime(py_format).format(quarter=quarter) 26 | 27 | 28 | class HDim: 29 | "Dimension object which defines the lookup between an observation cell and a bag of header cells" 30 | def __init__(self, hbagset, label, strict=None, direction=None, cellvalueoverride=None): 31 | self.label = label 32 | self.name = label 33 | 34 | self.cellvalueoverride = cellvalueoverride or {} # do not put {} into default value otherwise there is only one static one for everything 35 | assert not isinstance(hbagset, str), "Use empty set and default value for single value dimension" 36 | self.hbagset = hbagset 37 | self.bhbagsetCopied = False 38 | 39 | if self.hbagset is None: # single value type 40 | assert direction is None and strict is None 41 | assert len(cellvalueoverride) == 1 and None in cellvalueoverride, "single value type should have cellvalueoverride={None:defaultvalue}" 42 | return 43 | 44 | assert isinstance(self.hbagset, xypath.xypath.Bag), "dimension should be made from xypath.Bag type, not %s" % type(self.hbagset) 45 | self.strict = strict 46 | self.direction = direction 47 | assert direction is not None and strict is not None 48 | 49 | self.bxtype = (self.direction[1] == 0) 50 | self.samerowlookup = None 51 | 52 | 53 | def celllookup(self, scell): 54 | "Lookup function from a given cell to the matching header cell" 55 | 56 | # caching that can be removed in AddCellValueOverride 57 | if self.strict and self.samerowlookup is None: 58 | self.samerowlookup = {} 59 | for hcell in self.hbagset.unordered_cells: 60 | k = hcell.y if self.bxtype else hcell.x 61 | if k not in 
self.samerowlookup: 62 | self.samerowlookup[k] = [] 63 | self.samerowlookup[k].append(hcell) 64 | 65 | def mult(cell): 66 | return cell.x * self.direction[0] + cell.y * self.direction[1] 67 | def dgap(cell, target_cell): 68 | if self.direction[1] == 0: 69 | return abs(cell.x - target_cell.x) 70 | return abs(cell.y - target_cell.y) 71 | 72 | def betweencells(scell, target_cell, best_cell): 73 | if mult(scell) <= mult(target_cell): 74 | if not best_cell or mult(target_cell) <= mult(best_cell): 75 | return True 76 | return False 77 | 78 | def same_row_col(a, b): 79 | return (a.x - b.x == 0 and self.direction[0] == 0) or (a.y - b.y == 0 and self.direction[1] == 0) 80 | 81 | if self.strict: 82 | hcells = self.samerowlookup.get(scell.y if self.bxtype else scell.x, []) 83 | else: 84 | hcells = self.hbagset.unordered_cells 85 | 86 | 87 | best_cell = None 88 | second_best_cell = None 89 | 90 | #if strict: print(len(list(hcells)), len(list(hbagset.unordered_cells))) 91 | for target_cell in hcells: 92 | if betweencells(scell, target_cell, best_cell): 93 | if not self.strict or same_row_col(scell, target_cell): 94 | second_best_cell = best_cell 95 | best_cell = target_cell 96 | if second_best_cell and mult(best_cell) == mult(second_best_cell): 97 | raise xypath.LookupConfusionError("{!r} is as good as {!r} for {!r}".format(best_cell, second_best_cell, scell)) 98 | if best_cell is None: 99 | return None 100 | return best_cell 101 | 102 | def headcellval(self, hcell): 103 | "Extract the string value of a member header cell (including any value overrides)" 104 | if hcell is not None: 105 | assert isinstance(hcell, xypath.xypath._XYCell), "celllookups should only go to an _XYCell" 106 | if hcell in self.cellvalueoverride: 107 | val = self.cellvalueoverride[hcell] 108 | assert isinstance(val, (str, float, int)), "Override from hcell value should go directly to a str,float,int,None-value (%s)" % type(val) 109 | return val 110 | val = svalue(hcell) 111 | #assert val is None or isinstance(val, (str, float, int)), "cell value should only be str,float,int,None (%s)" % type(val) 112 | else: 113 | val = None 114 | 115 | # It's allowed to have {None:defaultvalue} to set the NoLookupValue 116 | if val in self.cellvalueoverride: 117 | val = self.cellvalueoverride[val] 118 | assert val is None or isinstance(val, (str, float, int)), "Override from value should only be str,float,int,None (%s)" % type(val) 119 | 120 | # type call if no other things match 121 | elif type(val) in self.cellvalueoverride: 122 | val = self.cellvalueoverride[type(val)](val) 123 | 124 | return val 125 | 126 | 127 | def cellvalobs(self, ob): 128 | "Full lookup from an observation cell to its dimensional value (overrides can apply before the lookup)" 129 | if isinstance(ob, xypath.xypath.Bag): 130 | assert len(ob) == 1, "Can only lookupobs a single cell" 131 | ob = ob._cell 132 | assert isinstance(ob, xypath.xypath._XYCell), "Lookups only allowed on an obs cell" 133 | 134 | # we do two steps through cellvalueoverride in three places on mutually distinct sets (obs, heading, strings) 135 | # and not recursively as these are wholly different applications. 
the celllookup is itself like a cellvalueoverride 136 | if ob in self.cellvalueoverride: 137 | val = self.cellvalueoverride[ob] # knock out an individual obs for this cell 138 | assert isinstance(val, str), "Override from obs should go directly to a string-value" 139 | return None, val 140 | 141 | if self.hbagset is not None: 142 | hcell = self.celllookup(ob) 143 | else: 144 | hcell = None 145 | 146 | return hcell, self.headcellval(hcell) 147 | 148 | def AddCellValueOverride(self, overridecell, overridevalue): 149 | "Override the value of a header cell (and insert it if not present in the bag)" 150 | if isinstance(overridecell, str): 151 | self.cellvalueoverride[overridecell] = overridevalue 152 | return 153 | if overridecell is None: 154 | self.cellvalueoverride[overridecell] = overridevalue 155 | return 156 | if isinstance(overridecell, xypath.xypath.Bag): 157 | assert len(overridecell) == 1, "Can only override a single cell" 158 | overridecell = overridecell._cell 159 | assert isinstance(overridecell, xypath.xypath._XYCell), "Overrides only allowed on a single header cell" 160 | 161 | # add the cell into the base set of cells if it's new 162 | if overridecell not in self.hbagset.unordered_cells: 163 | if not self.bhbagsetCopied: 164 | self.hbagset = self.hbagset | (self.hbagset.by_index(1) if len(self.hbagset) else self.hbagset) # force copy by adding element from itself 165 | self.bhbagsetCopied = True # avoid inefficient copying every single time 166 | self.hbagset.add(overridecell) 167 | self.samerowlookup = None # abolish any caching 168 | else: 169 | if overridecell in self.cellvalueoverride: 170 | if self.cellvalueoverride[overridecell] != overridevalue: 171 | warnings.warn("Cell %s was already overridden by value %s; is this a mistake?" % (overridecell, self.cellvalueoverride[overridecell])) 172 | 173 | assert overridevalue is None or isinstance(overridevalue, (str, float, int)), "Override from value should only be str,float,int,None (%s)" % type(overridevalue) 174 | self.cellvalueoverride[overridecell] = overridevalue 175 | 176 | def discardcellsnotlookedup(self, obs): 177 | "Remove header cells that none of the observation cells look up to" 178 | hbagsetT = xypath.xypath.Bag(self.hbagset.table) 179 | for ob in obs.unordered_cells: 180 | hbagsetT.add(self.celllookup(ob)) 181 | self.hbagset = hbagsetT 182 | 183 | def valueslist(self): 184 | "List of all the header cell values" 185 | return [self.headcellval(cell) for cell in sorted(self.hbagset.unordered_cells, key=lambda cell: (cell.y, cell.x))] 186 | 187 | def checkvalues(self, vlist): 188 | "Check that the header cell values match" 189 | scells = sorted(self.hbagset.unordered_cells, key=lambda cell: (cell.y, cell.x)) 190 | if len(scells) != len(vlist): 191 | warnings.warn("checkvalues list length doesn't match") 192 | return False 193 | 194 | for cell, v in zip(scells, vlist): 195 | nv = self.headcellval(cell) 196 | if nv != v: 197 | warnings.warn("checkvalues mismatch in cell (%d,%d) cell value '%s' doesn't match '%s'" % (cell.x, cell.y, nv, v)) 198 | return False 199 | return True 200 | 201 | 202 | def HDimConst(name, val): 203 | "Define a constant value dimension across the whole segment" 204 | return HDim(None, name, cellvalueoverride={None:val}) 205 | 206 | 207 | def Ldatetimeunitloose(date): 208 | if not isinstance(date, str): 209 | if isinstance(date, (float, int)) and 1000<=date<=9999 and int(date)==date: 210 | return "Year" 211 | return '' 212 | d = date.strip() 213 | if re.match('\d{4}(?:\.0)?$', d): 214 | return 'Year' 
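# A rough sketch of the classification the remaining patterns perform (the
# example inputs are illustrative, not taken from any particular sheet):
#   >>> Ldatetimeunitloose("2014 Q3"), Ldatetimeunitloose("Q3 2014")
#   ('Quarter', 'Quarter')
#   >>> Ldatetimeunitloose("Jul-Sep 2014")
#   'Quarter'
#   >>> Ldatetimeunitloose("Jul 2014")
#   'Month'
# Anything unrecognized falls through to '' (no identifiable time unit).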
215 | if re.match('\d{4}(?:\.0)?\s*[Qq]\d$', d): 216 | return 'Quarter' 217 | if re.match('[Qq]\d\s*\d{4}(?:\.0)?$', d): 218 | return 'Quarter' 219 | if re.match('[A-Za-z]{3}-[A-Za-z]{3}\s*\d{4}(?:\.0)?$', d): 220 | return 'Quarter' 221 | if re.match('[A-Za-z]{3}\s*\d{4}(?:\.0)?$', d): 222 | return 'Month' 223 | return '' 224 | 225 | def Ldatetimeunitforce(st, timeunit): 226 | st = str(st).strip() 227 | if timeunit == 'Year': 228 | mst = re.match("(\d\d\d\d)(?:\.0)?$", st) 229 | if mst: 230 | return mst.group(1) 231 | 232 | elif timeunit == "Quarter": 233 | mq1 = re.match('(\d{4})(?:\.0)?\s*[Qq](\d)', st) 234 | mq2 = re.match('([A-Za-z]{3}-[A-Za-z]{3})\s*(\d{4})', st) 235 | mq3 = re.match('[Qq](\d)\s*(\d{4})', st) 236 | if mq1: 237 | return "%s Q%s" % (mq1.group(1), mq1.group(2)) 238 | if mq2: 239 | return "%s %s" % (mq2.group(1), mq2.group(2)) 240 | if mq3: 241 | return "%s Q%s" % (mq3.group(2), mq3.group(1)) 242 | 243 | elif timeunit == "Month": 244 | mm1 = re.match('\s*([A-Za-z]{3})\s*(\d{4})', st) 245 | if mm1: 246 | return "%s %s" % (mm1.group(1), mm1.group(2)) 247 | elif timeunit == "": 248 | return st 249 | else: 250 | timeunit = "unknown:%s" % timeunit 251 | warnings.warn("TIME %s disagrees with TIMEUNIT %s" % (st, timeunit)) 252 | return st 253 | 254 | 255 | def HLDUPgenerate_header_row(numheaderadditionals): 256 | res = [ (k[0] if isinstance(k, tuple) else k) for k in template.headermeasurements ] 257 | for i in range(numheaderadditionals): 258 | for k in template.headeradditionals: 259 | if isinstance(k, tuple): 260 | sk = k[0] 261 | else: 262 | sk = k 263 | res.append("%s_%d" % (sk, i+1)) 264 | return res 265 | 266 | 267 | class ConversionSegment: 268 | "Single output table object generated from a bag of observations that look up to a list of dimensions" 269 | def __init__(self, observations, dimensions, Lobservations=None, processTIMEUNIT=True, includecellxy=False): 270 | if Lobservations is None: # new format that drops the unnecessary table element 271 | tab = observations.table 272 | Lobservations = observations 273 | else: 274 | tab = observations # old function format 275 | 276 | self.tab = tab 277 | self.dimensions = dimensions 278 | self.segment = Lobservations # original name for observations list 279 | 280 | self.processtimeunit = processTIMEUNIT 281 | self.includecellxy = includecellxy 282 | 283 | for dimension in self.dimensions: 284 | assert isinstance(dimension, HDim), ("Dimensions must have type HDim()") 285 | assert dimension.hbagset is None or dimension.hbagset.table is tab, "dimension %s from different tab" % dimension.name 286 | 287 | self.numheaderadditionals = sum(1 for dimension in self.dimensions if dimension.label not in template.headermeasurementnamesSet) 288 | 289 | # generate the ordered obslist here (so it is fixed here and can be reordered before processing) 290 | if isinstance(self.segment, xypath.xypath.Bag): 291 | assert self.segment.table is tab, "segments from different tab" 292 | self.obslist = list(self.segment.unordered_cells) # list(segment) otherwise gives bags of one element 293 | self.obslist.sort(key=lambda cell: (cell.y, cell.x)) 294 | else: 295 | assert isinstance(self.segment, (tuple, list)), "segment needs to be a Bag or a list, not a %s" % type(self.segment) 296 | self.obslist = self.segment 297 | 298 | # holding place for output of processing. 
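# (A minimal usage sketch with illustrative names, not part of the class:
#    cs = ConversionSegment(observations, dimensions)
#    msg = cs.process()     # fills cs.processedrows, returns a TIMEUNIT note
#    df = cs.topandas()     # or convert the processed rows to a DataFrame
# )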
299 | # technically no reason we shouldn't process at this point either, on this constructor, 300 | # but doing it in stages allows for interventions along the way 301 | self.processedrows = None 302 | 303 | 304 | # used in tabletohtml for the subsets, and where we would find the mappings for over-ride values 305 | def dsubsets(self): 306 | tsubs = [ ] 307 | if self.segment: 308 | tsubs.append((0, "OBS", self.segment)) 309 | for i, dimension in enumerate(self.dimensions): 310 | if dimension.hbagset is not None: # filter out TempValue headers 311 | tsubs.append((i+1, dimension.name, dimension.hbagset)) 312 | return tsubs 313 | 314 | # also used in tabletohtml, to flag the header cells whose values have been overridden 315 | def consolidatedcellvalueoverride(self): 316 | res = { } 317 | for i, dimension in enumerate(self.dimensions): 318 | if dimension.hbagset is not None: # filter out TempValue headers 319 | for hcell in dimension.hbagset.unordered_cells: 320 | sval = svalue(hcell) 321 | val = hcell.value 322 | if hcell in dimension.cellvalueoverride: 323 | val = str(dimension.cellvalueoverride[hcell]) 324 | elif sval in dimension.cellvalueoverride: 325 | val = str(dimension.cellvalueoverride[sval]) 326 | elif type(hcell.value) in dimension.cellvalueoverride: 327 | val = str(dimension.cellvalueoverride[type(hcell.value)](hcell.value)) 328 | else: 329 | val = sval 330 | if val != sval: 331 | res[(hcell.x, hcell.y)] = val 332 | return res 333 | 334 | # individual lookup across the dimensions here 335 | def lookupobs(self, ob): 336 | if isinstance(ob, xypath.xypath.Bag): 337 | assert len(ob) == 1, "Can only lookupobs on a single cell" 338 | ob = ob._cell 339 | 340 | # force it to be float and split off anything not float into the datamarker 341 | if not isinstance(ob.value, float): 342 | if ob.properties['richtext']: # should this case be implemented into the svalue() function? 
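# richtext cells are read through richxlrd so that superscript note-markers
# are dropped from the value; plain cells go through svalue(). The
# SH_Split_OBS branch below then divides a non-numeric observation such as
# "12.3p" (an illustrative value) into OBS=12.3 and DATAMARKER="p" using the
# leading-number regex.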
343 | sval = richxlrd.RichCell(ob.properties.cell.sheet, ob.y, ob.x).fragments.not_script.value 344 | else: 345 | sval = svalue(ob) 346 | 347 | if template.SH_Split_OBS: 348 | assert template.SH_Split_OBS == databaker.constants.DATAMARKER, (template.SH_Split_OBS, databaker.constants.DATAMARKER) 349 | ob_value, dm_value = re.match(r"([-+]?[0-9]+\.?[0-9]*)?(.*)", sval).groups() 350 | dval = { } 351 | if dm_value: 352 | dval[template.SH_Split_OBS] = dm_value 353 | if ob_value: 354 | dval[databaker.constants.OBS] = float(ob_value) 355 | else: 356 | dval[databaker.constants.OBS] = "" 357 | else: 358 | dval = { databaker.constants.OBS:sval } 359 | else: 360 | dval = { databaker.constants.OBS:ob.value } 361 | 362 | for hdim in self.dimensions: 363 | hcell, val = hdim.cellvalobs(ob) 364 | dval[hdim.label] = val 365 | 366 | if self.includecellxy: 367 | dval["__x"] = ob.x 368 | dval["__y"] = ob.y 369 | dval["__tablename"] = self.tab.name 370 | return dval 371 | 372 | def guesstimeunit(self): 373 | for dval in self.processedrows: 374 | dval[template.TIMEUNIT] = Ldatetimeunitloose(dval[template.TIME]) 375 | ctu = collections.Counter(dval[template.TIMEUNIT] for dval in self.processedrows) 376 | if len(ctu) == 1: 377 | return "TIMEUNIT='%s'" % list(ctu.keys())[0] 378 | return "multiple TIMEUNITs: %s" % ", ".join("'%s'(%d)" % (k,v) for k,v in ctu.items()) 379 | 380 | def fixtimefromtimeunit(self): # this works individually and not across the whole segment homogeneously 381 | for dval in self.processedrows: 382 | dval[template.TIME] = Ldatetimeunitforce(dval[template.TIME], dval[template.TIMEUNIT]) 383 | 384 | def process(self): 385 | assert self.processedrows is None, "Conversion segment already processed" 386 | self.processedrows = [ self.lookupobs(ob) for ob in self.obslist ] 387 | 388 | kdim = dict((dimension.label, dimension) for dimension in self.dimensions) 389 | timeunitmessage = "" 390 | if self.processtimeunit: 391 | if template.SH_Create_ONS_time and ((template.TIMEUNIT not in kdim) and (template.TIME in kdim)): 392 | timeunitmessage = self.guesstimeunit() 393 | self.fixtimefromtimeunit() 394 | elif template.TIME in kdim and template.TIMEUNIT not in kdim: 395 | self.fixtimefromtimeunit() 396 | return timeunitmessage 397 | 398 | 399 | def topandas(self): 400 | if pandas is None: 401 | warnings.warn("Sorry, you do not have pandas installed in this environment") 402 | return None 403 | 404 | timeunitmessage = "" 405 | if self.processedrows is None: 406 | timeunitmessage = self.process() 407 | print(timeunitmessage) 408 | df = pandas.DataFrame.from_dict(self.processedrows) 409 | 410 | # sort the columns 411 | dfcols = list(df.columns) 412 | newdfcols = [ ] 413 | for k in template.headermeasurements: 414 | if isinstance(k, tuple): 415 | if k[1] in dfcols: 416 | newdfcols.append(k[1]) 417 | dfcols.remove(k[1]) 418 | for dimension in self.dimensions: 419 | if dimension.label not in template.headermeasurementnamesSet: 420 | assert dimension.label in dfcols 421 | newdfcols.append(dimension.label) 422 | dfcols.remove(dimension.label) 423 | 424 | for excol in ["__x", "__y", "__tablename"]: 425 | if excol in dfcols: 426 | if self.includecellxy: 427 | newdfcols.append(excol) 428 | dfcols.remove(excol) 429 | assert not dfcols, ("unexplained extra columns", dfcols) 430 | 431 | df = df[newdfcols] # map the new column list in 432 | return df 433 | 434 | def pdguessforceTIMEUNIT(df): 435 | df["TIMEUNIT"] = df.apply(lambda row: Ldatetimeunitloose(row.TIME), axis=1) 436 | df["TIME"] = df.apply(lambda row: 
Ldatetimeunitforce(row.TIME, row.TIMEUNIT), axis=1) 437 | 438 | 439 | -------------------------------------------------------------------------------- /databaker/overrides.py: -------------------------------------------------------------------------------- 1 | """ 2 | Patches xypath and messytables. 3 | """ 4 | 5 | import re 6 | import datetime 7 | import warnings 8 | 9 | import xypath 10 | import messytables 11 | 12 | class MatchNotFound(Exception): 13 | """failed to find match in bag.group""" 14 | pass 15 | 16 | # === Cell Overrides ====================================== 17 | 18 | def cell_repr(cell): 19 | column = xypath.contrib.excel.excel_column_label(cell.x+1) 20 | return "<{}{} {!r}>".format(column, cell.y+1, cell.value) 21 | xypath.xypath._XYCell.__repr__ = cell_repr 22 | 23 | # === TableSet Overrides ================================== 24 | 25 | @property 26 | def tabnames(tableset): 27 | return set(x.name for x in tableset.tables) 28 | messytables.TableSet.names = tabnames 29 | 30 | # === Table Overrides ===================================== 31 | 32 | def excel_ref(table, reference): 33 | if ':' not in reference: 34 | (col, row) = xypath.contrib.excel.excel_address_coordinate(reference, partial=True) 35 | return table.get_at(col, row) 36 | else: 37 | ((left, top), (right, bottom)) = xypath.contrib.excel.excel_range(reference) 38 | bag = xypath.Bag(table=table) 39 | if top is None and bottom is None: 40 | for col in range(left, right + 1): 41 | bag = bag | table.get_at(col, None) 42 | elif left is None and right is None: 43 | for row in range(top, bottom + 1): 44 | bag = bag | table.get_at(None, row) 45 | else: 46 | for row in range(top, bottom + 1): 47 | for col in range(left, right + 1): 48 | bag = bag | table.get_at(col, row) 49 | return bag 50 | xypath.Table.excel_ref = excel_ref 51 | 52 | # copied in just for one function to enable deletion of utils.py 53 | def Ddatematch(date, silent=False): 54 | """match mmm yyyy, mmm-mmm yyyy, yyyy Qn, yyyy""" 55 | if not isinstance(date, str): 56 | if (isinstance(date, float) or isinstance(date, int)) and date>=1000 and date<=9999 and int(date)==date: 57 | return "Year" 58 | if not silent: 59 | warnings.warn("Couldn't identify date {!r}".format(date)) 60 | return '' 61 | d = date.strip() 62 | if re.match('\d{4}$', d): 63 | return 'Year' 64 | if re.match('\d{4} [Qq]\d$', d): 65 | return 'Quarter' 66 | if re.match('[A-Za-z]{3}-[A-Za-z]{3} \d{4}$', d): 67 | return 'Quarter' 68 | if re.match('[A-Za-z]{3} \d{4}$', d): 69 | return 'Month' 70 | if not silent: 71 | warnings.warn("Couldn't identify date {!r}".format(date)) 72 | return '' 73 | 74 | # === Bag Overrides ======================================= 75 | 76 | xypath.Bag.regex = lambda self, x: self.filter(re.compile(x)) 77 | 78 | def is_date(bag): 79 | return bag.filter(lambda cell: Ddatematch(cell.value, silent=True)) 80 | xypath.Bag.is_date = is_date 81 | 82 | def is_number(bag): 83 | return bag.filter(lambda cell: isinstance(cell.value, (int, float, int))) 84 | xypath.Bag.is_number = is_number 85 | def is_not_number(bag): 86 | return bag.filter(lambda cell: not isinstance(cell.value, (int, float, int))) 87 | xypath.Bag.is_not_number = is_not_number 88 | 89 | def group(bag, regex): 90 | """get the text""" 91 | bag.assert_one() 92 | match = re.search(regex, bag.value) 93 | if not match: 94 | raise MatchNotFound("Can't find {!r} in {!r}".format(regex, bag.value)) 95 | matchtext = match.groups(0)[0] 96 | assert matchtext 97 | return matchtext 98 | xypath.Bag.group = group 99 | 100 | def 
one_of(bag, options): 101 | output = None 102 | for option in options: 103 | if output is None: 104 | output = bag.filter(option) 105 | else: 106 | output = output | bag.filter(option) 107 | return output 108 | xypath.Bag.one_of = one_of 109 | 110 | def parent(bag): 111 | """for cell, get its top-left cell""" 112 | output_bag = xypath.Bag(table = bag.table) 113 | for cell in bag.unordered: 114 | row, _, col, _ = cell.properties.raw_span(always=True) 115 | output_bag.add(cell.table.get_at(col, row)._cell) 116 | return output_bag 117 | xypath.Bag.parent = parent 118 | 119 | def children(bag): 120 | """for top-left cell, get all cells it spans""" 121 | outputbag = xypath.Bag(table=bag.table) 122 | for parent in bag: 123 | top, bottom, left, right = parent.properties.raw_span(always=True) 124 | for row in range(top, bottom + 1): 125 | for col in range(left, right + 1): 126 | outputbag = outputbag | bag.table.get_at(col, row) 127 | return outputbag 128 | xypath.Bag.children = children 129 | 130 | def rich_text(bag): 131 | r = bag.property.rich 132 | return r 133 | xypath.Bag.rich_text = rich_text 134 | 135 | def spaceprefix(bag, count): 136 | """filter: cells starting with exactly count whitespace: no more, no less""" 137 | return bag.filter(re.compile("^\s{%s}\S" % count)) 138 | xypath.Bag.spaceprefix = spaceprefix 139 | 140 | def is_whitespace(bag): 141 | """filter: cells which do not contain printable characters""" 142 | return bag.filter(lambda cell: not str(cell.value).strip()) 143 | xypath.Bag.is_whitespace = is_whitespace 144 | 145 | def is_not_whitespace(bag): 146 | """filter: cells which do contain printable characters""" 147 | return bag.filter(lambda cell: str(cell.value).strip()) 148 | xypath.Bag.is_not_whitespace = is_not_whitespace 149 | 150 | def by_index(bag, items): 151 | """filter: return numbered items from a bag. 152 | Note that this is 1-indexed! 153 | Items can be a list or a single number""" 154 | if isinstance(items, int): 155 | return bag.by_index([items]) 156 | new = xypath.Bag(table=bag.table) 157 | for i, cell in enumerate(bag): 158 | if i+1 in items: 159 | new.add(cell._cell) 160 | if i+1 == max(items): 161 | return new 162 | raise xypath.XYPathError("get_nth needed {} items, but bag only contained {}.\n{!r}".format(max(items), len(bag), bag)) 163 | xypath.Bag.by_index = by_index 164 | 165 | -------------------------------------------------------------------------------- /databaker/richxlrd/__init__.py: -------------------------------------------------------------------------------- 1 | from .richxlrd import RichCell, Fragments, Fragment 2 | -------------------------------------------------------------------------------- /databaker/richxlrd/rich.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cantabular/databaker/930b65e85038a3afd59574bbdc1d4cb79ce9b6ad/databaker/richxlrd/rich.xls -------------------------------------------------------------------------------- /databaker/richxlrd/richxlrd.py: -------------------------------------------------------------------------------- 1 | import xlrd 2 | 3 | "Horrid workaround! 
Can get 'no' every time from LibreOffice xls" 4 | @property 5 | def _bold(self): 6 | return self.weight > 500 7 | xlrd.formatting.Font._bold = _bold 8 | 9 | @property 10 | def _script(self): 11 | return self.escapement 12 | xlrd.formatting.Font.script = _script 13 | 14 | 15 | class RichCell(object): 16 | def __init__(self, sheet, y, x): 17 | self.sheet = sheet 18 | self.y = y 19 | self.x = x 20 | 21 | @property 22 | def cell(self): 23 | return self.sheet.cell(self.y, self.x) 24 | 25 | @property 26 | def raw_fontlist(self): 27 | """the position of a font change, and the new font code. 28 | note that it doesn't include the first font!""" 29 | return self.sheet.rich_text_runlist_map.get((self.y, self.x), []) 30 | 31 | @property 32 | def first_font(self): 33 | """the first font number""" 34 | xf = self.cell.xf_index 35 | return self.sheet.book.xf_list[xf].font_index 36 | 37 | @property 38 | def fontlist(self): 39 | full_fontlist = list(self.raw_fontlist) 40 | full_fontlist.insert(0, (0, self.first_font)) 41 | return list((pos, self.sheet.book.font_list[font]) for pos, font in full_fontlist) 42 | 43 | @property 44 | def fragments(self): 45 | fontlist = self.fontlist 46 | output = Fragments() 47 | for i, (start, font) in enumerate(fontlist): 48 | try: 49 | end = fontlist[i+1][0] 50 | except IndexError: 51 | end = None 52 | output.append(Fragment(self.cell.value[start:end], font)) 53 | start = end 54 | return output 55 | 56 | class Fragments(list): 57 | @classmethod 58 | def from_rich_text(self, richtext): 59 | return richtext.fragments 60 | 61 | @property 62 | def value(self): 63 | return ''.join(x.value for x in self) 64 | 65 | def __getattr__(self, v): 66 | if v.startswith('only_'): 67 | sense = True 68 | word = v[5:] 69 | elif v.startswith('not_'): 70 | sense = False 71 | word = v[4:] 72 | else: 73 | raise AttributeError("{!r} object has no attribute {!r}".format(self.__class__.__name__, v)) 74 | if word in ['bold']: 75 | word = '_' + word 76 | return Fragments(frag for frag in self if bool(getattr(frag.font, word)) == sense) 77 | 78 | 79 | class Fragment(object): 80 | def __init__(self, value, font): 81 | self.value = value 82 | self.font = font 83 | 84 | def __repr__(self): 85 | return "<{!r}:{!r}>".format(self.value, self.font) 86 | -------------------------------------------------------------------------------- /databaker/structure_csv_default.py: -------------------------------------------------------------------------------- 1 | """ 2 | Template/Options file for altering the structure of the .csv flatfile output. 
3 | 4 | """ 5 | 6 | import collections 7 | headermeasurements = [ 8 | ('observation', "OBS"), 9 | ('data_marking', "DATAMARKER"), 10 | ('statistical_unit_eng', "STATUNIT"), 'statistical_unit_cym', 11 | ('measure_type_eng', "MEASURETYPE"), 'measure_type_cym', 'observation_type', 'empty', 'obs_type_value', 12 | ('unit_multiplier', "UNITMULTIPLIER"), 13 | ('unit_of_measure_eng', "UNITOFMEASURE"), 'unit_of_measure_cym', 'confidentuality', 'empty1', 14 | ('geographic_area', "GEOG"), 'empty2', 'empty3', 15 | ('time_dim_item_id', "TIME"), 16 | ('time_dim_item_label_eng',"TIME"), 'time_dim_item_label_cym', 17 | ('time_type', "TIMEUNIT"), 'empty4', 18 | ('statistical_population_id', "STATPOP"), 19 | ('statistical_population_label_eng',"STATPOP"), 'statistical_population_label_cym', 20 | ('cdid', "CDID"), 'cdiddescrip', 'empty5', 'empty6', 'empty7', 'empty8', 'empty9', 'empty10', 'empty11', 'empty12' 21 | ] 22 | 23 | headeradditionals = [ 24 | ("dim_id", "NAME"), ("dimension_label_eng", "NAME"), "dimension_label_cym", 25 | ("dim_item_id", "VALUE"), ("dimension_item_label_eng", "VALUE"), "dimension_item_label_cym", 26 | "is_total", "is_sub_total" 27 | ] 28 | 29 | conversionsegmentnumbercolumn = "empty11" 30 | 31 | # Do we want to create a TIMEUNIT dimension using a TIME dimension - ONS specific 32 | SH_Create_ONS_time = True 33 | 34 | # Do you want to split the OBS, placing non float data into your next column. 35 | SH_Split_OBS = "DATAMARKER" # see value set to int value below 36 | 37 | 38 | #### Below this point is derived data (used in old code) from the above tables 39 | 40 | # derive the elements of the headernames above into the values below 41 | headermeasurementnames = list(collections.OrderedDict.fromkeys(k[1] for k in headermeasurements if isinstance(k, tuple))) 42 | headermeasurementnamesSet = set(headermeasurementnames) 43 | 44 | # Create variables (This is terrible!) 45 | # TODO: Do this more cleanly e.g. as in https://stackoverflow.com/q/4859217/ 46 | exec("%s = '%s'" % (", ".join(headermeasurementnames), "', '".join(map(str, headermeasurementnames)))) 47 | exec("SH_Split_OBS = %s" % SH_Split_OBS) 48 | 49 | __all__ = list(headermeasurementnames) # don't expose unnecessary items when using `from foo import *` 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /databaker/tutorial.py: -------------------------------------------------------------------------------- 1 | # Based on altair tutorial loader: 2 | # https://github.com/altair-viz/altair/blob/273a1fcf9cec1956474af755d5fe32f0e3f0aee8/altair/tutorial.py 3 | 4 | # Copyright (c) 2015, Brian E. Granger and Jake Vanderplas 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions are met: 9 | # 10 | # * Redistributions of source code must retain the above copyright notice, this 11 | # list of conditions and the following disclaimer. 12 | # 13 | # * Redistributions in binary form must reproduce the above copyright notice, 14 | # this list of conditions and the following disclaimer in the documentation 15 | # and/or other materials provided with the distribution. 16 | # 17 | # * Neither the name of altair nor the names of its 18 | # contributors may be used to endorse or promote products derived from 19 | # this software without specific prior written permission. 
20 | # 21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | import os 33 | import shutil 34 | 35 | SRC_PATH = os.path.join( 36 |     os.path.split(os.path.abspath(__file__))[0], 37 |     'tutorial') 38 | 39 | DEST_PATH = os.path.relpath('DatabakerTutorial') 40 | 41 | def copy_tutorial(overwrite=False): 42 | """Copy the Databaker tutorial notebooks into ./DatabakerTutorial.""" 43 | if os.path.isdir(DEST_PATH) and overwrite: 44 | print('Removing old tutorial directory: {}'.format(DEST_PATH)) 45 | shutil.rmtree(DEST_PATH, ignore_errors=True) 46 | if os.path.isdir(DEST_PATH): 47 | raise RuntimeError('{} already exists, run with overwrite=True to discard *all* existing files in tutorial directory'.format(DEST_PATH)) 48 | print('Copying notebooks into fresh tutorial directory: {}'.format(DEST_PATH)) 49 | shutil.copytree(SRC_PATH, DEST_PATH) 50 | 51 | 52 | def tutorial(overwrite=False): 53 | """Copy the Databaker tutorial notebooks into ./DatabakerTutorial and show a link in the notebook.""" 54 | copy_tutorial(overwrite=overwrite) 55 | print('Click on the following notebooks to explore the tutorial:') 56 | from IPython.display import FileLinks, display 57 | file_links = FileLinks(path=DEST_PATH, 58 | included_suffixes=['.ipynb'], 59 | recursive=False) 60 | display(file_links) 61 | -------------------------------------------------------------------------------- /databaker/tutorial/Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction\n", 8 | "\n", 9 | "[Databaker](https://github.com/sensiblecodeio/databaker) is an Open Source Python library for converting semi-structured spreadsheets into computer-friendly datatables. The resulting data can be stored in [Pandas data tables](http://pandas.pydata.org/) or the ONS-specific WDA format.\n", 10 | "\n", 11 | "The system is embedded in the interactive programming environment [Jupyter](http://jupyter.org/) for fast prototyping and development, and depends for its spreadsheet processing on [messytables](http://messytables.readthedocs.io/en/latest/) and [xypath](https://github.com/sensiblecodeio/xypath).\n", 12 | "\n", 13 | "Install it with the command:\n", 14 | "\n", 15 | "> `pip3 install databaker`\n", 16 | "\n", 17 | "Your main interaction with databaker is through the Jupyter notebook interface. There are many tutorials elsewhere online that show how to master this system. \n", 18 | "\n", 19 | "Once you have a working program that converts a particular spreadsheet style into the output you want, there are ways to rerun the notebook on other spreadsheets externally or from the command line. 
" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Example\n", 27 | "\n", 28 | "Although Databaker can handle spreadsheets of any size, here is a tiny example from the tutorials to illustrate what it does." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 49 | "\n", 50 | "\n", 51 | "\n", 52 | "\n", 53 | "\n", 54 | "\n", 55 | "\n", 56 | "\n", 57 | "\n", 58 | "\n", 59 | "\n", 60 | "\n", 61 | "
beatles
Date2014.0
CarsPlanesTrains
John2.02.01.0
Paul4.03.02.0
Ringo4.01.03.0
George2.05.05.0
\n", 62 | "\n", 63 | "
\n" 64 | ], 65 | "text/plain": [ 66 | "" 67 | ] 68 | }, 69 | "metadata": {}, 70 | "output_type": "display_data" 71 | } 72 | ], 73 | "source": [ 74 | "from databaker.framework import *\n", 75 | "\n", 76 | "tab = loadxlstabs(\"example1.xls\", \"beatles\", verbose=False)[0]\n", 77 | "savepreviewhtml(tab, verbose=False)\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "## Conversion segments\n", 85 | "Databaker gives you tools to help you write the code to navigate around the spreadsheet and select the cells and their correspondences. \n", 86 | "\n", 87 | "When you are done your code will look like the following. \n", 88 | "\n", 89 | "You can click on the OBS (observation) cells to see how they connect to the headings." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 2, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/html": [ 102 | "
\n", 103 | "\n", 110 | "\n", 111 | "\n", 112 | "\n", 113 | "
OBSTIMEVehiclesName
\n", 114 | "\n", 115 | "\n", 116 | "\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "\n", 121 | "\n", 122 | "\n", 123 | "\n", 124 | "
beatles
Date2014.0
CarsPlanesTrains
John2.02.01.0
Paul4.03.02.0
Ringo4.01.03.0
George2.05.05.0
\n", 125 | "\n", 126 | "
\n", 127 | "\n", 128 | "\n" 156 | ], 157 | "text/plain": [ 158 | "" 159 | ] 160 | }, 161 | "metadata": {}, 162 | "output_type": "display_data" 163 | } 164 | ], 165 | "source": [ 166 | "r1 = tab.excel_ref('B3').expand(RIGHT)\n", 167 | "r2 = tab.excel_ref('A3').fill(DOWN)\n", 168 | "dimensions = [ \n", 169 | " HDim(tab.excel_ref('B1'), TIME, CLOSEST, ABOVE), \n", 170 | " HDim(r1, \"Vehicles\", DIRECTLY, ABOVE), \n", 171 | " HDim(r2, \"Name\", DIRECTLY, LEFT), \n", 172 | " HDimConst(\"Category\", \"Beatles\")\n", 173 | "]\n", 174 | "observations = tab.excel_ref('B4').expand(DOWN).expand(RIGHT).is_not_blank().is_not_whitespace()\n", 175 | "c1 = ConversionSegment(observations, dimensions)\n", 176 | "savepreviewhtml(c1)\n" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## Output in pandas\n", 184 | "[Pandas data tables](http://pandas.pydata.org/) provides an enormous scope for further processing and cleaning of the data. \n", 185 | "\n", 186 | "To make full use of its power you should become familiar with its [Time series functionality](http://pandas.pydata.org/pandas-docs/stable/timeseries.html), which will allows you to plot, resample and align multple data sources at once.\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 3, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [ 196 | { 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "TIMEUNIT='Year'\n" 201 | ] 202 | }, 203 | { 204 | "data": { 205 | "text/html": [ 206 | "
\n", 207 | "\n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | "
OBSTIMETIMEUNITVehiclesNameCategory__x__y__tablename
02.02014YearCarsJohnBeatles13beatles
12.02014YearPlanesJohnBeatles23beatles
21.02014YearTrainsJohnBeatles33beatles
34.02014YearCarsPaulBeatles14beatles
43.02014YearPlanesPaulBeatles24beatles
52.02014YearTrainsPaulBeatles34beatles
64.02014YearCarsRingoBeatles15beatles
71.02014YearPlanesRingoBeatles25beatles
83.02014YearTrainsRingoBeatles35beatles
92.02014YearCarsGeorgeBeatles16beatles
105.02014YearPlanesGeorgeBeatles26beatles
115.02014YearTrainsGeorgeBeatles36beatles
\n", 369 | "
" 370 | ], 371 | "text/plain": [ 372 | " OBS TIME TIMEUNIT Vehicles Name Category __x __y __tablename\n", 373 | "0 2.0 2014 Year Cars John Beatles 1 3 beatles\n", 374 | "1 2.0 2014 Year Planes John Beatles 2 3 beatles\n", 375 | "2 1.0 2014 Year Trains John Beatles 3 3 beatles\n", 376 | "3 4.0 2014 Year Cars Paul Beatles 1 4 beatles\n", 377 | "4 3.0 2014 Year Planes Paul Beatles 2 4 beatles\n", 378 | "5 2.0 2014 Year Trains Paul Beatles 3 4 beatles\n", 379 | "6 4.0 2014 Year Cars Ringo Beatles 1 5 beatles\n", 380 | "7 1.0 2014 Year Planes Ringo Beatles 2 5 beatles\n", 381 | "8 3.0 2014 Year Trains Ringo Beatles 3 5 beatles\n", 382 | "9 2.0 2014 Year Cars George Beatles 1 6 beatles\n", 383 | "10 5.0 2014 Year Planes George Beatles 2 6 beatles\n", 384 | "11 5.0 2014 Year Trains George Beatles 3 6 beatles" 385 | ] 386 | }, 387 | "execution_count": 3, 388 | "metadata": {}, 389 | "output_type": "execute_result" 390 | } 391 | ], 392 | "source": [ 393 | "c1.topandas()" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "## Output in WDA Observation File\n", 401 | "The WDA system in the ONS has been the primary use for this library. If you need output into WDA the result would look like the following:" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 4, 407 | "metadata": { 408 | "collapsed": false 409 | }, 410 | "outputs": [ 411 | { 412 | "name": "stdout", 413 | "output_type": "stream", 414 | "text": [ 415 | "observation,data_marking,statistical_unit_eng,statistical_unit_cym,measure_type_eng,measure_type_cym,observation_type,empty,obs_type_value,unit_multiplier,unit_of_measure_eng,unit_of_measure_cym,confidentuality,empty1,geographic_area,empty2,empty3,time_dim_item_id,time_dim_item_label_eng,time_dim_item_label_cym,time_type,empty4,statistical_population_id,statistical_population_label_eng,statistical_population_label_cym,cdid,cdiddescrip,empty5,empty6,empty7,empty8,empty9,empty10,empty11,empty12,dim_id_1,dimension_label_eng_1,dimension_label_cym_1,dim_item_id_1,dimension_item_label_eng_1,dimension_item_label_cym_1,is_total_1,is_sub_total_1,dim_id_2,dimension_label_eng_2,dimension_label_cym_2,dim_item_id_2,dimension_item_label_eng_2,dimension_item_label_cym_2,is_total_2,is_sub_total_2,dim_id_3,dimension_label_eng_3,dimension_label_cym_3,dim_item_id_3,dimension_item_label_eng_3,dimension_item_label_cym_3,is_total_3,is_sub_total_3\r\n", 416 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Cars,Cars,,,,Name,Name,,John,John,,,,Category,Category,,Beatles,Beatles,,,\r\n", 417 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,John,John,,,,Category,Category,,Beatles,Beatles,,,\r\n", 418 | "1.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,John,John,,,,Category,Category,,Beatles,Beatles,,,\r\n", 419 | "4.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Cars,Cars,,,,Name,Name,,Paul,Paul,,,,Category,Category,,Beatles,Beatles,,,\r\n", 420 | "3.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,Paul,Paul,,,,Category,Category,,Beatles,Beatles,,,\r\n", 421 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,Paul,Paul,,,,Category,Category,,Beatles,Beatles,,,\r\n", 422 | 
"4.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Cars,Cars,,,,Name,Name,,Ringo,Ringo,,,,Category,Category,,Beatles,Beatles,,,\r\n", 423 | "1.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,Ringo,Ringo,,,,Category,Category,,Beatles,Beatles,,,\r\n", 424 | "3.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,Ringo,Ringo,,,,Category,Category,,Beatles,Beatles,,,\r\n", 425 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Cars,Cars,,,,Name,Name,,George,George,,,,Category,Category,,Beatles,Beatles,,,\r\n", 426 | "5.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,George,George,,,,Category,Category,,Beatles,Beatles,,,\r\n", 427 | "5.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,0,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,George,George,,,,Category,Category,,Beatles,Beatles,,,\r\n", 428 | "*********,12\r\n", 429 | "\n" 430 | ] 431 | } 432 | ], 433 | "source": [ 434 | "print(writetechnicalCSV(None, c1))" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "## Further notes\n", 442 | "Databaker has been developed by the [Sensible Code Company](http://sensiblecode.io/) on contract from the [Office of National Statistics](https://www.ons.gov.uk/).\n", 443 | "\n", 444 | "The first version was written in 2014 and ran only as a command line script where previews were made by via a coloured Excel spreadsheet. This version still exists under the [version 1.2.0](https://github.com/sensiblecodeio/databaker/tree/1.2.0) tag and the documentation is hosted [here](https://sensiblecodeio.github.io/quickcode-ons-docs/).\n", 445 | "\n", 446 | "This new version was developed at the end of 2015 to take advantage of the interactive programming capabilities of Jupyter and the freedom not to maintain backward compatibility.\n", 447 | "\n", 448 | "See the remaining tutorial notebooks for more details." 
449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "collapsed": true 456 | }, 457 | "outputs": [], 458 | "source": [] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "Python 3", 464 | "language": "python", 465 | "name": "python3" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": { 469 | "name": "ipython", 470 | "version": 3 471 | }, 472 | "file_extension": ".py", 473 | "mimetype": "text/x-python", 474 | "name": "python", 475 | "nbconvert_exporter": "python", 476 | "pygments_lexer": "ipython3", 477 | "version": "3.5.2" 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 1 482 | } 483 | -------------------------------------------------------------------------------- /databaker/tutorial/blank_template.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from databaker.framework import *" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Loading example1.xls which has size 7168 bytes\n", 26 | "Table names: ['beatles', 'stones']\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "# put your input-output files here\n", 32 | "inputfile = \"example1.xls\"\n", 33 | "outputfile = \"example1.csv\"\n", 34 | "previewfile = \"preview.html\"\n", 35 | "\n", 36 | "tabs = loadxlstabs(inputfile)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# recipe area\n", 48 | "conversionsegments = [ ]\n", 49 | "tab = tabs[0]\n", 50 | "\n", 51 | "obs = tab.excel_ref('B4').expand(DOWN).expand(RIGHT).is_not_blank().is_not_whitespace()\n", 52 | "dimensions = [ \n", 53 | " HDim(tab.excel_ref('B1'), TIME, CLOSEST, ABOVE), \n", 54 | " HDim(tab.excel_ref('B3').expand(RIGHT), \"Vehicles\", DIRECTLY, ABOVE, cellvalueoverride={\"Cars\":\"Car\"}), \n", 55 | " HDim(tab.excel_ref('A3').fill(DOWN), \"Name\", DIRECTLY, LEFT), \n", 56 | " HDimConst(\"All\", \"thing\")\n", 57 | "]\n", 58 | "\n", 59 | "conversionsegment = ConversionSegment(tab, dimensions, obs)\n", 60 | "conversionsegments.append(conversionsegment)\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "{, , , , , , , , , , , }" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "conversionsegment = conversionsegments[0]\n", 83 | "\n", 84 | "hdim = conversionsegment.dimensions[1]\n", 85 | "hdim.hbagset\n", 86 | "conversionsegment.segment" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "{'All': 'thing', 'Vehicles': 'Car', 'Name': 'John', -2: 2014.0, -9: 2.0}\n", 101 | "{'All': 'thing', 'Vehicles': 'Planes', 'Name': 'John', -2: 2014.0, -9: 2.0}\n", 102 | "{'All': 'thing', 'Vehicles': 'Trains', 'Name': 'John', -2: 2014.0, -9: 1.0}\n", 103 | "{'All': 'thing', 'Vehicles': 'Car', 
'Name': 'Paul', -2: 2014.0, -9: 3.0}\n", 104 | "{'All': 'thing', 'Vehicles': 'Planes', 'Name': 'Paul', -2: 2014.0, -9: 3.0}\n", 105 | "{'All': 'thing', 'Vehicles': 'Trains', 'Name': 'Paul', -2: 2014.0, -9: 2.0}\n", 106 | "{'All': 'thing', 'Vehicles': 'Car', 'Name': 'Ringo', -2: 2014.0, -9: 4.0}\n", 107 | "{'All': 'thing', 'Vehicles': 'Planes', 'Name': 'Ringo', -2: 2014.0, -9: 1.0}\n", 108 | "{'All': 'thing', 'Vehicles': 'Trains', 'Name': 'Ringo', -2: 2014.0, -9: 3.0}\n", 109 | "{'All': 'thing', 'Vehicles': 'Car', 'Name': 'George', -2: 2014.0, -9: 2.0}\n", 110 | "{'All': 'thing', 'Vehicles': 'Planes', 'Name': 'George', -2: 2014.0, -9: 5.0}\n", 111 | "{'All': 'thing', 'Vehicles': 'Trains', 'Name': 'George', -2: 2014.0, -9: 5.0}\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "for ob in list(conversionsegment.segment):\n", 117 | " print(conversionsegment.lookupobs(ob))" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "tablepart 'beatles' written #injblock1001\n", 132 | "javascript calculated\n" 133 | ] 134 | }, 135 | { 136 | "data": { 137 | "text/html": [ 138 | "Written to file /home/goatchurch/sensiblecode/quickcode-ons-recipes/helpnotes/preview.html" 139 | ], 140 | "text/plain": [ 141 | "" 142 | ] 143 | }, 144 | "metadata": {}, 145 | "output_type": "display_data" 146 | }, 147 | { 148 | "data": { 149 | "text/html": [ 150 | "
#injblock1002\n", 151 | "\n", 182 | "\n", 183 | "\n", 184 | "Table: beatles \n", 185 | "
OBSTIMEVehiclesName
\n", 186 | "\n", 187 | "\n", 188 | "\n", 189 | "\n", 190 | "\n", 191 | "\n", 192 | "\n", 193 | "\n", 194 | "\n", 195 | "
Date2014.0
CarsPlanesTrains
John2.02.01.0
Paul3.03.02.0
Ringo4.01.03.0
George2.05.05.0
\n", 196 | "\n", 197 | "
\n", 198 | "\n", 199 | "\n" 227 | ], 228 | "text/plain": [ 229 | "" 230 | ] 231 | }, 232 | "metadata": {}, 233 | "output_type": "display_data" 234 | } 235 | ], 236 | "source": [ 237 | "# this is the preview system\n", 238 | "conversionsegment = conversionsegments[0]\n", 239 | "savepreviewhtml(conversionsegment, previewfile)\n", 240 | "savepreviewhtml(conversionsegment)\n" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 7, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "writing 1 conversion segments into /home/goatchurch/sensiblecode/quickcode-ons-recipes/helpnotes/example1.csv\n", 255 | "conversionwrite segment size 12 table 'beatles; TIMEUNIT='Year'\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "writetechnicalCSV(outputfile, conversionsegments)\n" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 8, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/html": [ 273 | "
\n", 274 | "\n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | "
AllNameVehicles-2-9
0thingJohnCar2014.02.0
1thingJohnPlanes2014.02.0
2thingJohnTrains2014.01.0
3thingPaulCar2014.03.0
4thingPaulPlanes2014.03.0
5thingPaulTrains2014.02.0
6thingRingoCar2014.04.0
7thingRingoPlanes2014.01.0
8thingRingoTrains2014.03.0
9thingGeorgeCar2014.02.0
10thingGeorgePlanes2014.05.0
11thingGeorgeTrains2014.05.0
\n", 384 | "
" 385 | ], 386 | "text/plain": [ 387 | " All Name Vehicles -2 -9\n", 388 | "0 thing John Car 2014.0 2.0\n", 389 | "1 thing John Planes 2014.0 2.0\n", 390 | "2 thing John Trains 2014.0 1.0\n", 391 | "3 thing Paul Car 2014.0 3.0\n", 392 | "4 thing Paul Planes 2014.0 3.0\n", 393 | "5 thing Paul Trains 2014.0 2.0\n", 394 | "6 thing Ringo Car 2014.0 4.0\n", 395 | "7 thing Ringo Planes 2014.0 1.0\n", 396 | "8 thing Ringo Trains 2014.0 3.0\n", 397 | "9 thing George Car 2014.0 2.0\n", 398 | "10 thing George Planes 2014.0 5.0\n", 399 | "11 thing George Trains 2014.0 5.0" 400 | ] 401 | }, 402 | "execution_count": 8, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "topandas(conversionsegment)\n" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 9, 414 | "metadata": { 415 | "collapsed": false 416 | }, 417 | "outputs": [ 418 | { 419 | "name": "stdout", 420 | "output_type": "stream", 421 | "text": [ 422 | "observation,data_marking,statistical_unit_eng,statistical_unit_cym,measure_type_eng,measure_type_cym,observation_type,empty,obs_type_value,unit_multiplier,unit_of_measure_eng,unit_of_measure_cym,confidentuality,empty1,geographic_area,empty2,empty3,time_dim_item_id,time_dim_item_label_eng,time_dim_item_label_cym,time_type,empty4,statistical_population_id,statistical_population_label_eng,statistical_population_label_cym,cdid,cdiddescrip,empty5,empty6,empty7,empty8,empty9,empty10,empty11,empty12,dim_id_1,dimension_label_eng_1,dimension_label_cym_1,dim_item_id_1,dimension_item_label_eng_1,dimension_item_label_cym_1,is_total_1,is_sub_total_1,dim_id_2,dimension_label_eng_2,dimension_label_cym_2,dim_item_id_2,dimension_item_label_eng_2,dimension_item_label_cym_2,is_total_2,is_sub_total_2,dim_id_3,dimension_label_eng_3,dimension_label_cym_3,dim_item_id_3,dimension_item_label_eng_3,dimension_item_label_cym_3,is_total_3,is_sub_total_3\r\n", 423 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Car,Car,,,,Name,Name,,John,John,,,,All,All,,thing,thing,,,\r\n", 424 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,John,John,,,,All,All,,thing,thing,,,\r\n", 425 | "1.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,John,John,,,,All,All,,thing,thing,,,\r\n", 426 | "3.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Car,Car,,,,Name,Name,,Paul,Paul,,,,All,All,,thing,thing,,,\r\n", 427 | "3.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,Paul,Paul,,,,All,All,,thing,thing,,,\r\n", 428 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,Paul,Paul,,,,All,All,,thing,thing,,,\r\n", 429 | "4.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Car,Car,,,,Name,Name,,Ringo,Ringo,,,,All,All,,thing,thing,,,\r\n", 430 | "1.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,Ringo,Ringo,,,,All,All,,thing,thing,,,\r\n", 431 | "3.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,Ringo,Ringo,,,,All,All,,thing,thing,,,\r\n", 432 | "2.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Car,Car,,,,Name,Name,,George,George,,,,All,All,,thing,thing,,,\r\n", 433 | "5.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Planes,Planes,,,,Name,Name,,George,George,,,,All,All,,thing,thing,,,\r\n", 434 | 
"5.0,,,,,,,,,,,,,,,,,2014,2014,,Year,,,,,,,,,,,,,,,Vehicles,Vehicles,,Trains,Trains,,,,Name,Name,,George,George,,,,All,All,,thing,thing,,,\r\n", 435 | "*********,12\r\n", 436 | "\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "writetechnicalCSV(None, conversionsegments)\n" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": { 448 | "collapsed": true 449 | }, 450 | "outputs": [], 451 | "source": [] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": { 457 | "collapsed": true 458 | }, 459 | "outputs": [], 460 | "source": [] 461 | } 462 | ], 463 | "metadata": { 464 | "kernelspec": { 465 | "display_name": "Python 3", 466 | "language": "python", 467 | "name": "python3" 468 | }, 469 | "language_info": { 470 | "codemirror_mode": { 471 | "name": "ipython", 472 | "version": 3 473 | }, 474 | "file_extension": ".py", 475 | "mimetype": "text/x-python", 476 | "name": "python", 477 | "nbconvert_exporter": "python", 478 | "pygments_lexer": "ipython3", 479 | "version": "3.5.2" 480 | } 481 | }, 482 | "nbformat": 4, 483 | "nbformat_minor": 1 484 | } 485 | -------------------------------------------------------------------------------- /databaker/tutorial/example1.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cantabular/databaker/930b65e85038a3afd59574bbdc1d4cb79ce9b6ad/databaker/tutorial/example1.xls -------------------------------------------------------------------------------- /databaker/tutorial/nbconvert_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `nbconvert`\n", 8 | "\n", 9 | "`nbconvert` is a Jupyter command line tool that can convert Jupyter notebooks to other output formats, and also execute them before converting. It's very useful for logging \"blind\" processing of a notebook." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "You can use it like:\n", 17 | "\n", 18 | "```sh\n", 19 | "jupyter nbconvert --to html --execute my_notebook.ipynb\n", 20 | "```\n", 21 | "\n", 22 | "which will execute the input cells in `my_notebook.ipynb` and save the entire output as HTML.\n", 23 | "\n", 24 | "[Full documentation](https://nbconvert.readthedocs.io) is available from the developers." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Input filename wrangling\n", 32 | "\n", 33 | "Unfortunately, `nbconvert` is a little limited to implement the `bake.py` usage we used to have in Databaker where you could specify filenames as it does not support passing in arguments to the notebook, e.g. so that you can change a variable, such as filename.\n", 34 | "\n", 35 | "So, we've written a wrapper, `databaker_nbconvert` around this that allows you to specify a notebook filename and an input filename. **The notebook and the input file should be in the same directory.** The notebook filename you specify can be an absolute path, but the input file should be just the filename without any path. Simplest way is to just stick everything in one directory and run `databaker_nbconvert` from there; it should work as a standalone command.\n", 36 | "\n", 37 | "Here's a very simple demo that shows this in action. 
We're not doing any processing of the spreadsheets here; the demo is only designed to show how you could switch a filename at the command line, while still being able to specify the filename within the notebook for development." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "import databaker.framework" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "`databaker.framework.DATABAKER_INPUT_FILE` is just a string of the filename to use. Here we specify the input filename that we're using within this notebook. By default, this is the file that will get used." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "databaker.framework.DATABAKER_INPUT_FILE = 'example1.xls'" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "`getinputfilename()` is a function that gives you back the spreadsheet filename that we've passed to `databaker_nbconvert` or, if that's not the case, it gives us back the `DATABAKER_INPUT_FILE` value specified above.\n", 74 | "\n", 75 | "This way, we can leave `f` unspecified, which allows us to do the following:\n", 76 | "\n", 77 | "* if we process the notebook here, then we will process `example1.xls`.\n", 78 | "\n", 79 | "* if we process with `databaker_nbconvert` with a specified spreadsheet filename, then we override the `example1.xls` here with whichever filename we specified to `databaker_nbconvert`.\n", 80 | "\n", 81 | "(This is actually a little bit of a hack that uses operating system environment variables to pass the values in, and we wrap this in another Python script, so this is transparent to the user, and also simplifies how this works across Windows and Linux.) " 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "f = databaker.framework.getinputfilename()\n", 93 | "print(f)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Below, you'll see the loaded XLS details. If you process this notebook with `databaker_nbconvert` and enter `ott.xls` as a spreadsheet filename, e.g.\n", 101 | "\n", 102 | "```sh\n", 103 | "databaker_nbconvert \"nbconvert_demo.ipynb\" \"ott.xls\"\n", 104 | "```\n", 105 | "\n", 106 | "you'll see that's what gets loaded, not the `example1.xls` we specified above (which is ignored)."
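To picture what the wrapper is doing, here is a minimal sketch of the environment-variable mechanism described above; the variable name `DATABAKER_INPUT_FILE_OVERRIDE` and the helper function are illustrative assumptions, not necessarily what `databaker_nbconvert` actually uses:

```python
import os

def getinputfilename_sketch(default_file):
    # databaker_nbconvert would set an environment variable before invoking
    # nbconvert; inside the notebook we prefer it over the in-notebook default
    return os.environ.get("DATABAKER_INPUT_FILE_OVERRIDE", default_file)

f = getinputfilename_sketch("example1.xls")  # falls back to example1.xls when unset
```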
107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "databaker.framework.loadxlstabs(f)" 118 | ] 119 | } 120 | ], 121 | "metadata": { 122 | "kernelspec": { 123 | "display_name": "Python 3", 124 | "language": "python", 125 | "name": "python3" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 3 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython3", 137 | "version": "3.5.2" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 1 142 | } 143 | -------------------------------------------------------------------------------- /databaker/tutorial/ott.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cantabular/databaker/930b65e85038a3afd59574bbdc1d4cb79ce9b6ad/databaker/tutorial/ott.xls -------------------------------------------------------------------------------- /databaker/tutorial/tutorial_reference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Function reference\n", 8 | "\n", 9 | "This is a one-page comprehensive run-down of all the functions and features in the system to be kept for reference. \n", 10 | "\n", 11 | "By executing `tutorial()` in the notebook you are taking a copy of all the tutorial notebooks, including this one. \n", 12 | "\n", 13 | "So, if a function looks useful but you don't quite understand the description, you should experiment with its input and outputs within this interactive programming environment. 
\n", 14 | "\n", 15 | "However, you need to execute at least the first cell in every section for it to work as it imports the libraries.\n", 16 | "\n", 17 | "## Table of contents\n", 18 | "\n", 19 | "* [Loading and saving](#Loading-and-saving) - The input (excel and WDA files) and output methods (html and WDA files)\n", 20 | "* [Cell bag selection](#Cell-bag-selection) - Selecting and transforming sets of cells in the spreadsheet\n", 21 | "* [Dimensions](#Dimensions) - Turning a mere set of cells into a dimension with look up instructions\n", 22 | "* [Conversion segments](#Conversion-Segments) - The batch of observations and list of dimensions that creates the output\n", 23 | "* [Downloading and unzipping files](#Downloading-excel-and-unzipping-files) - Further routes to full automation\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "# Loading and saving\n", 31 | "\n", 32 | "### tabs = loadxlstabs(inputfile, sheetids=\"*\", verbose=True)\n", 33 | "Load xls file into a list of tables, which act as bags of cells\n", 34 | " \n", 35 | " \n", 36 | "### savepreviewhtml(tab, htmlfilename=None, verbose=True)\n", 37 | "\n", 38 | "Previews a table -- or list of cellbags or conversion segments with the same table -- either inline, or into a separate file.\n", 39 | " \n", 40 | " \n", 41 | "### writetechnicalCSV(outputfile, conversionsegments) \n", 42 | "\n", 43 | "Outputs a WDA format CSV file from a list of conversion segments or pandas dataframes\n", 44 | "\n", 45 | "\n", 46 | "### readtechnicalCSV(wdafile, bverbose=False, baspandas=True)\n", 47 | "\n", 48 | "Reads in an old WDA file into a list of pandas tables, one for each segment\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 7, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "from databaker.framework import *\n", 60 | "\n", 61 | "# put your input-output files here\n", 62 | "inputfile = \"example1.xls\"\n", 63 | "outputfile = \"example1.csv\"\n", 64 | "previewfile = \"preview.html\"\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# Cell bag selection\n", 72 | "These functions generally apply to a table as well as a cell bag, but they always output a cell bag.\n", 73 | "\n", 74 | "A cell bag `bag` always has a pointer to its original table `bag.table`. Howwever, you can access the underlying unordered set of cells of a bag as `bag.unordered_cells`.\n", 75 | "\n", 76 | "**Note** in the examples below, please use `savepreviewhtml(cellbag)` or `savepreviewhtml([cellbagA, cellbagB, ...])` to see what the selections look like in the contents of the table. These have been left out of unused notebook only to save clutter." 
77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Loading example1.xls which has size 8192 bytes\n", 91 | "Table names: ['stones']\n", 92 | "{, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , }\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "from databaker.framework import *\n", 98 | "tab = loadxlstabs(\"example1.xls\", sheetids=\"stones\", verbose=True)[0]\n", 99 | "print(tab)\n" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "### cellbag.is_XXX()\n", 107 | "### cellbag.is_not_XXX()\n", 108 | "\n", 109 | "Returns cells which are or are not a XXX thing.\n", 110 | " \n", 111 | "Allowable functions: \n", 112 | "\n", 113 | "> bold, italic, underline, number, date, whitespace, strikeout, any_border, all_border, richtext\n", 114 | "\n", 115 | "These functions can be chained, eg cellbag.is_not_number().is_not_whitespace()." 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 3, 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "Numbered cells only: {, , , , , , , }\n", 130 | "\n", 131 | "Not numbers: {, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , }\n", 132 | "\n", 133 | "Not numbers and not whitespace: {, , , , , , , , , , , , , , , , , , , , , , , }\n", 134 | "\n", 135 | "Cells that seem to be a date: {, }\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "cellbag = tab\n", 141 | "print(\"Numbered cells only:\", cellbag.is_number())\n", 142 | "print()\n", 143 | "print(\"Not numbers:\", cellbag.is_not_number())\n", 144 | "print()\n", 145 | "print(\"Not numbers and not whitespace:\", cellbag.is_not_number().is_not_whitespace())\n", 146 | "print()\n", 147 | "print(\"Cells that seem to be a date:\", cellbag.is_date())\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### cellbag.filter(word)\n", 155 | "\n", 156 | "Only cells matching this word exactly\n", 157 | "\n", 158 | "### cellbag.filter(function(cell))\n", 159 | "\n", 160 | "Only cells where function(cell) == True\n", 161 | "\n", 162 | "\n", 163 | "### cellbag.one_of([word1, word2])\n", 164 | "\n", 165 | "Only cells matching one of the words\n", 166 | "\n", 167 | "\n", 168 | "### cellbag.regex(regexp)\n", 169 | "\n", 170 | "Only cells matching the regular expression\n", 171 | "\n", 172 | "\n", 173 | "### cellbag.excel_ref(ref)\n", 174 | "\n", 175 | "Selects a cell by its excel Column-Row/Letter-Number format where 'A1' is the top left hand corner.\n", 176 | "\n", 177 | "This also works for single columns or rows (eg 'C', or '3') and ranges (eg 'A2:B3'). \n", 178 | "\n", 179 | "This way of accessing is not recommended unless you know that the spreadsheet you are working with won't have extra rows or columns inserted or deleted from it. \n", 180 | "\n", 181 | "### cellbag.by_index(n)\n", 182 | "\n", 183 | "Selects a single cell from the cell bag of index n, where n=1 is the first element. 
(n can also be a list of integers.)\n", 184 | "\n", 185 | "\n", 186 | "### cellbag.assert_one()\n", 187 | "\n", 188 | "Throws an exception if there is not exactly one cell in this bag (useful for validation if your filter above was supposed to return only one cell)\n", 189 | "\n", 190 | "### cellbag.value\n", 191 | "\n", 192 | "If `len(cellbag) == 1` then cellbag.value gives the original value within that cell, otherwise it throws an exception.\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 20, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "Loading example1.xls which has size 8192 bytes\n", 207 | "Table names: ['stones']\n", 208 | "Get some matching cells {, }\n", 209 | "A3 is {}\n", 210 | "A2:B4 is {, , , , , }\n", 211 | "\n", 212 | "The second cell in the whole table is {}\n", 213 | "Numbers greater than 20 {, , , , }\n", 214 | "Numbers less than 20 {, , }\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "from databaker.framework import * # restated import so you can run from this cell\n", 220 | "cellbag = tab = loadxlstabs(\"example1.xls\", sheetids=\"stones\", verbose=True)[0]\n", 221 | "\n", 222 | "print(\"Get some matching cells\", cellbag.one_of([\"Rocks\", \"ice\", \"mud\"]))\n", 223 | "print(\"A3 is\", cellbag.excel_ref(\"A3\"))\n", 224 | "print(\"A2:B4 is\", cellbag.excel_ref(\"A2:B4\"))\n", 225 | "print()\n", 226 | "print(\"The second cell in the whole table is\", tab.by_index(2))\n", 227 | "\n", 228 | "ngreater20 = cellbag.is_number().filter(lambda c: c.value>20)\n", 229 | "nlessthan20 = cellbag.is_number().filter(lambda c: c.value<20)\n", 230 | "print(\"Numbers greater than 20\", ngreater20)\n", 231 | "print(\"Numbers less than 20\", nlessthan20)\n", 232 | "\n", 233 | "# Uncomment this line to see these selections in contents\n", 234 | "# savepreviewhtml([ngreater20, nlessthan20])\n" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "### cellbag1.union(cellbag2)\n", 242 | "\n", 243 | "Union of two bags. Can also be expressed as `cellbag1 | cellbag2`\n", 244 | "\n", 245 | "### cellbag1.difference(cellbag2)\n", 246 | "\n", 247 | "Difference of two bags. Can also be expressed as `cellbag1 - cellbag2`\n", 248 | "\n", 249 | "### cellbag1.intersection(cellbag2)\n", 250 | "\n", 251 | "Intersection of two bags. 
Can also be expressed as `cellbag1 & cellbag2`" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 121, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "colC {, , }\n", 266 | "rowC {, , , }\n", 267 | "\n", 268 | "Union is {, , , , , }\n", 269 | "Difference is {, }\n", 270 | "Intersection is {}\n", 271 | "\n", 272 | "Union is {, , , , , }\n", 273 | "Difference is {, }\n", 274 | "Intersection is {}\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "colC = tab.excel_ref(\"D3:D5\")\n", 280 | "rowC = tab.excel_ref(\"A4:D4\")\n", 281 | "print(\"colC\", colC)\n", 282 | "print(\"rowC\", rowC)\n", 283 | "print()\n", 284 | "print(\"Union is\", colC.union(rowC))\n", 285 | "print(\"Difference is\", colC.difference(rowC))\n", 286 | "print(\"Intersection is\", colC.intersection(rowC))\n", 287 | "print()\n", 288 | "print(\"Union is\", (colC | rowC))\n", 289 | "print(\"Difference is\", (colC - rowC))\n", 290 | "print(\"Intersection is\", (colC & rowC))\n" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "### cellbag1.waffle(cellbag2)\n", 298 | "\n", 299 | "Get all cells which have a cell from one bag above them, and the other bag to the side. Note that the two bags are interchangeable without changing the output. You can change the direction from its default (DOWN) by specifying direction=LEFT or similar.\n", 300 | "\n", 301 | "### cellbag1.junction(cellbag2)\n", 302 | "\n", 303 | "Enumerates the output of waffle in triplets\n", 304 | "\n", 305 | "\n", 306 | "### cellbag1.same_row(cellbag2)\n", 307 | "\n", 308 | "Get cells in this bag which are in the same row as a cell in the second.\n", 309 | "\n", 310 | "### cellbag1.same_col(cellbag2)\n", 311 | "\n", 312 | "Get cells in this bag which are in the same column as a cell in the second." 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 7, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "Waffle:\n" 327 | ] 328 | }, 329 | { 330 | "data": { 331 | "text/html": [ 332 | "
\n", 333 | "\n", 340 | "\n", 341 | "\n", 342 | "\n", 343 | "
item 0item 1item 2
\n", 344 | "\n", 345 | "\n", 346 | "\n", 347 | "\n", 348 | "\n", 349 | "\n", 350 | "\n", 351 | "\n", 352 | "\n", 353 | "\n", 354 | "\n", 355 | "\n", 356 | "
stones
Date
YearMonthpresentRockscost
1972.0Janyeschalk10.0
Augnogranite30.0
1989.0Febyeslimestone2.0
Marnoshale88.0
Junyesbasalt96.0
Decyesice8.0
\n", 357 | "\n", 358 | "
\n" 359 | ], 360 | "text/plain": [ 361 | "" 362 | ] 363 | }, 364 | "metadata": {}, 365 | "output_type": "display_data" 366 | } 367 | ], 368 | "source": [ 369 | "c = tab.excel_ref(\"D3\") | tab.excel_ref(\"E4\")\n", 370 | "d = tab.excel_ref(\"A6:A7\")\n", 371 | "print(\"Waffle:\")\n", 372 | "savepreviewhtml([c,d, c.waffle(d)])" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 123, 378 | "metadata": { 379 | "collapsed": false 380 | }, 381 | "outputs": [ 382 | { 383 | "name": "stdout", 384 | "output_type": "stream", 385 | "text": [ 386 | "Junction output:\n", 387 | " ({}, {}, {})\n", 388 | " ({}, {}, {})\n", 389 | " ({}, {}, {})\n", 390 | " ({}, {}, {})\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "print(\"Junction output:\")\n", 396 | "for s in c.junction(d):\n", 397 | " print(\" \", s)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 128, 403 | "metadata": { 404 | "collapsed": false 405 | }, 406 | "outputs": [ 407 | { 408 | "name": "stdout", 409 | "output_type": "stream", 410 | "text": [ 411 | "Cells column A that are in same row as {, } are {, }\n", 412 | "Cells column 7 that are in same column as {, } are {, }\n" 413 | ] 414 | } 415 | ], 416 | "source": [ 417 | "print(\"Cells column A that are in same row as\", c, \"are\", tab.excel_ref(\"A\").same_row(c))\n", 418 | "print(\"Cells column 7 that are in same column as\", c, \"are\", tab.excel_ref(\"7\").same_col(c))" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "### cellbag.shift(direction)\n", 426 | "\n", 427 | "Move the selected cells UP, DOWN, LEFT or Right by one cell\n", 428 | "\n", 429 | "### cellbag.shift((dx, dy))\n", 430 | "\n", 431 | "Move the selected cells dx cells to RIGHT and dy cells DOWN (can have negative values)\n", 432 | "\n", 433 | "\n", 434 | "### cellbag.fill(direction)\n", 435 | "\n", 436 | "Take all the cells in one direction from the given cellbag\n", 437 | "\n", 438 | "### cellbag.expand(direction)\n", 439 | "\n", 440 | "All the cells in one direction, including itself.\n", 441 | "\n", 442 | "### cellbag.extrude(dx, dy)\n", 443 | "\n", 444 | "Step and include this many cells between 0 and dx and dy.\n" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 120, 450 | "metadata": { 451 | "collapsed": false 452 | }, 453 | "outputs": [ 454 | { 455 | "name": "stdout", 456 | "output_type": "stream", 457 | "text": [ 458 | "Shift RIGHT from {} is {}\n", 459 | "Shift (-1,-2) from {} is {}\n", 460 | "Fill UP from {} is {, , }\n", 461 | "Expand UP from {} is {, , , }\n", 462 | "\n", 463 | "How it works: UP= (0, -1) DOWN= (0, 1) LEFT= (-1, 0) RIGHT= (1, 0)\n", 464 | "\n", 465 | "Extrude two cells rightwards {, , }\n" 466 | ] 467 | } 468 | ], 469 | "source": [ 470 | "c = tab.excel_ref(\"B4\")\n", 471 | "print(\"Shift RIGHT from\", c, \"is\", c.shift(RIGHT))\n", 472 | "print(\"Shift (-1,-2) from\", c, \"is\", c.shift((-1, -2)))\n", 473 | "print(\"Fill UP from\", c, \"is\", c.fill(UP))\n", 474 | "print(\"Expand UP from\", c, \"is\", c.expand(UP))\n", 475 | "print()\n", 476 | "print(\"How it works: UP=\", UP, \" DOWN=\", DOWN, \" LEFT=\", LEFT, \" RIGHT=\", RIGHT)\n", 477 | "print()\n", 478 | "print(\"Extrude two cells rightwards\", c.extrude(2,0))" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "# Dimensions\n", 486 | "A dimension is simply a cellbag with a label and a lookup direction applied to it. 
\n", 487 | "\n", 488 | "Each dimension represents a column in the output table and basically contains the instructions for how to look up to the corresponding value given a particular cell in the set of observations.\n", 489 | "\n", 490 | "\n", 491 | "### hdim = HDim(cellbag, label, strict=[DIRECTLY|CLOSEST], direction=[ABOVE|BELOW|LEFT|RIGHT])\n", 492 | "\n", 493 | "The main constructor, taking a set of cells, a string name (label), look up condition and lookup directions. \n", 494 | "\n", 495 | "The lookup conditions are:\n", 496 | "* CLOSEST (gets the *first* cell in the same column or row as the observation in a specified direction);\n", 497 | "* DIRECTLY (gets the *closest* cell in the same column or row as the observation in a specified direction).\n", 498 | "\n", 499 | "\n", 500 | "### hdim.cellvalobs(cell)\n", 501 | "\n", 502 | "This function looks up the value of an individual cell in `hdim.hbagset` (defined in the constructor) according to the lookup condition and direction, and returns the pair `(cell, value)` The `value` will always be `cell.value`, unless it has been overridden by some member of `hdim.cellvalueoverride`.\n", 503 | "\n", 504 | "\n", 505 | "### hdim.AddCellValueOverride(overridecell, overridevalue)\n", 506 | "\n", 507 | "This function is an interface to changing the return values alters the `hdim.cellvalueoverride`. It can be used to change the spellings of particular dimension values or to insert new heading cells in place of blank ones. \n", 508 | "\n", 509 | "Inserting header cells is sometimes necessary when a heading is centred and you can't look it up with a single `(strict=CLOSEST, direction=LEFT|RIGHT)` command. (The `direction=NEAREST` feature proved unreliable in the real world.) \n", 510 | "\n", 511 | "\n", 512 | "### hdim.discardcellsnotlookedup(observationcells)\n", 513 | "\n", 514 | "This function uses a set of observation cells to thin out the list of dimension cells `hdim.hbagset` to only those which can be looked up. Can be used to quickly trim out footnote in the bottom of a column that don't make any difference to the final output while making validation easier (see `hdim.checkvalues` below).\n", 515 | "\n", 516 | "\n", 517 | "### hdim.valueslist()\n", 518 | "\n", 519 | "Use this function to print the final heading cells values (the values in `hdim.hbagset` after they are overridden by `hdim.cellvalueoverride` for use in making the validation checks.\n", 520 | "\n", 521 | "\n", 522 | "### hdim.checkvalues(valueslist)\n", 523 | "\n", 524 | "This validates the dimension values against a hard-coded values list that has been generated earlier by `hdim.valueslist()` and throws an exception with an explanation if they are different. 
\n", 525 | "\n", 526 | "Use this function if you need to run your code against different spreadsheets and need to check that the outputs are going to be consistent.\n", 527 | "\n", 528 | "\n", 529 | "### hdimc = HDimConst(label, value)\n", 530 | "\n", 531 | "Create a constant dimension that will give the same value no matter what the observation is looked up.\n", 532 | "\n" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 24, 538 | "metadata": { 539 | "collapsed": false 540 | }, 541 | "outputs": [ 542 | { 543 | "name": "stdout", 544 | "output_type": "stream", 545 | "text": [ 546 | "{, , , , , }\n" 547 | ] 548 | } 549 | ], 550 | "source": [ 551 | "from databaker.framework import *\n", 552 | "tab = loadxlstabs(\"example1.xls\", sheetids=\"stones\", verbose=False)[0]\n", 553 | "\n", 554 | "rocks = tab.filter(\"Rocks\").fill(DOWN)\n", 555 | "years = tab.filter(\"Year\").fill(DOWN).is_not_whitespace()\n", 556 | "cost = tab.filter(\"cost\").fill(DOWN)\n", 557 | "print(rocks)\n", 558 | "\n", 559 | "# savepreviewhtml([rocks, years, cost]) # <-- uncomment this line to see the table" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 31, 565 | "metadata": { 566 | "collapsed": false 567 | }, 568 | "outputs": [ 569 | { 570 | "name": "stdout", 571 | "output_type": "stream", 572 | "text": [ 573 | "{} \t (, '1972.0') \t (, 'chalk')\n", 574 | "{} \t (, '1972.0') \t (, 'gneiss')\n", 575 | "{} \t (, '1989.0') \t (, 'limestone')\n", 576 | "{} \t (, '1989.0') \t (, 'shale')\n", 577 | "{} \t (, '1989.0') \t (, 'basalt')\n", 578 | "{} \t (, '1989.0') \t (, 'ice')\n" 579 | ] 580 | } 581 | ], 582 | "source": [ 583 | "hrocks = HDim(rocks, \"ROCKS!\", DIRECTLY, LEFT)\n", 584 | "hrocks.AddCellValueOverride(\"granite\", \"gneiss\")\n", 585 | "hyears = HDim(years, \"yyyy\", CLOSEST, UP)\n", 586 | "\n", 587 | "for ob in cost:\n", 588 | " print(ob, \"\\t\", hyears.cellvalobs(ob), \"\\t\", hrocks.cellvalobs(ob))\n", 589 | "\n", 590 | "# savepreviewhtml([hrocks, hyears, cost]) # <-- uncomment to see as a coloured table" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "# Conversion Segments\n", 598 | "\n", 599 | "A ConversionSegment is a set of observations together with a list of Dimensions\n", 600 | "\n", 601 | "\n", 602 | "### ConversionSegment(observations, dimensions)\n", 603 | "\n", 604 | "Constructor for the ConversionSegment, where `observations` is a bag of cells and `dimensions` is a list of `HDim` and `HDimConst` dimension objects. You can construct the dimensions at the same time as defining the list at the point when you call this function. \n", 605 | "\n", 606 | "\n", 607 | "### ConversionSegment(observations, dimensions, processTIMEUNIT=True, includecellxy=False)\n", 608 | "\n", 609 | "Two default parameters in the ConversionSegment constructor. `processTIMEUNIT` controls whether a dimension called TIME should be used to automatically set the dimension known as TIMEUNIT. This is required by the WDA output, however its operation can be implemented in pandas. \n", 610 | "\n", 611 | "`includecellxy` causes the output to include three extra columns, `[__x, __y, __tablename]` which can be used for debugging purposes. \n", 612 | "\n", 613 | "### conversionsegment.topandas()\n", 614 | "\n", 615 | "Turns a ConversionSegment into a [pandas.DataFrame](http://pandas.pydata.org/), which is an extremely powerful, efficient and widely used data manipulation library. 
\n", 616 | "\n", 617 | "This marks the place where you depart cleanly from the Databaker library and can go on to further analysis, or it's a temporary entry into a system where the data can be fixed up before outputting it to the WDA format. \n", 618 | "\n", 619 | "\n", 620 | "### savepreviewhtml(conversionsegment, htmlfilename=None, verbose=True)\n", 621 | "\n", 622 | "This function is restated from the Loading-and-saving section to remind you that when you use it on a ConversionSegment the Observation cells are interactive -- click on one to highlight the dimension cells it is looking up to.\n", 623 | "\n", 624 | "Also, overridden values are illustrated by strike-throughs.\n", 625 | "\n", 626 | "\n", 627 | "### writetechnicalCSV(outputfile, conversionsegments) \n", 628 | "\n", 629 | "This function is also restated from the Loading-and-saving section for saving a WDA output file. The argument can be a single ConversionSegment, a list of ConversionSegments or a list of pandas.DataFrames (which have been cleaned up). \n", 630 | "\n", 631 | "\n", 632 | "### Special WDA dimensions\n", 633 | "\n", 634 | "The WDA format contains the following special dimension columns that are output at the front of every row. They are identified by their dimension labels.\n", 635 | "\n", 636 | "For convenience, the variable names have been set to their string names, ie \n", 637 | "> `STATUNIT = \"STATUNIT\"`\n", 638 | "\n", 639 | "See the WDA documentation for their specific uses. \n", 640 | "\n", 641 | "* OBS - This is not a dimension; it's the observation column. Do not name a dimension as \"OBS\"\n", 642 | "* DATAMARKER - If OBS is not a number, then the non-numeric part is stripped off and put into the DATAMARKER column\n", 643 | "* STATUNIT \n", 644 | "* MEASURETYPE \n", 645 | "* UNITMULTIPLIER \n", 646 | "* UNITOFMEASURE\n", 647 | "* GEOG\n", 648 | "* TIME - Of the form \"2010\", \"2010 Q1\" or \"Jan 2010\"\n", 649 | "* TIMEUNIT - \"Year\", \"Quarter\", \"Month\" respectively\n", 650 | "* STATPOP\n", 651 | "\n", 652 | "\n", 653 | "### pdguessforceTIMEUNIT(dataframe)\n", 654 | "\n", 655 | "Find and set the TIMEUNIT column from the TIME column in a pandas.DataFrame. This function has two lines. 
The first line matches the unit from the TIME value:\n", 656 | "```python\n", 657 | "df[\"TIMEUNIT\"] = df.apply(lambda row: Ldatetimeunitloose(row.TIME), axis=1)\n", 658 | "```\n", 659 | "The second line forces the TIME value to conform to the exact format required by the WDA file, given the TIMEUNIT:\n", 660 | "```python\n", 661 | "df[\"TIME\"] = df.apply(lambda row: Ldatetimeunitforce(row.TIME, row.TIMEUNIT), axis=1)\n```" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 39, 667 | "metadata": { 668 | "collapsed": false 669 | }, 670 | "outputs": [ 671 | { 672 | "name": "stdout", 673 | "output_type": "stream", 674 | "text": [ 675 | "2017.0 is\t Year corrected to\t 2017\n", 676 | "Q32017 is\t Quarter corrected to\t 2017 Q3\n", 677 | "Mar 2017 is\t Month corrected to\t Mar 2017\n" 678 | ] 679 | } 680 | ], 681 | "source": [ 682 | "from databaker.framework import *\n", 683 | "\n", 684 | "times = [2017.0, \"Q32017\", \"Mar 2017\"]\n", 685 | "for t in times:\n", 686 | " print(t, \"is\\t\", Ldatetimeunitloose(t), \"corrected to\\t\", Ldatetimeunitforce(t, Ldatetimeunitloose(t)))\n" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": 9, 692 | "metadata": { 693 | "collapsed": false 694 | }, 695 | "outputs": [ 696 | { 697 | "name": "stdout", 698 | "output_type": "stream", 699 | "text": [ 700 | "\n", 701 | "observation,data_marking,statistical_unit_eng,statistical_unit_cym,measure_type_eng,measure_type_cym,observation_type,empty,obs_type_value,unit_multiplier,unit_of_measure_eng,unit_of_measure_cym,confidentuality,empty1,geographic_area,empty2,empty3,time_dim_item_id,time_dim_item_label_eng,time_dim_item_label_cym,time_type,empty4,statistical_population_id,statistical_population_label_eng,statistical_population_label_cym,cdid,cdiddescrip,empty5,empty6,empty7,empty8,empty9,empty10,empty11,empty12\r\n", 702 | "10.0,,,,,,,,,,,,,,,,,Jan 1972,Jan 1972,,Month,,,,,,,,,,,,,0,\r\n", 703 | "30.0,,,,,,,,,,,,,,,,,Aug 1972,Aug 1972,,Month,,,,,,,,,,,,,0,\r\n", 704 | "2.0,,,,,,,,,,,,,,,,,Feb 1989,Feb 1989,,Month,,,,,,,,,,,,,0,\r\n", 705 | "88.0,,,,,,,,,,,,,,,,,Mar 1989,Mar 1989,,Month,,,,,,,,,,,,,0,\r\n", 706 | "96.0,,,,,,,,,,,,,,,,,Jun 1989,Jun 1989,,Month,,,,,,,,,,,,,0,\r\n", 707 | "8.0,,,,,,,,,,,,,,,,,Dec 1989,Dec 1989,,Month,,,,,,,,,,,,,0,\r\n", 708 | "*********,6\r\n", 709 | "\n" 710 | ] 711 | } 712 | ], 713 | "source": [ 714 | "from databaker.framework import *\n", 715 | "tab = loadxlstabs(\"example1.xls\", sheetids=\"stones\", verbose=False)[0]\n", 716 | "\n", 717 | "cs = ConversionSegment(tab.filter(\"cost\").fill(DOWN), [\n", 718 | " HDim(tab.filter(\"Year\").fill(DOWN).is_not_whitespace(), \"year\", CLOSEST, UP),\n", 719 | " HDim(tab.filter(\"Month\").fill(DOWN).is_not_whitespace(), \"month\", DIRECTLY, LEFT)\n", 720 | " ])\n", 721 | "\n", 722 | "\n", 723 | "###################\n", 724 | "# savepreviewhtml(cs) # <-- uncomment this to see the interactive table\n", 725 | "\n", 726 | "dcs = cs.topandas()\n", 727 | "# print(dcs) # uncomment to see the table\n", 728 | "\n", 729 | "# concatenate the month and year into a time\n", 730 | "dcs[\"TIME\"] = dcs.month + \" \" + dcs.year\n", 731 | "pdguessforceTIMEUNIT(dcs) # <-- fixes the date format (removing the '.0's on the years)\n", 732 | "# print(dcs) # uncomment to see the table at this point\n", 733 | "\n", 734 | "# delete the now redundant columns \n", 735 | "dcs.drop(['year', \"month\"], axis=1, inplace=True)\n", 736 | "#print(dcs) # uncomment to see pandas table\n", 737 | "\n", 738 | "# Output the finished 
WDA file where the dates should all work!\n", 739 | "print(writetechnicalCSV(None, dcs))" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "# Downloading excel and unzipping files\n", 747 | "\n", 748 | "If you are doing work on a computer that can actually be done by the computer, then you are not doing real work. \n", 749 | "\n", 750 | "Please automate the webscraping and unzipping of files. \n", 751 | "\n", 752 | "Here are some quick methods for downloading multiple excel spreadsheets linked to from [this page](https://www.ons.gov.uk/businessindustryandtrade/constructionindustry/datasets/outputintheconstructionindustry/current).\n" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 11, 758 | "metadata": { 759 | "collapsed": false 760 | }, 761 | "outputs": [ 762 | { 763 | "name": "stdout", 764 | "output_type": "stream", 765 | "text": [ 766 | "Downloaded a webpage with 31071 bytes\n" 767 | ] 768 | } 769 | ], 770 | "source": [ 771 | "import urllib, re, os\n", 772 | "\n", 773 | "# url containing the index of a set of spreadsheets\n", 774 | "ddurl = \"https://www.ons.gov.uk/businessindustryandtrade/constructionindustry/datasets/outputintheconstructionindustry/current\"\n", 775 | "req1 = urllib.request.Request(ddurl, headers={'User-Agent' : \"Sensible code\"}) \n", 776 | "dhtml = urllib.request.urlopen(req1).read().decode(\"utf8\")\n", 777 | "print(\"Downloaded a webpage with\", len(dhtml), \"bytes\")" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": 20, 783 | "metadata": { 784 | "collapsed": false 785 | }, 786 | "outputs": [], 787 | "source": [ 788 | "# make the download directory\n", 789 | "dfiles = \"downloaddir\"\n", 790 | "if not os.path.isdir(dfiles):\n", 791 | " print(\"making directory\", dfiles)\n", 792 | " os.mkdir(dfiles)" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": 30, 798 | "metadata": { 799 | "collapsed": false 800 | }, 801 | "outputs": [], 802 | "source": [ 803 | "# quick and dirty regular expression for pulling out the links to relevant xls spreadsheets\n", 804 | "xllinklist = re.findall('href=\"(/file\\?uri=/businessindustryandtrade.*?/([^/\"]*\\.xls))\"', dhtml)\n", 805 | " \n", 806 | "for xl, xln in xllinklist:\n", 807 | " lxln = os.path.join(dfiles, xln)\n", 808 | " if os.path.exists(lxln):\n", 809 | " continue # <-- we avoid downloading the same file a second time, in this case\n", 810 | " furl = urllib.parse.urljoin(ddurl, xl)\n", 811 | " req = urllib.request.Request(furl, headers={'User-Agent' : \"Sensible code\"}) \n", 812 | " xp = urllib.request.urlopen(req).read()\n", 813 | " print(\"Downloading\", xln, len(xp), \"bytes\")\n", 814 | " fout = open(lxln, \"wb\")\n", 815 | " fout.write(xp)\n", 816 | " fout.close()\n" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 31, 822 | "metadata": { 823 | "collapsed": false 824 | }, 825 | "outputs": [], 826 | "source": [ 827 | "fnames = [ os.path.join(dfiles, f) for f in os.listdir(dfiles) if f[-4:] == '.xls' ]\n", 828 | "\n", 829 | "print(\"Your list of xls files is:\\n\", \"\\n \".join(fnames))\n" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "## What to do when you have zip files\n", 837 | "\n", 838 | "If you find yourself downloading zipfiles and manually instructing the computer to unzip each file, you should think about making the computer do the work itself.\n", 839 | "\n", 840 | "An example of zipfiles 
containing excel spreadsheets can be found on [this page](https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/workplacepensions/datasets/annualsurveyofhoursandearningspensiontablespensiontypebyagegroupandbygrossweeklyearningsbandsp1).\n", 841 | "\n", 842 | "First job is to download one of these files, as we did above:" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": 36, 848 | "metadata": { 849 | "collapsed": false 850 | }, 851 | "outputs": [ 852 | { 853 | "name": "stdout", 854 | "output_type": "stream", 855 | "text": [ 856 | "We are about to download the file:\n", 857 | " https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/workplacepensions/datasets/annualsurveyofhoursandearningspensiontablespensiontypebyagegroupandbygrossweeklyearningsbandsp1/2015/2015provisionaltablep1.zip\n", 858 | "downloaded.zip is 44560 bytes long.\n" 859 | ] 860 | } 861 | ], 862 | "source": [ 863 | "import urllib, re\n", 864 | "\n", 865 | "# fetch the front page and find the link to the zip file we want\n", 866 | "iurl = \"https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/workplacepensions/datasets/annualsurveyofhoursandearningspensiontablespensiontypebyagegroupandbygrossweeklyearningsbandsp1\"\n", 867 | "req = urllib.request.Request(iurl, headers={'User-Agent' : \"Sensible Code\"}) \n", 868 | "ipage = urllib.request.urlopen(req).read()\n", 869 | "\n", 870 | "# search the link to the zip file and \"join\" against the baseurl to get the full url (there's a space -> %20 bug problem)\n", 871 | "zyears = [ urllib.parse.urljoin(iurl, z.replace(\" \", \"%20\")) for z in re.findall(' 8 | 9 | Where there are multiple sets of dimensions one set must come first 10 | 11 | e.g. 12 | 13 | 4,x=5,y=6 14 | 7,x=12,y=25 15 | 0,x=6,z=0 16 | 12,x=9,z=99 17 | 18 | note all x/y before any x/z 19 | 20 | Likewise if some obs are counts, and some are percentages 21 | 22 | Likewise keep together obs with the same number of dimensions 23 | 24 | 25 | We only care about: 26 | 27 | 1 obs 28 | 2 datamarking -- if obs isn't a number, make it a datamarking 29 | ? geography -- if not E-code, stick it in anyway. 
30 | 18 time value -- will receive a msg 31 | 19 copy 18 32 | 21 time type -- eg "year" 33 | 36+8n dim name -- eg "gender" 34 | 37+8n copy 36 35 | 39+8n dim value -- eg "male" 36 | 40+8n copy 39 37 | 38 | -- discussion with Rob 2015-01-20 39 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | nose==1.3.7 2 | docopt==0.6.2 3 | xypath==1.1.1 4 | xlutils==2.0.0 5 | pyhamcrest==1.9.0 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | long_desc = """ 4 | Transform Excel spreadsheets 5 | """ 6 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers for classifiers 7 | 8 | conf = dict( 9 | name='databaker', 10 | version='2.0.0', 11 | description="DataBaker, part of QuickCode for ONS", 12 | long_description=long_desc, 13 | classifiers=[ 14 | "Development Status :: 3 - Alpha", 15 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", 16 | "Operating System :: POSIX :: Linux", 17 | "Operating System :: Microsoft :: Windows", 18 | "Programming Language :: Python :: 3.4", 19 | "Programming Language :: Python :: 3.5", 20 | "Programming Language :: Python :: 3.6", 21 | ], 22 | keywords='', 23 | author='The Sensible Code Company Ltd', 24 | author_email='feedback@sensiblecode.io', 25 | url='https://github.com/sensiblecodeio/databaker', 26 | license='AGPL', 27 | packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), 28 | namespace_packages=[], 29 | include_package_data=False, 30 | zip_safe=False, 31 | install_requires=['docopt', 'xypath>=1.1.0', 'xlutils', 'pyhamcrest'], 32 | tests_require=[], 33 | entry_points={ 34 | 'console_scripts': [ 35 | 'databaker_nbconvert = databaker.databaker_nbconvert:main', 36 | ] 37 | }, 38 | ) 39 | 40 | 41 | if __name__ == '__main__': 42 | setup(**conf) 43 | --------------------------------------------------------------------------------