├── .gitignore ├── LICENSE ├── README.md ├── src ├── .gitignore ├── TranskribusCommands │ ├── TranskribusDU_transcriptUploader.py │ ├── Transkribus_downloader.py │ ├── Transkribus_uploader.py │ ├── __init__.py │ ├── do_addDocToCollec.py │ ├── do_analyzeLayout.py │ ├── do_analyzeLayoutBatch.py │ ├── do_createCollec.py │ ├── do_deleteCollec.py │ ├── do_deleteJob.py │ ├── do_duplicateDoc.py │ ├── do_export.py │ ├── do_getDocTrp.py │ ├── do_getJobStatus.py │ ├── do_getJobs.py │ ├── do_getRnnTrainingJobStatus.py │ ├── do_htrHmm.py │ ├── do_htrRnn.py │ ├── do_htrRnnPerRegion.py │ ├── do_htrTrainRnn.py │ ├── do_listCollec.py │ ├── do_listHtrHmm.py │ ├── do_listHtrRnn.py │ ├── do_listPageLocks.py │ ├── do_login.py │ ├── do_logout.py │ ├── do_tableTemplate.py │ ├── do_transcript.py │ └── do_uploadDictionary.py ├── TranskribusDU │ └── xml_formats │ │ ├── DS2PageXml.py │ │ ├── Page2DS.py │ │ ├── PageXml.py │ │ ├── PageXmlExtractor.py │ │ ├── __init__.py │ │ ├── mpxml2pxml.py │ │ ├── multipagecontent.xsd │ │ ├── pagecontent.xsd │ │ └── tests │ │ ├── testDS2PageXml │ │ ├── .gitignore │ │ └── RRB_MM_01_033_Jahr_1810.ds.xml │ │ ├── test_DS2PageXml.py │ │ └── test_PageXml.py ├── TranskribusPyClient │ ├── TRP_FullDoc.py │ ├── __init__.py │ ├── application.wadl │ ├── client.html │ ├── client.py │ ├── common │ │ ├── DateTimeRange.py │ │ ├── IntegerRange.py │ │ ├── IntegerRangeHalfBounded.py │ │ ├── __init__.py │ │ └── trace.py │ └── test │ │ ├── __init__.py │ │ ├── test_collections_addDocToCollection.py │ │ ├── test_collections_copyDocToCollection.py │ │ ├── test_collections_fulldoc.py │ │ ├── test_collections_fulldoc_xml.py │ │ ├── test_collections_list.py │ │ ├── test_collections_listEditDeclFeatures.py │ │ └── test_collections_postPageTranscript.py ├── TranskribusPyClient_version.py └── Transkribus_credential.py └── tests ├── .gitignore └── test_commands.sh /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## 
READ 3 | ################# 4 | .trnskrbs 5 | 6 | ################# 7 | ## Eclipse 8 | ################# 9 | .cache 10 | *.pydevproject 11 | .project 12 | .metadata 13 | bin/ 14 | tmp/ 15 | *.tmp 16 | *.bak 17 | *.swp 18 | *~.nib 19 | local.properties 20 | .classpath 21 | .settings/ 22 | .loadpath 23 | 24 | # External tool builders 25 | .externalToolBuilders/ 26 | 27 | # Locally stored "Eclipse launch configurations" 28 | *.launch 29 | 30 | # CDT-specific 31 | .cproject 32 | 33 | # PDT-specific 34 | .buildpath 35 | 36 | 37 | ################# 38 | ## Visual Studio 39 | ################# 40 | 41 | ## Ignore Visual Studio temporary files, build results, and 42 | ## files generated by popular Visual Studio add-ons. 43 | 44 | # User-specific files 45 | *.suo 46 | *.user 47 | *.sln.docstates 48 | 49 | # Build results 50 | 51 | [Dd]ebug/ 52 | [Rr]elease/ 53 | x64/ 54 | build/ 55 | [Bb]in/ 56 | [Oo]bj/ 57 | 58 | # MSTest test Results 59 | [Tt]est[Rr]esult*/ 60 | [Bb]uild[Ll]og.* 61 | 62 | *_i.c 63 | *_p.c 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.pch 68 | *.pdb 69 | *.pgc 70 | *.pgd 71 | *.rsp 72 | *.sbr 73 | *.tlb 74 | *.tli 75 | *.tlh 76 | *.tmp 77 | *.tmp_proj 78 | *.log 79 | *.vspscc 80 | *.vssscc 81 | .builds 82 | *.pidb 83 | *.log 84 | *.scc 85 | 86 | # Visual C++ cache files 87 | ipch/ 88 | *.aps 89 | *.ncb 90 | *.opensdf 91 | *.sdf 92 | *.cachefile 93 | 94 | # Visual Studio profiler 95 | *.psess 96 | *.vsp 97 | *.vspx 98 | 99 | # Guidance Automation Toolkit 100 | *.gpState 101 | 102 | # ReSharper is a .NET coding add-in 103 | _ReSharper*/ 104 | *.[Rr]e[Ss]harper 105 | 106 | # TeamCity is a build add-in 107 | _TeamCity* 108 | 109 | # DotCover is a Code Coverage Tool 110 | *.dotCover 111 | 112 | # NCrunch 113 | *.ncrunch* 114 | .*crunch*.local.xml 115 | 116 | # Installshield output folder 117 | [Ee]xpress/ 118 | 119 | # DocProject is a documentation generator add-in 120 | DocProject/buildhelp/ 121 | DocProject/Help/*.HxT 122 | DocProject/Help/*.HxC 123 | 
DocProject/Help/*.hhc 124 | DocProject/Help/*.hhk 125 | DocProject/Help/*.hhp 126 | DocProject/Help/Html2 127 | DocProject/Help/html 128 | 129 | # Click-Once directory 130 | publish/ 131 | 132 | # Publish Web Output 133 | *.Publish.xml 134 | *.pubxml 135 | *.publishproj 136 | 137 | # NuGet Packages Directory 138 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 139 | #packages/ 140 | 141 | # Windows Azure Build Output 142 | csx 143 | *.build.csdef 144 | 145 | # Windows Store app package directory 146 | AppPackages/ 147 | 148 | # Others 149 | sql/ 150 | *.Cache 151 | ClientBin/ 152 | [Ss]tyle[Cc]op.* 153 | ~$* 154 | *~ 155 | *.dbmdl 156 | *.[Pp]ublish.xml 157 | *.pfx 158 | *.publishsettings 159 | 160 | # RIA/Silverlight projects 161 | Generated_Code/ 162 | 163 | # Backup & report files from converting an old project file to a newer 164 | # Visual Studio version. Backup files are not needed, because we have git ;-) 165 | _UpgradeReport_Files/ 166 | Backup*/ 167 | UpgradeLog*.XML 168 | UpgradeLog*.htm 169 | 170 | # SQL Server files 171 | App_Data/*.mdf 172 | App_Data/*.ldf 173 | 174 | ############# 175 | ## Windows detritus 176 | ############# 177 | 178 | # Windows image file caches 179 | Thumbs.db 180 | ehthumbs.db 181 | 182 | # Folder config file 183 | Desktop.ini 184 | 185 | # Recycle Bin used on file shares 186 | $RECYCLE.BIN/ 187 | 188 | # Mac crap 189 | .DS_Store 190 | 191 | 192 | ############# 193 | ## Python 194 | ############# 195 | 196 | *.py[cod] 197 | 198 | # Packages 199 | *.egg 200 | *.egg-info 201 | dist/ 202 | build/ 203 | eggs/ 204 | parts/ 205 | var/ 206 | sdist/ 207 | develop-eggs/ 208 | .installed.cfg 209 | 210 | # Installer logs 211 | pip-log.txt 212 | 213 | # Unit test / coverage reports 214 | .coverage 215 | .tox 216 | 217 | #Translations 218 | *.mo 219 | 220 | #Mr Developer 221 | .mr.developer.cfg 222 | src/Transkribus_credential.py 223 | *.keep 224 | src/Transkribus_credential.py 225 | /trnskrbs_3820/ 226 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 
36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 
75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. 
If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TranskribusPyClient 2 | 3 | A Pythonic API and some command line tools to access the Transkribus server via its REST API 4 | 5 | ### Requirements, installation & testing 6 | 7 | #### Python 8 | 9 | * Install the latest release of [Python] 2.7.x, 3.5.x or 3.6.x 10 | 11 | ### Additional Libraries 12 | 13 | * python-dateutil 14 | 15 | ### Wiki documentation [https://github.com/Transkribus/TranskribusPyClient/wiki] 16 | 17 | ### Commands ### 18 | 19 | * do_addDocToCollec.py 20 | * do_createCollec.py 21 | * do_deleteCollec.py 22 | * do_deleteJob.py 23 | * do_duplicateDoc.py 24 | * do_getJobStatus.py 25 | * do_listCollec.py 26 | * do_listPageLocks.py 27 | * do_transcript.py 28 | 29 | * do_analyzeLayout.py 30 | * do_tableTemplate.py 31 | * do_htrHmm.py 32 | * do_htrRnn.py 33 | * do_listHtrHmm.py 34 | * do_listHtrRnn.py 35 | 36 | * do_login.py 37 | * do_logout.py 38 | 39 | * Transkribus_downloader.py 40 | 
* TranskribusDU_transcriptUploader.py 41 | 42 | **Help on module client:** 43 | 44 | See in [TranskribusPyClient/client.html](http://htmlpreview.github.com/?https://github.com/Transkribus/TranskribusPyClient/blob/master/src/TranskribusPyClient/client.html 45 | ) 46 | 47 | 48 | [Python]: 49 | [Pip]: 50 | [LIBXML2]: 51 | [TranskribusDU]: 52 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | /Transkribus_credential.py 2 | -------------------------------------------------------------------------------- /src/TranskribusCommands/Transkribus_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | """ 5 | Utility to extract collection or documents from Transkribus and create DS test structures 6 | 7 | Copyright Xerox(C) 2016 H. Déjean, JL. Meunier 8 | 9 | This program is free software: you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation, either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | This program is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program. If not, see <http://www.gnu.org/licenses/>. 21 | 22 | 23 | Developed for the EU project READ. The READ project has received funding 24 | from the European Union's Horizon 2020 research and innovation programme 25 | under grant agreement No 674943. 
class TranskribusDownloader(TranskribusClient):
    """
    Download a Transkribus collection, or one of its documents, into a DS
    folder structure on disk.

    For a collection <colId> and a destination folder <destDir>, the layout is:
        <destDir>/trnskrbs_<colId>/<sCOL>   (the downloaded data)
        <destDir>/trnskrbs_<colId>/xml
        <destDir>/trnskrbs_<colId>/ref
        <destDir>/trnskrbs_<colId>/run
        <destDir>/trnskrbs_<colId>/out
    """
    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl is the URL of the Transkribus server
        sHttpProxy is passed as is to the TranskribusClient 'proxies' parameter
        """
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)

    def createStandardFolders(self, colId, destDir):
        """
        Create the standard DU folder structure and return the collection folder

        The collection folder "trnskrbs_<colId>" and its sub-folders are created
        in destDir when missing. Existing folders are left untouched.

        raise ValueError if destDir is not an existing folder, or if one of the
        expected folders exists but is not a folder.
        """
        if not os.path.isdir(destDir):
            raise ValueError("Non-existing destination folder %s" % destDir)

        colDir = os.path.join(destDir, "trnskrbs_%s" % colId)

        # only the creation of the collection folder itself is traced
        if os.path.exists(colDir):
            if not os.path.isdir(colDir):
                raise ValueError("%s exists and is not a folder." % colDir)
        else:
            traceln('- creating folder: %s' % colDir)
            os.mkdir(colDir)

        for sSubDir in [sCOL, "xml", "ref", "run", "out"]:
            sDir = os.path.join(colDir, sSubDir)
            if os.path.exists(sDir):
                if not os.path.isdir(sDir):
                    raise ValueError("%s exists and is not a folder." % sDir)
            else:
                os.mkdir(sDir)

        return colDir

    def downloadCollection(self, colId, destDir, bForce=False, bNoImage=False, sDocId=None):
        """
        Create the appropriate folder structure and fetch either the whole
        collection or one document (sDocId) into its <sCOL> sub-folder.

        if bForce==True, data on disk is overwritten, otherwise an exception is raised if some data is there already
        if bNoImage==True, do not download the images

        return the tuple (col_max_ts, colDir, ldocids, dFileListPerDoc), where
        all values but colDir are those returned by download_collection.
        """
        colDir = self.createStandardFolders(colId, destDir)

        col_max_ts, ldocids, dFileListPerDoc = self.download_collection(
            colId, os.path.join(colDir, sCOL), bForce, bNoImage, sDocId)

        # "col_max.ts" file
        # NOTE(review): this file is written directly in destDir, not in colDir.
        # It looks unintended but is kept as is - confirm against the code that reads it.
        with open(destDir + os.sep + sCOL + TranskribusClient._POSTFIX_MAX_TX, "w") as fd:
            fd.write("%s" % col_max_ts)

        return col_max_ts, colDir, ldocids, dFileListPerDoc

    def download_document_by_trp(self, colId, docId, destDir, trp_spec, bOverwrite=False, bNoImage=False):
        """
        we have a trp, and download what is specified in it

        The document goes into <destDir>/trnskrbs_<colId>/<sCOL>/<docId>
        bForce is always False here, bOverwrite and bNoImage are passed to download_document.

        return the tuple (doc_max_ts, docFolder, lFileList)
        """
        colDir = self.createStandardFolders(colId, destDir)

        docFolder = os.path.join(colDir, sCOL, str(docId))

        doc_max_ts, lFileList = self.download_document(
            colId, docId, docFolder,
            bForce=False, bOverwrite=bOverwrite, bNoImage=bNoImage,
            trp_spec=trp_spec)
        return doc_max_ts, docFolder, lFileList

    def generateCollectionMultiPageXml(self, colDir, dFileListPerDoc, bStrict):
        """
        We concatenate all pages into a "multi-page PageXml" for each document of the collection

        colDir is the folder containing one sub-folder per document
        dFileListPerDoc maps a document id to the list of basenames of its pages,
            or to None, in which case the document is skipped
        if bStrict==True, an invalid generated XML raises a ValueError, otherwise a warning is traced

        return the list of XML filenames
        """
        lsXmlFilename = list()
        traceln("- Generating multi_page PageXml")
        for docId, lsPageBasename in dFileListPerDoc.items():
            if lsPageBasename is None:
                continue

            docDir = os.path.join(colDir, docId)
            lFiles = [os.path.join(docDir, sBasename + ".pxml") for sBasename in lsPageBasename]
            traceln("\t- %s" % docDir)

            doc = self.makeMultiPageXml(lFiles)

            # the multi-page file sits next to the document folder
            sXmlFilename = docDir + sMPXMLExtension
            self.writeDom(doc, sXmlFilename, True)
            lsXmlFilename.append(sXmlFilename)

            trace("\t\t- validating the MultiPageXml ...")
            if PageXml.MultiPageXml.validate(doc):
                # "Ok!" only when the validation succeeded (it used to follow the warning as well)
                traceln(" Ok!")
            elif bStrict:
                raise ValueError("Invalid XML generated in '%s'" % sXmlFilename)
            else:
                traceln(" *** WARNING: XML file is invalid against the schema: '%s'" % sXmlFilename)

            if DEBUG > 1:
                PageXml.MultiPageXml.splitMultiPageXml(doc, docDir, "debug_%d.xml", bIndent=True)

            traceln('\t- %s' % sXmlFilename)

        return lsXmlFilename

    def makeMultiPageXml(self, slFilenames):
        """
        We concatenate all pages into a "multi-page PageXml"
        return a DOM
        """
        return PageXml.MultiPageXml.makeMultiPageXml(slFilenames)

    def writeDom(self, doc, filename, bIndent=False):
        """
        Write the DOM into the given file, encoded in UTF-8, with an XML declaration.

        NOTE(review): bIndent is ignored, the output is always pretty-printed.
        Kept as is since callers may rely on it - confirm before honoring the flag.
        """
        doc.write(filename, xml_declaration=True, encoding='utf-8', pretty_print=True)
if __name__ == '__main__':
    # Command line: <colid> is mandatory, the destination <directory> defaults to "."
    # (the usage now names both positional arguments and the option values)
    usage = "%s <colid> [-f|--force] [--strict] [--docid <docid>] [--trp <trpfile>] [--noImage] [<directory>]" % sys.argv[0]
    version = "v.03"
    description = "Extract a collection from transkribus and create a DS test structure containing that collection. \n" + _Trnskrbs_description

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS
    __Trnskrbs_basic_options(parser, TranskribusDownloader.sDefaultServerUrl)

    parser.add_option("-f", "--force", dest='bForce', action="store_true", default=False, help="Force rewrite if disk data is obsolete, or force overwrite in --trp mode")
    parser.add_option("--strict", dest='bStrict', action="store_true", default=False, help="Failed schema validation stops the processus.")
    parser.add_option("--noimage", "--noImage", dest='bNoImage', action="store_true", default=False, help="Do not download images.")
    parser.add_option("--docid", dest='docid', action="store", type="int", help="download specific document")
    parser.add_option("--trp", dest='trp', action="store", type="string", help="download the content specified by the trp file.")

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {} if not options.https_proxy else {'https_proxy': options.https_proxy}

    # ---
    # only a missing positional argument leads to the usage message
    try:
        colid = args.pop(0)
    except IndexError:
        _exit(usage, 1)

    destDir = args[0] if args else "."

    # ---
    trnkbs2ds = TranskribusDownloader(options.server, proxies, loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(trnkbs2ds, options, trace=trace, traceln=traceln)

    if options.trp:
        # --trp mode: download one document, as specified by the TRP data
        traceln("- Loading trp data from %s" % options.trp)
        with open(options.trp, "rt", encoding='utf-8') as fd:
            trp = json.load(fd)

        traceln("- Downloading collection %s to folder %s, as specified by trp data" % (colid, os.path.abspath(destDir)))
        if not options.docid:
            options.docid = trp["md"]["docId"]
            traceln(" read docId from TRP: docId = %s" % options.docid)
        logging.basicConfig(level=logging.INFO)
        col_ts, docFolder, lFileList = trnkbs2ds.download_document_by_trp(
            colid, options.docid, destDir, trp,
            bOverwrite=options.bForce, bNoImage=options.bNoImage)
        traceln([sFile.encode('utf-8') for sFile in lFileList])
        colFolder = docFolder  #inaccurate, but fine for rest of code
    else:
        # download the collection (or only --docid) and build one multi-page XML per document
        traceln("- Downloading collection %s to folder %s" % (colid, os.path.abspath(destDir)))
        col_ts, colFolder, ldocids, dFileListPerDoc = trnkbs2ds.downloadCollection(
            colid, destDir,
            bForce=options.bForce, bNoImage=options.bNoImage, sDocId=options.docid)
        trnkbs2ds.generateCollectionMultiPageXml(os.path.join(colFolder, sCOL), dFileListPerDoc, options.bStrict)
        traceln("- Done")

    # keep a record of how the data was obtained
    with open(os.path.join(colFolder, "config.txt"), "w") as fd:
        fd.write("server=%s\nforce=%s\nstrict=%s\ntrp=%s\n" % (options.server, options.bForce, options.bStrict, options.trp))

    traceln('- Done, see in %s' % colFolder)
def _loadJsonFile(sFilename):
    """
    Return the data parsed from the given UTF-8 encoded JSON file.
    The file is closed before returning.
    """
    with open(sFilename, "r", encoding='utf-8') as fd:
        return json.load(fd)


class TranskribusTranscriptUploader(TranskribusClient):
    """
    Upload transcripts from the disk to Transkribus

    The transcripts are read from the single-page ".pxml" files found in
    <sColDSDir>/<docid>/, the pages being listed by a "trp.json" file.
    """
    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl is the URL of the Transkribus server
        sHttpProxy is passed as is to the TranskribusClient 'proxies' parameter
        """
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)

    def uploadCollectionTranscript(self, colid, sColDSDir, sNote="", sToolName="", iVerbose=0, status=None):
        """
        Upload the transcripts of all documents of that collection into Transkribus

        The documents are those listed in <sColDSDir>/trp.json
        raise an Exception if that file does not exist
        return nothing
        """
        if iVerbose:
            traceln("- Uploading all transcripts from folder %s to collection %s"%(sColDSDir, colid))

        trpFilename = os.path.join(sColDSDir, "trp.json")
        traceln(" - reading %s"%trpFilename)
        if not os.path.exists(trpFilename):
            raise Exception("File not found %s. \nData probably created in --trp mode, so upload must be done in --trp mode."%trpFilename)
        trp = _loadJsonFile(trpFilename)

        for dDoc in trp:
            self.uploadDocumentTranscript(colid, dDoc["docId"], sColDSDir,
                                          sNote=sNote, sToolName=sToolName, iVerbose=iVerbose, status=status)

        if iVerbose:
            traceln(" Done (collection %s)"%colid)
        return

    def uploadDocumentTranscript(self, colid, docid, sColDSDir, sNote="", sToolName="", iVerbose=0, status=None):
        """
        Upload the transcripts of one document of that collection into Transkribus

        The pages are those listed in <sColDSDir>/<docid>/trp.json
        raise an Exception if that file does not exist
        return nothing
        """
        trpFilename = os.path.join(sColDSDir, str(docid), "trp.json")
        traceln(" - reading %s"%trpFilename)
        if not os.path.exists(trpFilename):
            raise Exception("File not found %s. \nData probably created in --trp mode, so upload must be done in --trp mode."%trpFilename)
        trp = _loadJsonFile(trpFilename)
        self.uploadDocumentTranscript_by_trp(colid, docid, trp, sColDSDir,
                                             sNote=sNote, sToolName=sToolName, iVerbose=iVerbose, status=status)
        return

    def uploadDocumentTranscript_by_trp(self, colid, docid, trp, sColDSDir, sNote="", sToolName="", iVerbose=0, status=None):
        """
        Upload the transcripts of one document in that collection into Transkribus, as specified by the TRP data
        status = None ==> we get the status from the TRP
        otherwise ==> we set the given status

        If docid is given, it must match the docId of the TRP data, otherwise it is read from the TRP data.
        For each page, the transcript is posted with the first transcript listed in the TRP as parent.

        raise ValueError if the docid does not match, or if the document folder
        or the .pxml file of a page is missing.
        return the list of basenames (without extension) of the pages of the TRP
        """
        if iVerbose:
            traceln("- Uploading as listed in TRP, the transcript(s) of document %s from folder %s to collection %s "%(docid, sColDSDir, colid))

        if docid:
            if str(trp["md"]["docId"]) != str(docid):
                raise ValueError("Document ID does not match docId of TRP data.")
        else:
            docid = trp["md"]["docId"]

        pageList = trp["pageList"]

        docDir = os.path.join(sColDSDir, str(docid))
        if not os.path.exists(docDir):
            raise ValueError("Document directory not found: %s" % docDir)

        lFileList = []
        for dPage in pageList['pages']:
            pagenum = dPage['pageNr']
            logging.info("\t\t- page %s"%pagenum)

            # the transcript file is named after the image of the page
            sBaseName, _ = os.path.splitext(dPage['imgFileName'])
            lFileList.append(sBaseName)

            _trpTranscript0 = dPage['tsList']["transcripts"][0]
            tsId = _trpTranscript0['tsId']
            xmlFilename = docDir + os.sep + sBaseName + ".pxml"
            logging.info("\t\t\t%s"%xmlFilename)
            # explicit check: an assert would be skipped when running with -O
            if not os.path.exists(xmlFilename):
                raise ValueError("Transcript file not found: %s" % xmlFilename)
            with open(xmlFilename, "r", encoding='utf-8') as fd:
                sXMlTranscript = fd.read()
            cur_status = _trpTranscript0["status"] if status is None else status
            traceln("page %5d : %s : %s : %s : %s : %s"%(pagenum, cur_status, sToolName, tsId, sNote, xmlFilename))
            self.postPageTranscript(colid, docid, pagenum, sXMlTranscript,
                                    parentId=tsId, bEncoded=False, sNote=sNote, sToolName=sToolName, status=cur_status)

        if iVerbose:
            traceln(" Done (collection %s, document %s as per TRP)"%(colid, docid))

        return lFileList


def main():
    """
    Command line entry point: upload the transcripts of a collection, or of
    one of its documents, from a DS folder structure to Transkribus.
    """
    # <directory> and <colId> are mandatory, <docId> is optional
    usage = "%s <directory> <colId> [<docId>]"%sys.argv[0]
    version = "v.01"
    description = """Upload the transcript(s) from the DS structure to Transkribus, either of the collection or one of its document(s).
The <directory> must have been created by transkribus_downloader.py and should contain the 'col' directory and a trp.json file for the collection, and one per document (the 'out', 'ref', 'run', 'xml' folders are not used).
The page transcript from the single page PageXml files are uploaded. (The multi-page xml file(s) are ignored))
""" + _Trnskrbs_description

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS
    __Trnskrbs_basic_options(parser, TranskribusTranscriptUploader.sDefaultServerUrl)

    parser.add_option("-q", "--quiet", dest='bQuiet', action="store_true", default=False, help="Quiet mode")
    parser.add_option("--trp", dest='trp', action="store", type="string", help="upload the content specified by the trp file.")
    parser.add_option("--toolname", dest='tool', action="store", type="string", default="", help="Set the Toolname metadata in Transkribus.")
    parser.add_option("--message", dest='message', action="store", type="string", default="", help="Set the message metadata in Transkribus.")
    parser.add_option("--set_status", dest='set_status', action="store", type="string", default=None, help="Set the status of the uploaded trasnscript.")

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {} if not options.https_proxy else {'https_proxy': options.https_proxy}

    iVerbose = 0 if options.bQuiet else 2

    # ---
    # only a missing positional argument leads to the usage message
    try:
        sDSDir = args.pop(0)
    except IndexError:
        _exit(usage, 1)

    # the given folder is either the collection sub-folder itself, or its parent
    if sDSDir.endswith((sCOL, sCOL + os.path.sep)):
        sColDSDir = os.path.abspath(sDSDir)
    else:
        sColDSDir = os.path.abspath(os.path.join(sDSDir, sCOL))
    if not os.path.isdir(sColDSDir):
        raise ValueError("Non-existing folder: %s "%sColDSDir)

    try:
        colid = args.pop(0)
    except IndexError:
        _exit(usage, 1)

    docid = args.pop(0) if args else None

    # ---
    doer = TranskribusTranscriptUploader(options.server, proxies, loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    if options.trp:
        # --trp mode: upload one document, as specified by the TRP data
        trp = _loadJsonFile(options.trp)
        traceln("- Uploading to collection %s, as specified by trp data"%(colid))
        if not docid:
            docid = trp["md"]["docId"]
            traceln(" read docId from TRP: docId = %s"%docid)
        sToolname = options.tool if options.tool else "Transkribus_uploader (--trp)"
        doer.uploadDocumentTranscript_by_trp(colid, docid, trp, sColDSDir,
                                             sNote=options.message, sToolName=sToolname, iVerbose=iVerbose,
                                             status=options.set_status)
    elif docid is None:
        # whole collection
        sToolname = options.tool if options.tool else "Transkribus_uploader"
        doer.uploadCollectionTranscript(colid, sColDSDir,
                                        sNote=options.message, sToolName=sToolname, iVerbose=iVerbose,
                                        status=options.set_status)
    else:
        # one document of the collection
        sToolname = options.tool if options.tool else "Transkribus_uploader (docid)"
        doer.uploadDocumentTranscript(colid, docid, sColDSDir,
                                      sNote=options.message, sToolName=sToolname, iVerbose=iVerbose,
                                      status=options.set_status)

    traceln('- DONE, all transcripts were uploaded. See in collection %s'%colid)


if __name__ == '__main__':
    main()
sCOL = "col"
sMPXMLExtension = ".mpxml"

NS_PAGE_XML = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"


def __Trnskrbs_basic_options(parser, sDefaultServerUrl):
    """
    UTILITY
    Add the usual Transkribus options to a command line option parser:
    -s/--server (defaulting to sDefaultServerUrl), -l/--login, -p/--pwd, --persist and --https_proxy.
    The parser is modified in place; nothing is returned.
    """
    parser.add_option("-s", "--server", dest='server', action="store", type="string", default=sDefaultServerUrl, help="Transkribus server URL")

    # the help text names the credential module as it is called in the description text
    parser.add_option("-l", "--login", dest='login', action="store", type="string", help="Transkribus login (consider storing your credentials in 'Transkribus_credential.py')")
    parser.add_option("-p", "--pwd", dest='pwd', action="store", type="string", help="Transkribus password")

    parser.add_option("--persist", dest='persist', action="store_true", help="Try using an existing persistent session, or log-in and persists the session.")

    parser.add_option("--https_proxy", dest='https_proxy', action="store", type="string", help="proxy, e.g. http://cornillon:8000")


def __Trnskrbs_do_login_stuff(trnskrbs_client, options, trace=None, traceln=None):
    """
    Deal with the login variants:
    - with --persist, first try to reuse a persistent session
    - otherwise (or if that failed) log in with the --login/--pwd options or, when no login
      is given, with the credentials returned by the client's getStoredCredentials

    trace and traceln are optional print methods, only used when DEBUG is set.
    Return True, or let the exception raised by the client propagate.
    """
    bOk = False

    if options.persist:
        #try getting some persistent session token
        if DEBUG and trace: trace(" ---login--- Try reusing persistent session ... ")
        try:
            bOk = trnskrbs_client.reusePersistentSession()
            if DEBUG and traceln: traceln("OK!")
        except Exception:
            # best effort: fall back on a regular login (Ctrl-C and sys.exit are no longer swallowed)
            if DEBUG and traceln: traceln("Failed")

    if not bOk:
        if options.login:
            login, pwd = options.login, options.pwd
        else:
            if DEBUG and trace: trace(" ---login--- no login provided, looking for stored credentials... ")
            login, pwd = trnskrbs_client.getStoredCredentials(bAsk=False)
            if DEBUG and traceln: traceln("OK")

        # guard on the function actually called (the test used to be on traceln)
        if DEBUG and trace: trace(" ---login--- logging onto Transkribus as %s "%login)
        trnskrbs_client.auth_login(login, pwd)
        if DEBUG and traceln: traceln("OK")
        bOk = True

    return bOk


def _exit(usage, status, exc=None):
    """
    Write the usage (if any) and the exception (if any) on stderr, then exit with the given status.
    """
    if usage: sys.stderr.write("ERROR: usage : %s\n"%usage)
    if exc is not None: sys.stderr.write(str(exc))  #any exception?
    sys.exit(status)


def strTabularFormat(lDic, lsKey, sSortKey=None):
    """
    Format as a table a list of dictionary like:
    [
        {
            "modelName": "Marine_Lives",
            "nrOfTokens": 0,
            "isUsableInTranskribus": 1,
            "nrOfDictTokens": 0,
            "nrOfLines": 0,
            "modelId": 45
        },
        ...
    Show only the keys listed in lsKey, one right-aligned column per key, columns separated by '|'.
    The table starts with a header line and a line of dashes; an empty lDic gives only those two lines.
    If given, sSortKey is used to sort the lines of the table (the given list itself is left untouched).
    Return a string.
    """
    lRecord = sorted(lDic, key=lambda x: x[sSortKey]) if sSortKey else lDic

    # column width = longest of the key name and of its values
    # (built as a list so that max() also works when there is no record)
    lWidth = [max([len(k)] + [len(str(rec[k])) for rec in lRecord]) for k in lsKey]

    # something like "%(modelName)25s|%(modelId)13s|..."
    sFmt = "|".join("%%(%s)%ds"%(k, w) for k, w in zip(lsKey, lWidth)) + "\n"

    lsLine = [sFmt % {k: k for k in lsKey}                              # table header
              , sFmt % {k: ("-"*w) for k, w in zip(lsKey, lWidth)}]
    lsLine.extend(sFmt % rec for rec in lRecord)
    return "".join(lsLine)
Meunier 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | 26 | Developed for the EU project READ. The READ project has received funding 27 | from the European Union’s Horizon 2020 research and innovation programme 28 | under grant agreement No 674943. 29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | 35 | #optional: useful if you want to choose the logging level to something else than logging.WARN 36 | import sys, os, logging 37 | from optparse import OptionParser 38 | 39 | try: #to ease the use without proper Python installation 40 | import TranskribusPyClient_version 41 | except ImportError: 42 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 43 | import TranskribusPyClient_version 44 | 45 | from TranskribusPyClient.common.trace import traceln, trace 46 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 47 | from TranskribusPyClient.client import TranskribusClient 48 | 49 | DEBUG = 0 50 | 51 | description = """Add one or several documents stored in Transkribus to another Transkribus collection. 52 | Document(s) and collection are specified by their unique identifier (a number). 
usage = """%s <colId> [ <docId> | <docIdFrom>-<docIdTo> ]+
Documents are specified by a space-separated list of numbers, or number ranges, e.g. 3-36.
"""%sys.argv[0]


class DoAddDocToCollec(TranskribusClient):
    """
    Add a document to another collection.
    """

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl -- Transkribus server URL; the default URL is used when it is empty
        sHttpProxy      -- proxies, passed as is to TranskribusClient
        loggingLevel    -- logging level, passed as is to TranskribusClient
        """
        # honour the given URL: it used to be ignored, so that --server had no effect
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl or _Trnskrbs_default_url, proxies=sHttpProxy, loggingLevel=loggingLevel)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS
    __Trnskrbs_basic_options(parser, _Trnskrbs_default_url)

    #parse the command line
    (options, args) = parser.parse_args()

    # ---
    #target collection
    try:
        colId = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)

    # ---
    # document list: each argument is either a number or a range <from>-<to>, bounds included
    try:
        lDocId = []
        while args:
            chunk = args.pop(0).strip()
            li = chunk.split('-')
            if li and len(li) == 2:
                docId1, docId2 = [int(i) for i in li]
                lDocId.extend( range(docId1,docId2+1) )
            else:
                docId = int(chunk)
                lDocId.append(docId)
    except Exception as e:
        _exit(usage, 2, e)

    # ---
    #credentials and proxy
    proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy}

    # ------------------------------------------------------------------------------------------------
    # connect only once, and only after the arguments were validated
    doer = DoAddDocToCollec(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    trace("- adding to collection '%d' the %d documents: "%(colId, len(lDocId)))
    for docId in lDocId:
        trace(" %d"%docId)
        try:
            doer.addDocToCollection(colId, docId)
        except Exception:
            traceln()
            traceln("ERROR: could not add document '%d' to collection '%d'"%(docId, colId))
            raise   # keep the original traceback
    traceln()
    traceln("- Done for %d documents"%len(lDocId))
# the two optional trailing arguments are numbers: 0 switches off the corresponding segmentation
usage = """%s <colId> <docId> <pages> [<doBlockSeg> [<doLineSeg>]]
"""%sys.argv[0]


class DoLAbatch(TranskribusClient):
    """
    Client running the Transkribus Layout Analysis (LA) in batch mode, through the /LA/analyze service.

    The REST API is described at:
        https://transkribus.eu/TrpServer/Swadl/wadl.html
        https://transkribus.eu/TrpServer/rest/application.wadl

    Valid values for the jobImpl parameter are:
        NcsrLaJob
        CvlLaJob
        CITlabAdvancedLaJob

    The service expects a list of descriptors (XML or JSON) specifying the pages to analyze,
    e.g. for a single page (the region identifiers are optional):
        {"docId":17442,"pageList":{"pages":[{"pageId":400008,"tsId":1243509,"regionIds":[]}]}}
    """
    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl -- Transkribus server URL; the default URL is used when it is empty
        sHttpProxy      -- proxies, passed as is to the underlying clients
        loggingLevel    -- logging level, passed as is to the underlying clients
        """
        # honour the given URL (it used to be ignored) and let both clients talk to the same server
        sServerUrl = trnkbsServerUrl or self.sDefaultServerUrl
        TranskribusClient.__init__(self, sServerUrl=sServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)

        # helper client used to select the transcripts of the requested pages
        self._trpMng = DoTranscript(sServerUrl, sHttpProxy=sHttpProxy, loggingLevel=loggingLevel)

    def buildDescription(self, colId, docpage, trp=None):
        """
        Build the JSON descriptor of the pages to analyze, e.g.
            '{"docId":17442,"pageList":{"pages":[{"pageId":400008,"tsId":1243509,"regionIds":[]}]}}'

        colId   -- collection identifier
        docpage -- "<docId>/<page range>", only used when no trp is given
        trp     -- optional TRP data (parsed JSON); when given, its page list is used as is

        Return the tuple (docId, descriptor as a JSON string).
        """
        jsonDesc = {}

        if trp is None:
            docId, pageRange = docpage.split('/')
            jsonDesc["docId"] = docId
            oPageRange = IntegerRange(pageRange)
            trpObj = self._trpMng.filter(colId, docId, page_filter=oPageRange, bLast=True)
        else:
            trpObj = TRP_FullDoc(trp)

        lPageDesc = []
        jsonDesc["pageList"] = {'pages': lPageDesc}
        for page in trpObj.getPageList():
            # the docId found in the page data overrides the one read from docpage
            jsonDesc["docId"] = page['docId']
            lPageDesc.append({"pageId": page['pageId']
                              , "tsId": page['tsList']['transcripts'][0]['tsId']
                              , "regionIds": []})

        # no 'encoding' argument: it does not exist in Python 3, and utf-8 was the Python 2 default
        return jsonDesc["docId"], json.dumps(jsonDesc)

    def run(self, colId, sDescription, sJobImpl='CITlabAdvancedLaJob', bBlockSeg=True, bLineSeq=True):
        """
        Start the layout analysis job described by sDescription and return what analyzeLayoutNew returns.

        bBlockSeg and bLineSeq now have a default value: parameters without default after
        sJobImpl made this definition a syntax error.
        """
        ret = self.analyzeLayoutNew(colId, sDescription, sJobImpl, bBlockSeg, bLineSeq)
        return ret


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS
    __Trnskrbs_basic_options(parser, DoLAbatch.sDefaultServerUrl)

    # NOTE(review): --region is parsed but not used below; its default used to be the server URL
    parser.add_option("-r", "--region", dest='region', action="store", type="string", default=None, help="apply Layout Analysis (textLine)")
    parser.add_option("--trp", dest='trp_doc', action="store", type="string", default=None, help="use trp doc file")
    parser.add_option("--docid", dest='docid', action="store", type="string", default=None, help="document/pages to be htr'd")

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy}

    # ---
    doer = DoLAbatch(options.server, proxies, loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)
    doer._trpMng.setSessionId(doer._sessionID)

    # ---
    try:
        colId = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)
    try:
        docId = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)
    try:
        sPages = args.pop(0)
    except Exception as e:
        _exit(usage, 1, e)
    # optional flags: a value of 0 disables the block (resp. line) segmentation
    try:
        doNotBlockSeg = int(args.pop(0)) == 0
    except Exception:
        doNotBlockSeg = False
    try:
        doNotLineSeg = int(args.pop(0)) == 0
    except Exception:
        doNotLineSeg = False
    if args: _exit(usage, 2, Exception("Extra arguments to the command"))

    # ---
    # do the job...
    # without --docid, fall back on the positional <docId> and <pages>, which used to be ignored
    sDocPage = options.docid if options.docid else "%d/%s"%(docId, sPages)
    if options.trp_doc:
        with codecs.open(options.trp_doc, "rb", 'utf-8') as fd:
            trpdoc = json.load(fd)
        docId, sPageDesc = doer.buildDescription(colId, sDocPage, trpdoc)
    else:
        docId, sPageDesc = doer.buildDescription(colId, sDocPage)

    # keyword arguments: passed positionally, the first flag landed in the sJobImpl parameter
    jobid = doer.run(colId, sPageDesc, bBlockSeg=not doNotBlockSeg, bLineSeq=not doNotLineSeg)
    traceln(jobid)

    traceln()
    traceln("- Done")
29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | # TranskribusCommands/do_copyDocToCollec.py 3571 3820 8251 8252 35 | 36 | 37 | #optional: useful if you want to choose the logging level to something else than logging.WARN 38 | import sys, os, logging 39 | from optparse import OptionParser 40 | 41 | try: #to ease the use without proper Python installation 42 | import TranskribusPyClient_version 43 | except ImportError: 44 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 45 | import TranskribusPyClient_version 46 | 47 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 48 | from TranskribusPyClient.client import TranskribusClient 49 | from TranskribusPyClient.common.trace import traceln, trace 50 | 51 | DEBUG = 0 52 | 53 | description = """create a Transkribus collection. 
usage = """%s <colName>
"""%sys.argv[0]


class DoCreateCollec(TranskribusClient):
    """
    Client used to create a Transkribus collection.
    """

    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl -- Transkribus server URL; the default URL is used when it is empty
        sHttpProxy      -- proxies, passed as is to TranskribusClient
        loggingLevel    -- logging level, passed as is to TranskribusClient
        """
        # honour the given URL: it used to be ignored, so that --server had no effect
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS
    __Trnskrbs_basic_options(parser, DoCreateCollec.sDefaultServerUrl)

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy}

    # ---
    #name of the collection to create
    try:
        sColName = args[0]
    except Exception as e:
        _exit(usage, 1, e)

    # ---
    doer = DoCreateCollec(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    # do the job...
    try:
        resp = doer.createCollection(sColName)
    except Exception as e:
        _exit("", 1, e)

    traceln("- Done: --> %s"%resp)

    # the response is also printed on stdout
    print (resp)
Meunier 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | 26 | Developed for the EU project READ. The READ project has received funding 27 | from the European Union’s Horizon 2020 research and innovation programme 28 | under grant agreement No 674943. 29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | # TranskribusCommands/do_copyDocToCollec.py 3571 3820 8251 8252 35 | 36 | 37 | #optional: useful if you want to choose the logging level to something else than logging.WARN 38 | import sys, os, logging 39 | from optparse import OptionParser 40 | 41 | try: #to ease the use without proper Python installation 42 | import TranskribusPyClient_version 43 | except ImportError: 44 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 45 | import TranskribusPyClient_version 46 | 47 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 48 | from TranskribusPyClient.client import TranskribusClient 49 | from TranskribusPyClient.common.trace import traceln, trace 50 | 51 | DEBUG = 0 52 | 53 | description = """delete a Transkribus collection. 
usage = """%s <colId>
"""%sys.argv[0]


class DoDeleteCollec(TranskribusClient):
    """
    Client used to delete a Transkribus collection.
    """

    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl -- Transkribus server URL; the default URL is used when it is empty
        sHttpProxy      -- proxies, passed as is to TranskribusClient
        loggingLevel    -- logging level, passed as is to TranskribusClient
        """
        # honour the given URL: it used to be ignored, so that --server had no effect
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS
    __Trnskrbs_basic_options(parser, DoDeleteCollec.sDefaultServerUrl)

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy}

    # ---
    #collection to delete
    try:
        colId = int(args[0])
    except Exception as e:
        _exit(usage, 1, e)

    # ---
    doer = DoDeleteCollec(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    # do the job... (the response of the server is not used)
    try:
        doer.deleteCollection(colId)
    except Exception as e:
        _exit("", 1, e)

    traceln("- Done")
Meunier 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | 26 | Developed for the EU project READ. The READ project has received funding 27 | from the European Union’s Horizon 2020 research and innovation programme 28 | under grant agreement No 674943. 29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | # TranskribusCommands/do_deleteJob.py 35 | 36 | 37 | #optional: useful if you want to choose the logging level to something else than logging.WARN 38 | import sys, os, logging 39 | from optparse import OptionParser 40 | 41 | try: #to ease the use without proper Python installation 42 | import TranskribusPyClient_version 43 | except ImportError: 44 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 45 | import TranskribusPyClient_version 46 | 47 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 48 | from TranskribusPyClient.client import TranskribusClient 49 | from TranskribusPyClient.common.trace import traceln, trace 50 | 51 | DEBUG = 0 52 | 53 | description = """delete a Transkribus job. 
usage = """%s <jobId>
"""%sys.argv[0]


class DoDeleteJob(TranskribusClient):
    """
    Client used to delete a Transkribus job.
    """

    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl -- Transkribus server URL; the default URL is used when it is empty
        sHttpProxy      -- proxies, passed as is to TranskribusClient
        loggingLevel    -- logging level, passed as is to TranskribusClient
        """
        # honour the given URL: it used to be ignored, so that --server had no effect
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS
    __Trnskrbs_basic_options(parser, DoDeleteJob.sDefaultServerUrl)

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy}

    # ---
    #job to delete
    try:
        jobid = int(args[0])
    except Exception as e:
        _exit(usage, 1, e)

    # ---
    doer = DoDeleteJob(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    # do the job...
    try:
        resp = doer.deleteJob(jobid)
    except Exception as e:
        _exit("", 1, e)

    # any other answer than CANCELED is reported as a failure
    if resp != "CANCELED":
        raise Exception("Job status should be CANCELED not '%s'"%resp)

    traceln("- Done")
29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | # TranskribusCommands/do_copyDocToCollec.py 3571 3820 8251 8252 35 | 36 | 37 | #optional: useful if you want to choose the logging level to something else than logging.WARN 38 | import sys, os, logging 39 | from optparse import OptionParser 40 | 41 | try: #to ease the use without proper Python installation 42 | import TranskribusPyClient_version 43 | except ImportError: 44 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 45 | import TranskribusPyClient_version 46 | 47 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 48 | from TranskribusPyClient.client import TranskribusClient, getStoredCredentials 49 | from TranskribusPyClient.common.trace import traceln, trace 50 | 51 | DEBUG = 0 52 | 53 | description = """Copy (duplicate) one or several documents stored in a Transkribus collection to another Transkribus collection. 54 | Document(s) and collections are specified by their unique identifier (a number). 55 | """ + _Trnskrbs_description 56 | 57 | usage = """%s ( | - )+ 58 | Documents are specified by a space-separated list of numbers, or number ranges, e.g. 3-36. 
class DoCopyDocToCollec(TranskribusClient):
    """
    Copy a document from a collection to another
    """
    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl: URL of the Transkribus server (sDefaultServerUrl is used when it is empty or None)
        sHttpProxy     : proxies specification, passed as is to TranskribusClient
        loggingLevel   : logging level, passed as is to TranskribusClient
        """
        # Honour the URL given by the caller: it used to be ignored, which made the --server option useless
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server",  "-l", "--login" ,   "-p", "--pwd",   "--https_proxy"    OPTIONS
    __Trnskrbs_basic_options(parser, DoCopyDocToCollec.sDefaultServerUrl)

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {'https_proxy':options.https_proxy} if options.https_proxy else {}

    # ---
    #source collection
    try:
        colIdFrom = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)
    #target collection
    try:
        colIdTo = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)

    # ---
    # document list: each remaining argument is either a number or a range N-M (bounds included)
    try:
        lDocId = []
        for sChunk in args:
            lsBound = sChunk.strip().split('-')
            if len(lsBound) == 2:
                docId1, docId2 = [int(s) for s in lsBound]
                lDocId.extend( range(docId1, docId2+1) )
            else:
                lDocId.append( int(sChunk) )
    except Exception as e:
        _exit(usage, 2, e)

    # ------------------------------------------------------------------------------------------------
    doer = DoCopyDocToCollec(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    #the only issue is that we need to have the name of each document...
    traceln("- checking existence of each document in source collection '%d'"%(colIdFrom))
    dName_by_docId = { docDic['docId']:docDic['title'] for docDic in doer.listDocsByCollectionId(colIdFrom) }
    #check now, so as to avoid partial copies...
    for docId in lDocId:
        try:
            dName_by_docId[docId]
        except KeyError as e:
            traceln()
            traceln("ERROR: document '%d' is not in source collection '%d'"%(docId, colIdFrom))
            _exit("", 3, e)

    trace("- duplicating from collection %d to collection '%d' the %d documents: "%(colIdFrom, colIdTo, len(lDocId)))
    for docId in lDocId:
        name = dName_by_docId[docId]
        trace(" %d ('%s')"%(docId, name))
        try:
            doer.duplicateDoc(colIdFrom, docId, colIdTo, name)
        except Exception as e:
            traceln()
            traceln("ERROR: could not copy document '%d' from collection '%d' to collection '%d'"%(docId, colIdFrom, colIdTo))
            _exit("", 4, e)
    traceln()
    traceln("- Done for %d documents"%len(lDocId))
class Export(TranskribusClient):
    """
    Transkribus client used by the do_export command: it asks the server to export a document.
    """

    sDefaultServerUrl = _Trnskrbs_default_url

    # Default export parameters, as a JSON string handed over to exportCollection
    params="""
{ "commonPars" : {
    "pages" : "1",
    "doExportDocMetadata" : true,
    "doWriteMets" : true,
    "doWriteImages" : true,
    "doExportPageXml" : true,
    "doExportAltoXml" : true,
    "doExportSingleTxtFiles" : false,
    "doWritePdf" : false,
    "doWriteTei" : false,
    "doWriteDocx" : false,
    "doWriteOneTxt" : false,
    "doWriteTagsXlsx" : false,
    "doWriteTagsIob" : false,
    "doWriteTablesXlsx" : false,
    "doWriteStructureInMets" : false,
    "doCreateTitle" : false,
    "useVersionStatus" : "Latest version",
    "writeTextOnWordLevel" : false,
    "doBlackening" : false,
    "selectedTags" : [ "add", "date", "Address", "human_production", "supplied", "work", "unclear", "sic", "structure", "div", "highlight", "place1", "regionType", "speech", "person", "gap", "organization", "comment", "abbrev", "place", "add1", "Initial", "lat" ],
    "font" : "FreeSerif",
    "splitIntoWordsInAltoXml" : true,
    "pageDirName" : "page",
    "fileNamePattern" : "${filename}",
    "useHttps" : true,
    "remoteImgQuality" : "orig",
    "doOverwrite" : true,
    "useOcrMasterDir" : true,
    "exportTranscriptMetadata" : true,
    "updatePageXmlImageDimensions" : false
  },
  "altoPars" : {
    "splitIntoWordsInAltoXml" : true
  },
  "pdfPars" : {
    "doPdfImagesOnly" : false,
    "doPdfImagesPlusText" : true,
    "doPdfWithTextPages" : false,
    "doPdfWithTags" : false,
    "doPdfWithArticles" : false,
    "doPdfA" : false,
    "pdfImgQuality" : "view"
  },
  "docxPars" : {
    "doDocxWithTags" : false,
    "doDocxPreserveLineBreaks" : false,
    "doDocxForcePageBreaks" : false,
    "doDocxMarkUnclear" : false,
    "doDocxKeepAbbrevs" : false,
    "doDocxExpandAbbrevs" : false,
    "doDocxSubstituteAbbrevs" : false,
    "doDocxWriteFilenames" : false,
    "doDocxIgnoreSuppliedTag" : false,
    "doDocxShowSuppliedTagWithBrackets" : false
  }
}
"""

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl: URL of the Transkribus server (sDefaultServerUrl is used when it is empty or None)
        sHttpProxy     : proxies specification, passed as is to TranskribusClient
        loggingLevel   : logging level, passed as is to TranskribusClient
        """
        # Honour the URL given by the caller: it used to be ignored, which made the --server option useless
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)

    def run(self, colId, docid,sParams):
        """
        Call exportCollection for the given collection and document, with the given parameters (see Export.params)
        return what exportCollection returns (the caller reports it as a job ID)
        """
        return self.exportCollection(colId, docid, sParams)


if __name__ == '__main__':
    version = "v.01"
    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server",  "-l", "--login" ,   "-p", "--pwd",   "--https_proxy"    OPTIONS
    __Trnskrbs_basic_options(parser, Export.sDefaultServerUrl)

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {'https_proxy':options.https_proxy} if options.https_proxy else {}

    # ---
    doer = Export(options.server, proxies, loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    #positional arguments: the collection (a number) then the document (kept as given)
    try:
        colId = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)
    try:
        docid = args.pop(0)
    except Exception as e:
        _exit(usage, 1, e)
    if args:
        _exit(usage, 2, Exception("Extra arguments to the command"))

    # ---
    # do the job...
    jobid = doer.run(colId, docid, doer.params)
    traceln("job ID:",jobid)
    traceln("- Done")
23 | 24 | 25 | Developed for the EU project READ. The READ project has received funding 26 | from the European Union’s Horizon 2020 research and innovation programme 27 | under grant agreement No 674943. 28 | 29 | """ 30 | from __future__ import absolute_import 31 | from __future__ import print_function 32 | from __future__ import unicode_literals 33 | # TranskribusCommands/do_LAbatch.py 3571 3820 8251 8252 34 | 35 | 36 | #optional: useful if you want to choose the logging level to something else than logging.WARN 37 | import sys, os, logging 38 | from optparse import OptionParser 39 | import json 40 | 41 | try: #to ease the use without proper Python installation 42 | import TranskribusPyClient_version 43 | except ImportError: 44 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 45 | import TranskribusPyClient_version 46 | 47 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 48 | from TranskribusPyClient.client import TranskribusClient 49 | from TranskribusPyClient.common.IntegerRange import IntegerRange as PageRangeSpec 50 | from TranskribusPyClient.common.trace import traceln, trace 51 | 52 | DEBUG = 0 53 | 54 | description = """Get the TRP of a document 55 | """ + _Trnskrbs_description 56 | 57 | usage = """%s [] -n 58 | Return the so-called TRP of all or certain pages, optionally with the given number of transcript(s) per page (-1 means all). 
class DoGetDocTrp(TranskribusClient):
    """
    Transkribus client used by the do_getDocTrp command: it fetches the TRP of a document.
    """
    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl: URL of the Transkribus server (sDefaultServerUrl is used when it is empty or None)
        sHttpProxy     : proxies specification, passed as is to TranskribusClient
        loggingLevel   : logging level, passed as is to TranskribusClient
        """
        # Honour the URL given by the caller: it used to be ignored, which made the --server option useless
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)

    def run(self, colId, docId, nrOfTranscripts=1):
        """
        Call getDocById for the given collection and document, asking for nrOfTranscripts transcript(s)
        return what getDocById returns
        """
        return self.getDocById(colId, docId, nrOfTranscripts)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server",  "-l", "--login" ,   "-p", "--pwd",   "--https_proxy"    OPTIONS
    __Trnskrbs_basic_options(parser, DoGetDocTrp.sDefaultServerUrl)
    parser.add_option("-n", "--n" , dest='nbTranscript', action="store", type="int", default=1, help="Number of transcripts")

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {'https_proxy':options.https_proxy} if options.https_proxy else {}

    # ---
    doer = DoGetDocTrp(options.server, proxies, loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    try:
        colId = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)
    try:
        docId = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)
    #the page range is optional
    sPageRangeSpec = args.pop(0) if args else None
    if args:
        _exit(usage, 2, Exception("Extra arguments to the command"))

    oPageRange = PageRangeSpec(sPageRangeSpec) if sPageRangeSpec else None

    # ---
    # do the job...
    resp = doer.run(colId, docId, nrOfTranscripts=options.nbTranscript)
    if oPageRange:
        traceln("Filtering response as per page specification: %s"%oPageRange)
        #let's filter the response (not super efficient but easy to code...)
        resp["pageList"]["pages"] = [ dPage for dPage in resp["pageList"]["pages"] if dPage["pageNr"] in oPageRange ]

    print (json.dumps(resp, sort_keys=True, indent=4, separators=(',', ': ')))

    traceln()
    traceln("- Done")
29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | # TranskribusCommands/do_deleteJob.py 35 | 36 | 37 | #optional: useful if you want to choose the logging level to something else than logging.WARN 38 | import sys, os, logging 39 | from optparse import OptionParser 40 | 41 | try: #to ease the use without proper Python installation 42 | import TranskribusPyClient_version 43 | except ImportError: 44 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 45 | import TranskribusPyClient_version 46 | 47 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 48 | from TranskribusPyClient.client import TranskribusClient 49 | from TranskribusPyClient.common.trace import traceln, trace 50 | 51 | import json 52 | DEBUG = 0 53 | 54 | description = """Get the status of a Transkribus job. 
# NOTE: this file (do_getJobStatus.py) only queries a job status; the class name looks like a
#       leftover from do_deleteJob.py and is kept unchanged so that existing imports keep working.
class DoDeleteJob(TranskribusClient):
    """
    Transkribus client used by the do_getJobStatus command.
    """

    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl: URL of the Transkribus server (sDefaultServerUrl is used when it is empty or None)
        sHttpProxy     : proxies specification, passed as is to TranskribusClient
        loggingLevel   : logging level, passed as is to TranskribusClient
        """
        # Honour the URL given by the caller: it used to be ignored, which made the --server option useless
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server",  "-l", "--login" ,   "-p", "--pwd",   "--https_proxy"    OPTIONS
    __Trnskrbs_basic_options(parser, DoDeleteJob.sDefaultServerUrl)

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {'https_proxy':options.https_proxy} if options.https_proxy else {}

    # ---
    #the job to look at: a missing argument gives an IndexError, a non-numeric one a ValueError
    try:
        jobid = int(args[0])
    except (IndexError, ValueError) as e:
        _exit(usage, 1, e)

    # ---
    doer = DoDeleteJob(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    # do the job...
    try:
        resp = doer.getJobStatus(jobid)
    except Exception as e:
        _exit("", 1, e)
    traceln( json.dumps(resp, sort_keys=True, indent=4, separators=(',', ': ')))
Meunier 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see <http://www.gnu.org/licenses/>. 24 | 25 | 26 | Developed for the EU project READ. The READ project has received funding 27 | from the European Union’s Horizon 2020 research and innovation programme 28 | under grant agreement No 674943. 29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | # TranskribusCommands/do_deleteJob.py 35 | 36 | 37 | #optional: useful if you want to choose the logging level to something else than logging.WARN 38 | import sys, os, logging 39 | from optparse import OptionParser 40 | 41 | try: #to ease the use without proper Python installation 42 | import TranskribusPyClient_version 43 | except ImportError: 44 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 45 | import TranskribusPyClient_version 46 | 47 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 48 | from TranskribusPyClient.client import TranskribusClient 49 | from TranskribusPyClient.common.trace import traceln, trace 50 | 51 | import json 52 | 53 | DEBUG = 0 54 | 55 | 56 | description = """Get the status of a Transkribus job. 
class DoGetJobs(TranskribusClient):
    """
    Transkribus client used by the do_getJobs command.
    """

    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl: URL of the Transkribus server (sDefaultServerUrl is used when it is empty or None)
        sHttpProxy     : proxies specification, passed as is to TranskribusClient
        loggingLevel   : logging level, passed as is to TranskribusClient
        """
        # Honour the URL given by the caller: it used to be ignored, which made the --server option useless
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server",  "-l", "--login" ,   "-p", "--pwd",   "--https_proxy"    OPTIONS
    __Trnskrbs_basic_options(parser, DoGetJobs.sDefaultServerUrl)

    # ---
    #parse the command line (this command takes no positional argument)
    (options, args) = parser.parse_args()
    proxies = {'https_proxy':options.https_proxy} if options.https_proxy else {}

    # ---
    doer = DoGetJobs(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    # do the job...
    try:
        resp = doer.getJobs()
    except Exception as e:
        _exit("", 1, e)
    traceln( json.dumps(resp, sort_keys=True, indent=4, separators=(',', ': ')))
Meunier 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see <http://www.gnu.org/licenses/>. 24 | 25 | 26 | Developed for the EU project READ. The READ project has received funding 27 | from the European Union’s Horizon 2020 research and innovation programme 28 | under grant agreement No 674943. 29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | 35 | # TranskribusCommands/do_deleteJob.py 36 | 37 | 38 | #optional: useful if you want to choose the logging level to something else than logging.WARN 39 | import sys, os, logging 40 | from optparse import OptionParser 41 | 42 | try: #to ease the use without proper Python installation 43 | import TranskribusPyClient_version 44 | except ImportError: 45 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 46 | import TranskribusPyClient_version 47 | 48 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 49 | from TranskribusPyClient.client import TranskribusClient 50 | from TranskribusPyClient.common.trace import traceln, trace 51 | 52 | import json 53 | DEBUG = 0 54 | 55 | description = """Get the status of a Transkribus job. 
# NOTE: this file (do_getRnnTrainingJobStatus.py) only queries a job status; the class name looks
#       like a leftover from do_deleteJob.py and is kept unchanged so that existing imports keep working.
class DoDeleteJob(TranskribusClient):
    """
    Transkribus client used by the do_getRnnTrainingJobStatus command.
    """

    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl: URL of the Transkribus server (sDefaultServerUrl is used when it is empty or None)
        sHttpProxy     : proxies specification, passed as is to TranskribusClient
        loggingLevel   : logging level, passed as is to TranskribusClient
        """
        # Honour the URL given by the caller: it used to be ignored, which made the --server option useless
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server",  "-l", "--login" ,   "-p", "--pwd",   "--https_proxy"    OPTIONS
    __Trnskrbs_basic_options(parser, DoDeleteJob.sDefaultServerUrl)

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {'https_proxy':options.https_proxy} if options.https_proxy else {}

    # ---
    #the job to look at: a missing argument gives an IndexError, a non-numeric one a ValueError
    try:
        jobid = int(args[0])
    except (IndexError, ValueError) as e:
        _exit(usage, 1, e)

    # ---
    doer = DoDeleteJob(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    # do the job...
    try:
        resp = doer.getJobStatus(jobid)
    except Exception as e:
        _exit("", 1, e)
    #only the 'description' entry of the job status is shown
    traceln( json.dumps(resp['description'], sort_keys=True, indent=4, separators=(',', ': ')))
28 | 29 | """ 30 | from __future__ import absolute_import 31 | from __future__ import print_function 32 | from __future__ import unicode_literals 33 | # TranskribusCommands/do_copyDocToCollec.py 3571 3820 8251 8252 34 | 35 | 36 | #optional: useful if you want to choose the logging level to something else than logging.WARN 37 | import sys, os, logging 38 | from optparse import OptionParser 39 | # import json 40 | 41 | try: #to ease the use without proper Python installation 42 | import TranskribusPyClient_version 43 | except ImportError: 44 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 45 | import TranskribusPyClient_version 46 | 47 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 48 | from TranskribusPyClient.client import TranskribusClient 49 | from TranskribusPyClient.common.trace import traceln, trace 50 | 51 | DEBUG = 0 52 | 53 | description = """Apply an HTR model. 54 | 55 | The syntax for specifying the page range is: 56 | - one or several specifiers separated by a comma 57 | - one separator is a page number, or a range of page number, e.g. 
class DoHtr(TranskribusClient):
    """
    Transkribus client used by the do_htrHmm command: it applies an HTR model to a document.
    """
    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl: URL of the Transkribus server (sDefaultServerUrl is used when it is empty or None)
        sHttpProxy     : proxies specification, passed as is to TranskribusClient
        loggingLevel   : logging level, passed as is to TranskribusClient
        """
        # Honour the URL given by the caller: it used to be ignored, which made the --server option useless
        TranskribusClient.__init__(self, sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel)

    def run(self, sModelName, colId, docId, sPages):
        """
        Call rehtrDecode for the given model, collection, document and page specification (sPages may be None)
        return what rehtrDecode returns
        """
        # beware: rehtrDecode takes the collection first, then the model name
        return self.rehtrDecode(colId, sModelName, docId, sPages)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server",  "-l", "--login" ,   "-p", "--pwd",   "--https_proxy"    OPTIONS
    __Trnskrbs_basic_options(parser, DoHtr.sDefaultServerUrl)

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {'https_proxy':options.https_proxy} if options.https_proxy else {}

    # ---
    doer = DoHtr(options.server, proxies, loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    try:
        sModelName = args.pop(0)
    except Exception as e:
        _exit(usage, 1, e)
    try:
        colId = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)
    try:
        docId = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)
    #the page range is optional
    sPages = args.pop(0) if args else None
    if args:
        _exit(usage, 2, Exception("Extra arguments to the command"))

    # ---
    # do the job...
    jobid = doer.run(sModelName, colId, docId, sPages)
    traceln(jobid)

    traceln()
    traceln("- Done")
28 | 29 | """ 30 | from __future__ import absolute_import 31 | from __future__ import print_function 32 | from __future__ import unicode_literals 33 | # TranskribusCommands/do_copyDocToCollec.py 3571 3820 8251 8252 34 | 35 | 36 | #optional: useful if you want to choose the logging level to something else than logging.WARN 37 | import sys, os, logging 38 | from optparse import OptionParser 39 | from io import open 40 | 41 | import json 42 | 43 | try: #to ease the use without proper Python installation 44 | import TranskribusPyClient_version 45 | except ImportError: 46 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 47 | import TranskribusPyClient_version 48 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 49 | from TranskribusPyClient.client import TranskribusClient 50 | 51 | from TranskribusCommands.do_transcript import DoTranscript 52 | 53 | from TranskribusPyClient.common.IntegerRange import IntegerRange 54 | from TranskribusPyClient.TRP_FullDoc import TRP_FullDoc 55 | 56 | from TranskribusPyClient.common.trace import traceln, trace 57 | 58 | DEBUG = 0 59 | 60 | description = """Apply an HTR RNN model. 61 | 62 | The syntax for specifying the page range is: 63 | - one or several specifiers separated by a comma 64 | - one separator is a page number, or a range of page number, e.g. 3-8 65 | - Examples: 1 1,3,5 1-3 1,3,5-99,100 66 | """ + _Trnskrbs_description 67 | 68 | usage = """%s (--trp TRP_FILE | --docid DOCID) 69 | """%sys.argv[0] 70 | 71 | class DoHtrRnn(TranskribusClient): 72 | """ 73 | 10/16/2017: at region level 74 | {"docId":2278,"pageList":{"pages":[{"pageId":10070,"tsId":25143,"regionIds":["r2","r1"]}]}} 75 | 76 | Our client sends it like this: 77 | 78 | 3 > POST 79 | https://transkribus.eu/TrpServerTesting/rest/recognition/2/241/htrCITlab?id=2278 80 | 3 > Accept: text/plain 81 | ... 
82 | 3 > Content-Type: application/json 83 | 3 > Cookie: $Version=1;JSESSIONID=.... 84 | {"docId":2278,"pageList":{"pages":[{"pageId":10070,"tsId":25143,"regionIds":["r2","r1"]}]}} 85 | 86 | 87 | """ 88 | sDefaultServerUrl = _Trnskrbs_default_url 89 | #--- INIT ------------------------------------------------------------------------------------------------------------- 90 | def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN): 91 | TranskribusClient.__init__(self, sServerUrl=self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel) 92 | 93 | self._trpMng = DoTranscript(self.sDefaultServerUrl, sHttpProxy=sHttpProxy, loggingLevel=loggingLevel) 94 | 95 | def run(self, sModelID, colId, docId, sDescPages,bPyLaia): 96 | ret = self.htrRnnDecode(colId, sModelID, docId, sDescPages,bPyLaia) 97 | return ret 98 | 99 | def buildDescription(self,colId,docpage,trp=None): 100 | """ 101 | '{"docId":17442,"pageList":{"pages":[{"pageId":400008,"tsId":1243509,"regionIds":[]}]}}' 102 | """ 103 | jsonDesc = {} 104 | 105 | if trp is None: 106 | try: docId,pageRange= docpage.split('/') 107 | except ValueError: docId=docpage; pageRange = "" 108 | jsonDesc["docId"]=docId 109 | oPageRange = IntegerRange(pageRange) 110 | trpObj = self._trpMng.filter(colId,docId,page_filter=oPageRange,bLast=True) 111 | else: 112 | trpObj = TRP_FullDoc(trp) 113 | jsonDesc["pageList"]={} 114 | # pList= trpObj.getTranscriptList() 115 | jsonDesc["pageList"]['pages']= [] 116 | for page in trpObj.getPageList(): 117 | docId = page['docId'] 118 | jsonDesc["docId"]=page['docId'] 119 | jsonDesc["pageList"]['pages'].append({"pageId":page['pageId'],"tsId":page['tsList']['transcripts'][0]['tsId'],"regionIds":[]}) 120 | 121 | 122 | # return jsonDesc["docId"], json.dumps(jsonDesc,encoding='utf-8') 123 | return jsonDesc["docId"], json.dumps(jsonDesc) 124 | 125 | if __name__ == '__main__': 126 | version = "v.01" 127 | 128 | #prepare for the parsing of the command line 129 | parser = 
OptionParser(usage=usage, version=version) 130 | parser.description = description 131 | 132 | #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS 133 | __Trnskrbs_basic_options(parser, DoHtrRnn.sDefaultServerUrl) 134 | 135 | parser.add_option("-r", "--region" , dest='region', action="store", type="string", default=DoHtrRnn.sDefaultServerUrl, help="apply HTR at region level") 136 | parser.add_option("--trp" , dest='trp_doc', action="store", type="string",default=None, help="use trp doc file") 137 | parser.add_option("--docid" , dest='docid' , action="store", type="string", default=None, help="document/pages to be htr'd") 138 | parser.add_option("--tempdict" , dest='dictTemp' , action="store_true", default=False, help="use tempDict folder") 139 | parser.add_option("--pylaia" , dest='bPylaia' , action="store_true", default=True, help="use PyLaia model") 140 | 141 | # --- 142 | #parse the command line 143 | (options, args) = parser.parse_args() 144 | proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy} 145 | 146 | # --- 147 | doer = DoHtrRnn(options.server, proxies, loggingLevel=logging.WARN) 148 | __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln) 149 | doer._trpMng.setSessionId(doer._sessionID) 150 | 151 | # --- 152 | 153 | try: sModelID = args.pop(0) 154 | except Exception as e: _exit(usage, 1, e) 155 | #try: sDictName = args.pop(0) 156 | #except Exception as e: _exit(usage, 1, e) 157 | try: colId = int(args.pop(0)) 158 | except Exception as e: _exit(usage, 1, e) 159 | # try: docId = int(args.pop(0)) 160 | # except Exception as e: _exit(usage, 1, e) 161 | # try: sPages = args.pop(0) 162 | # except Exception as e: sPages = None 163 | 164 | if args: _exit(usage, 2, Exception("Extra arguments to the command")) 165 | 166 | if options.trp_doc: 167 | trpdoc = json.load(open(options.trp_doc, "r",encoding='utf-8')) 168 | docId,sPageDesc = doer.buildDescription(colId,options.docid,trpdoc) 169 | else: 170 
| docId,sPageDesc = doer.buildDescription(colId,options.docid) 171 | 172 | # do the job... 173 | #jobid = doer.run(sModelID, sDictName, colId, docId, sPageDesc,options.bPylaia,options.dictTemp) 174 | jobid = doer.run(sModelID, colId, docId, sPageDesc,options.bPylaia) 175 | traceln(jobid) 176 | 177 | traceln() 178 | traceln("- Done") 179 | 180 | -------------------------------------------------------------------------------- /src/TranskribusCommands/do_htrRnnPerRegion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | """ 5 | 6 | Hervé Déjean 7 | 8 | Copyright NLE(C) 2017 9 | 10 | This program is free software: you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation, either version 3 of the License, or 13 | (at your option) any later version. 14 | 15 | This program is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with this program. If not, see . 22 | 23 | 24 | Developed for the EU project READ. The READ project has received funding 25 | from the European Union’s Horizon 2020 research and innovation programme 26 | under grant agreement No 674943. 
27 | 28 | """ 29 | 30 | # TranskribusCommands/do_copyDocToCollec.py 3571 3820 8251 8252 31 | 32 | 33 | #optional: useful if you want to choose the logging level to something else than logging.WARN 34 | import sys, os, logging 35 | from optparse import OptionParser 36 | from io import open 37 | import json 38 | 39 | try: #to ease the use without proper Python installation 40 | import TranskribusPyClient_version 41 | except ImportError: 42 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 43 | import TranskribusPyClient_version 44 | 45 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 46 | from TranskribusPyClient.client import TranskribusClient 47 | 48 | from do_transcript import DoTranscript 49 | 50 | from TranskribusPyClient.common.IntegerRange import IntegerRange 51 | from TranskribusPyClient.TRP_FullDoc import TRP_FullDoc 52 | 53 | from TranskribusPyClient.common.trace import traceln, trace 54 | 55 | DEBUG = 0 56 | 57 | description = """Apply an HTR RNN model for a given table column with a specific dictionary. 58 | 59 | """ + _Trnskrbs_description 60 | 61 | usage = """%s [--trp] [--docid] [--colnum] [--dict] [--tempdict] 62 | """%sys.argv[0] 63 | 64 | class DoHtrRnnPerColumn(TranskribusClient): 65 | """ 66 | 10/16/2017: at region level 67 | {"docId":2278,"pageList":{"pages":[{"pageId":10070,"tsId":25143,"regionIds":["r2","r1"]}]}} 68 | 69 | Our client sends it like this: 70 | 71 | 3 > POST 72 | https://transkribus.eu/TrpServerTesting/rest/recognition/2/241/htrCITlab?id=2278 73 | 3 > Accept: text/plain 74 | ... 75 | 3 > Content-Type: application/json 76 | 3 > Cookie: $Version=1;JSESSIONID=.... 
77 | {"docId":2278,"pageList":{"pages":[{"pageId":10070,"tsId":25143,"regionIds":["r2","r1"]}]}} 78 | 79 | 80 | """ 81 | sDefaultServerUrl = _Trnskrbs_default_url 82 | #--- INIT ------------------------------------------------------------------------------------------------------------- 83 | def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN): 84 | TranskribusClient.__init__(self, sServerUrl=self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel) 85 | 86 | self._trpMng = DoTranscript(self.sDefaultServerUrl, sHttpProxy=sHttpProxy, loggingLevel=loggingLevel) 87 | 88 | def run(self, sModelID, sDictName, colId, docId,sDescPages,bDictTemp): 89 | """ 90 | 91 | """ 92 | ret = self.htrRnnDecode(colId, sModelID, sDictName, docId, sDescPages,bDictTemp) 93 | return ret 94 | 95 | def buildDescription(self,colId,docpage,colnum,trp=None): 96 | """ 97 | '{"docId":17442,"pageList":{"pages":[{"pageId":400008,"tsId":1243509,"regionIds":[]}]}}' 98 | """ 99 | # return 17442,'{"docId":17442,"pageList":{"pages":[{"pageId":400008,"tsId":1243509,"regionIds":[]}]}}'.encode('utf-8') 100 | jsonDesc = {} 101 | 102 | if trp is None: 103 | try: docId,pageRange= docpage.split('/') 104 | except ValueError: docId=docpage; pageRange = "" 105 | jsonDesc["docId"]=docId 106 | oPageRange = IntegerRange(pageRange) 107 | trpObj = self._trpMng.filter(colId,docId,page_filter=oPageRange,bLast=True) 108 | else: 109 | trpObj = TRP_FullDoc(trp) 110 | jsonDesc["pageList"]={} 111 | # pList= trpObj.getTranscriptList() 112 | jsonDesc["pageList"]['pages']= [] 113 | for page in trpObj.getPageList(): 114 | ## need to upload the page!!!! 
115 | regionsIDs=[] 116 | docId = page['docId'] 117 | jsonDesc["docId"]=page['docId'] 118 | jsonDesc["pageList"]['pages'].append({"pageId":page['pageId'],"tsId":page['tsList']['transcripts'][0]['tsId'],"regionIds":regionsIDs}) 119 | 120 | 121 | # return jsonDesc["docId"], json.dumps(jsonDesc,encoding='utf-8') 122 | return jsonDesc["docId"], json.dumps(jsonDesc) 123 | 124 | if __name__ == '__main__': 125 | version = "v.01" 126 | 127 | #prepare for the parsing of the command line 128 | parser = OptionParser(usage=usage, version=version) 129 | parser.description = description 130 | 131 | #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS 132 | __Trnskrbs_basic_options(parser, DoHtrRnnPerColumn.sDefaultServerUrl) 133 | 134 | parser.add_option("-r", "--region" , dest='region', action="store", type="string", default=DoHtrRnnPerColumn.sDefaultServerUrl, help="apply HTR at region level") 135 | parser.add_option("--trp" , dest='trp_doc', action="store", type="string",default=None, help="use trp doc file") 136 | parser.add_option("--docid" , dest='docid' , action="store", type="string", default=None, help="document/pages to be htr'd") 137 | parser.add_option("--colnum" , dest='colnum' , action="store", type="string", default=None, help="column to be htr'd") 138 | parser.add_option("--tempdict" , dest='dictTemp' , action="store_true", default=False, help="use tempDict folder") 139 | # --- 140 | #parse the command line 141 | (options, args) = parser.parse_args() 142 | proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy} 143 | 144 | # --- 145 | doer = DoHtrRnnPerColumn(options.server, proxies, loggingLevel=logging.WARN) 146 | __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln) 147 | doer._trpMng.setSessionId(doer._sessionID) 148 | 149 | # --- 150 | 151 | try: sModelID = args.pop(0) 152 | except Exception as e: _exit(usage, 1, e) 153 | try: sDictName = args.pop(0) 154 | except Exception as e: _exit(usage, 1, 
e) 155 | try: colId = int(args.pop(0)) 156 | except Exception as e: _exit(usage, 1, e) 157 | # try: docId = int(args.pop(0)) 158 | # except Exception as e: _exit(usage, 1, e) 159 | # try: sPages = args.pop(0) 160 | # except Exception as e: sPages = None 161 | 162 | if args: _exit(usage, 2, Exception("Extra arguments to the command")) 163 | 164 | if options.trp_doc: 165 | trpdoc = json.load(open(options.trp_doc, "rb",encoding='utf-8')) 166 | docId,sPageDesc = doer.buildDescription(colId,options.docid,options.colnum,trpdoc) 167 | else: 168 | docId,sPageDesc = doer.buildDescription(colId,options.docid,options.colnum) 169 | 170 | # do the job... 171 | jobid = doer.run(sModelID, sDictName, colId, docId,sPageDesc,options.dictTemp) 172 | traceln(jobid) 173 | 174 | traceln() 175 | traceln("- Done") 176 | 177 | -------------------------------------------------------------------------------- /src/TranskribusCommands/do_listCollec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | """ 5 | List the content of a collection 6 | 7 | JL Meunier - Nov 2016 8 | 9 | 10 | Copyright Xerox(C) 2016 H. Déjean, JL. Meunier 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | 26 | Developed for the EU project READ. 
The READ project has received funding 27 | from the European Union’s Horizon 2020 research and innovation programme 28 | under grant agreement No 674943. 29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | 35 | # TranskribusCommands/do_copyDocToCollec.py 3571 3820 8251 8252 36 | 37 | import json 38 | 39 | #optional: useful if you want to choose the logging level to something else than logging.WARN 40 | import sys, os, logging 41 | from optparse import OptionParser 42 | 43 | try: #to ease the use without proper Python installation 44 | import TranskribusPyClient_version 45 | except ImportError: 46 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 47 | import TranskribusPyClient_version 48 | 49 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 50 | from TranskribusPyClient.client import TranskribusClient 51 | from TranskribusPyClient.common.trace import traceln, trace 52 | 53 | DEBUG = 0 54 | 55 | description = """List the content of one or several Transkribus collection. 
56 | """ + _Trnskrbs_description 57 | 58 | usage = """%s + 59 | """%sys.argv[0] 60 | 61 | class DoListCollec(TranskribusClient): 62 | """ 63 | List the content of a collection 64 | """ 65 | sDefaultServerUrl = _Trnskrbs_default_url 66 | #--- INIT ------------------------------------------------------------------------------------------------------------- 67 | def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN): 68 | TranskribusClient.__init__(self, sServerUrl=self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel) 69 | 70 | def run(self, colId, options): 71 | """ 72 | 73 | [{u'collectionList': {u'colList': [{u'colId': 3571, 74 | u'colName': u'READDU', 75 | u'description': u'created by herve.dejean@xrce.xerox.com'}]}, 76 | u'createdFromTimestamp': 33175290, 77 | u'createdToTimestamp': 33175290, 78 | u'docId': 7749, 79 | u'fimgStoreColl': u'TrpDoc_DEA_7749', 80 | u'nrOfPages': 10, 81 | u'scriptType': u'HANDWRITTEN', 82 | u'status': 0, 83 | u'title': u'MM_1_001', 84 | u'uploadTimestamp': 1478161395893L, 85 | u'uploader': u'herve.dejean@xrce.xerox.com', 86 | u'uploaderId': 275}, 87 | {u'collectionList': {u'colList': [{u'colId': 3571, 88 | u'colName': u'READDU', 89 | u'description': u'created by herve.dejean@xrce.xerox.com'}]}, 90 | u'createdFromTimestamp': 0, 91 | u'createdToTimestamp': 0, 92 | u'docId': 7750, 93 | u'fimgStoreColl': u'TrpDoc_DEA_7750', 94 | u'nrOfPages': 10, 95 | u'scriptType': u'HANDWRITTEN', 96 | u'status': 0, 97 | u'title': u'MM_1_005', 98 | u'uploadTimestamp': 1478161451242L, 99 | u'uploader': u'herve.dejean@xrce.xerox.com', 100 | u'uploaderId': 275}] 101 | 102 | """ 103 | bRaw=options.bRaw 104 | data = self.listDocsByCollectionId(colId) 105 | if options.trp: 106 | with open(options.trp, "wt",) as fd: json.dump(data, fd, indent=2) 107 | if bRaw: 108 | while data: 109 | dic = data.pop(0) 110 | print (dic[u'docId']) 111 | else: 112 | if data: 113 | _d = data[0][u'collectionList'][u'colList'][-1] 114 | print( 
"Collection: %s (%s)"%(_d[u'colName'], _d[u'colId'])) 115 | 116 | while data: 117 | dic = data.pop(0) 118 | print (">> (%s) #p=%d '%s' by %s (status=%s)" % (dic[u'docId'], dic[u'nrOfPages'], dic[u'title'], dic[u'uploader'], dic[u'status'])) 119 | else: 120 | print (">> Collection is empty!") 121 | 122 | 123 | 124 | if __name__ == '__main__': 125 | version = "v.01" 126 | 127 | #prepare for the parsing of the command line 128 | parser = OptionParser(usage=usage, version=version) 129 | parser.description = description 130 | 131 | #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS 132 | __Trnskrbs_basic_options(parser, DoListCollec.sDefaultServerUrl) 133 | 134 | parser.add_option("--raw", dest='bRaw', action="store_true", default=False, help="Raw output, one docid per line") 135 | parser.add_option("--trp" , dest='trp' , action="store", type="string", default=None, help="Store the TRP data reflecting the documents in the given file.") 136 | 137 | # --- 138 | #parse the command line 139 | (options, args) = parser.parse_args() 140 | proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy} 141 | 142 | # --- 143 | #source collection(s) 144 | try: 145 | lColId = [ int(arg) for arg in args ] 146 | except Exception as e: 147 | _exit(usage, 1, e) 148 | 149 | # --- 150 | doer = DoListCollec(options.server, proxies, loggingLevel=logging.INFO) 151 | __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln) 152 | # --- 153 | # do the job... 
154 | for colId in lColId: 155 | doer.run(colId, options) 156 | try: 157 | doer.run(colId, options) 158 | except Exception as e: 159 | traceln() 160 | traceln("ERROR: could not list collection '%d' "%colId) 161 | _exit("", 1, e) 162 | if not options.bRaw: 163 | traceln() 164 | traceln("- Done for %d collection(s)"%len(lColId)) 165 | 166 | -------------------------------------------------------------------------------- /src/TranskribusCommands/do_listHtrHmm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | """ 5 | List the HTR Models 6 | 7 | JL Meunier - Dec 2016 8 | 9 | 10 | Copyright Xerox(C) 2016 H. Déjean, JL. Meunier 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | 26 | Developed for the EU project READ. The READ project has received funding 27 | from the European Union’s Horizon 2020 research and innovation programme 28 | under grant agreement No 674943. 
29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | # TranskribusCommands/do_copyDocToCollec.py 3571 3820 8251 8252 35 | 36 | 37 | #optional: useful if you want to choose the logging level to something else than logging.WARN 38 | import sys, os, logging 39 | from optparse import OptionParser 40 | # import json 41 | 42 | try: #to ease the use without proper Python installation 43 | import TranskribusPyClient_version 44 | except ImportError: 45 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 46 | import TranskribusPyClient_version 47 | 48 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, strTabularFormat 49 | from TranskribusPyClient.client import TranskribusClient 50 | from TranskribusPyClient.common.trace import traceln, trace 51 | 52 | DEBUG = 0 53 | 54 | description = """List HTR models available in Transkribus. 
55 | """ + _Trnskrbs_description 56 | 57 | usage = """%s 58 | """%sys.argv[0] 59 | 60 | class DoListHtrModels(TranskribusClient): 61 | sDefaultServerUrl = _Trnskrbs_default_url 62 | #--- INIT ------------------------------------------------------------------------------------------------------------- 63 | def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN): 64 | TranskribusClient.__init__(self, sServerUrl=self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel) 65 | 66 | def run(self): 67 | lDic = self.listHmmHtrModels() 68 | #traceln(json.dumps(data, indent=4)) 69 | traceln( strTabularFormat(lDic, ["modelName", "modelId", "isUsableInTranskribus", "nrOfTokens", "nrOfDictTokens", "nrOfLines"], "modelName") ) 70 | return lDic 71 | 72 | if __name__ == '__main__': 73 | version = "v.01" 74 | 75 | #prepare for the parsing of the command line 76 | parser = OptionParser(usage=usage, version=version) 77 | parser.description = description 78 | 79 | #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS 80 | __Trnskrbs_basic_options(parser, DoListHtrModels.sDefaultServerUrl) 81 | 82 | # --- 83 | #parse the command line 84 | (options, args) = parser.parse_args() 85 | proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy} 86 | 87 | # --- 88 | doer = DoListHtrModels(options.server, proxies, loggingLevel=logging.WARN) 89 | __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln) 90 | 91 | # --- 92 | # do the job... 93 | doer.run() 94 | 95 | traceln() 96 | traceln("- Done") 97 | 98 | -------------------------------------------------------------------------------- /src/TranskribusCommands/do_listHtrRnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | """ 5 | List the HTR RNN Models and Dictionaries 6 | 7 | JL Meunier - Dec 2016 8 | 9 | 10 | Copyright Xerox(C) 2016 H. Déjean, JL. 
Meunier 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | 26 | Developed for the EU project READ. The READ project has received funding 27 | from the European Union’s Horizon 2020 research and innovation programme 28 | under grant agreement No 674943. 29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | 35 | # TranskribusCommands/do_copyDocToCollec.py 3571 3820 8251 8252 36 | 37 | 38 | #optional: useful if you want to choose the logging level to something else than logging.WARN 39 | import sys, os, logging 40 | from optparse import OptionParser 41 | # import json 42 | 43 | try: #to ease the use without proper Python installation 44 | import TranskribusPyClient_version 45 | except ImportError: 46 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 47 | import TranskribusPyClient_version 48 | 49 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit, strTabularFormat 50 | from TranskribusPyClient.client import TranskribusClient 51 | from TranskribusPyClient.common.trace import traceln, trace 52 | 53 | DEBUG = 0 54 | 55 | description = """List HTR RNN models and dictionaries available in Transkribus. 
56 | """ + _Trnskrbs_description 57 | 58 | usage = """%s 59 | """%sys.argv[0] 60 | 61 | class DoListHtrRnn(TranskribusClient): 62 | sDefaultServerUrl = _Trnskrbs_default_url 63 | #--- INIT ------------------------------------------------------------------------------------------------------------- 64 | def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN): 65 | TranskribusClient.__init__(self, sServerUrl=self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel) 66 | 67 | def run(self,colid=None,bListDict=False): 68 | """ 69 | 2 textual lists 70 | """ 71 | sModels=None 72 | sColModels=None 73 | sDicts = None 74 | if colid is not None: 75 | sColModels = self.listRnns(colid) 76 | for models in sColModels: 77 | #print(models.keys()) 78 | #some old? models do not have params field 79 | #try: traceln("%s\t%s\t%s\ndescription:%s" % (models['htrId'],models['name'].strip(),models['params'].strip(),models['description'].strip())) 80 | try: traceln("%s\t%s\t%s\ndescription:%s" % (models['htrId'],models['name'].strip(),models['provider'].strip(),models['description'].strip())) 81 | except KeyError: traceln("%s\t%s\tno params" % (models['htrId'],models['name'])) 82 | traceln() 83 | else: 84 | sModels = self.listRnnsText() 85 | traceln("\n--- Models ---------------------------") 86 | traceln(sModels) 87 | 88 | if bListDict: 89 | sDicts = self.listDictsText() 90 | traceln("\n--- Dictionaries ---------------------") 91 | traceln(sDicts) 92 | 93 | return sModels, sColModels, sDicts 94 | 95 | if __name__ == '__main__': 96 | version = "v.01" 97 | 98 | #prepare for the parsing of the command line 99 | parser = OptionParser(usage=usage, version=version) 100 | parser.description = description 101 | parser.add_option("--colid", dest='colid', type='string', default=None, help = 'get models linked to the colid') 102 | parser.add_option("--dict", dest='dict', action='store_true', default=False, help = 'get dictionaries') 103 | 104 | #"-s", "--server", 
"-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS 105 | __Trnskrbs_basic_options(parser, DoListHtrRnn.sDefaultServerUrl) 106 | 107 | # --- 108 | #parse the command line 109 | (options, args) = parser.parse_args() 110 | proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy} 111 | # --- 112 | doer = DoListHtrRnn(options.server, proxies, loggingLevel=logging.WARN) 113 | __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln) 114 | 115 | # --- 116 | # do the job... 117 | doer.run(options.colid,options.dict) 118 | 119 | traceln() 120 | traceln("- Done") 121 | 122 | -------------------------------------------------------------------------------- /src/TranskribusCommands/do_listPageLocks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | """ 5 | list the locks for a colid/docid/page 6 | 7 | H. Déjean - Nov 2016 8 | 9 | 10 | Copyright Xerox(C) 2016 H. Déjean, JL. Meunier 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | 26 | Developed for the EU project READ. The READ project has received funding 27 | from the European Union’s Horizon 2020 research and innovation programme 28 | under grant agreement No 674943. 
29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | # TranskribusCommands/do_ListPageLocks.py 35 | 36 | 37 | #optional: useful if you want to choose the logging level to something else than logging.WARN 38 | import sys, os, logging 39 | from optparse import OptionParser 40 | 41 | try: #to ease the use without proper Python installation 42 | import TranskribusPyClient_version 43 | except ImportError: 44 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 45 | import TranskribusPyClient_version 46 | 47 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 48 | from TranskribusPyClient.client import TranskribusClient 49 | from TranskribusPyClient.common.trace import traceln, trace 50 | 51 | DEBUG = 0 52 | 53 | description = """list the locked pages. 54 | """ + _Trnskrbs_description 55 | 56 | usage = """%s 57 | """%sys.argv[0] 58 | 59 | class listPageLocks(TranskribusClient): 60 | 61 | sDefaultServerUrl = _Trnskrbs_default_url 62 | 63 | #--- INIT ------------------------------------------------------------------------------------------------------------- 64 | def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN): 65 | TranskribusClient.__init__(self, sServerUrl=self.sDefaultServerUrl, proxies=sHttpProxy, loggingLevel=loggingLevel) 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | version = "v.01" 71 | 72 | #prepare for the parsing of the command line 73 | parser = OptionParser(usage=usage, version=version) 74 | parser.description = description 75 | 76 | #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS 77 | __Trnskrbs_basic_options(parser, listPageLocks.sDefaultServerUrl) 78 | 79 | # --- 80 | #parse the command line 81 | (options, args) = parser.parse_args() 82 | proxies = {} if not options.https_proxy 
else {'https_proxy':options.https_proxy} 83 | 84 | # --- 85 | #source collection(s) 86 | try: 87 | colid = int(args[0]) 88 | except Exception as e: 89 | _exit(usage, 1, e) 90 | try: 91 | docid = int(args[0]) 92 | except Exception as e: 93 | _exit(usage, 1, e) 94 | try: 95 | page = int(args[0]) 96 | except Exception as e: 97 | _exit(usage, 1, e) 98 | 99 | # --- 100 | doer = listPageLocks(options.server, proxies, loggingLevel=logging.INFO) 101 | __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln) 102 | 103 | # --- 104 | # do the job... 105 | try: 106 | resp = doer.getListofLockedPages(colid, docid, page) 107 | except Exception as e: _exit("", 1, e) 108 | traceln(resp) 109 | traceln("- Done") 110 | 111 | -------------------------------------------------------------------------------- /src/TranskribusCommands/do_login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | """ 5 | Utility to login into Transkribus and store the sessionId in a secure way for next commands 6 | 7 | JL Meunier - Nov 2016 8 | 9 | 10 | Copyright Xerox(C) 2016 H. Déjean, JL. Meunier 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | 26 | Developed for the EU project READ. 
The READ project has received funding 27 | from the European Union’s Horizon 2020 research and innovation programme 28 | under grant agreement No 674943. 29 | 30 | """ 31 | from __future__ import absolute_import 32 | from __future__ import print_function 33 | from __future__ import unicode_literals 34 | #optional: useful if you want to choose the logging level to something else than logging.WARN 35 | import sys, os, logging 36 | from optparse import OptionParser 37 | 38 | try: #to ease the use without proper Python installation 39 | import TranskribusPyClient_version 40 | except ImportError: 41 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 42 | import TranskribusPyClient_version 43 | 44 | from TranskribusCommands import _Trnskrbs_default_url, __Trnskrbs_basic_options, _Trnskrbs_description, _exit 45 | from TranskribusPyClient.client import TranskribusClient, getStoredCredentials 46 | from TranskribusPyClient.common.trace import traceln, trace 47 | 48 | DEBUG = 0 49 | 50 | description = """Login into Transkribus to avoid the need for login in next commands (until the session expires). 
class DoLogin(TranskribusClient):
    """
    Log into Transkribus, optionally making the session persistent so that
    the next commands do not need to log in again.
    """
    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl: URL of the Transkribus server; the class default
                         (sDefaultServerUrl) is used when it is empty or None.
        sHttpProxy:      passed to TranskribusClient as its `proxies` argument.
        loggingLevel:    passed to TranskribusClient unchanged.
        """
        # The URL received from the command line used to be dropped, so the
        # -s/--server option had no effect.
        TranskribusClient.__init__(self,
                                   sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl,
                                   proxies=sHttpProxy,
                                   loggingLevel=loggingLevel)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS
    __Trnskrbs_basic_options(parser, DoLogin.sDefaultServerUrl)

    #parse the command line
    (options, args) = parser.parse_args()

    # ---
    #credentials and proxy
    proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy}

    if options.login:
        login, pwd = options.login, options.pwd
    else:
        trace("- no login provided, looking for stored credentials... ")
        login, pwd = getStoredCredentials(bAsk=False)
        traceln("OK")

    # ------------------------------------------------------------------------------------------------

    doer = DoLogin(options.server, proxies, loggingLevel=logging.INFO)

    try:
        if options.persist:
            traceln("- Logging onto Transkribus as %s and making a persistent session"%login)
            # drop any previous persistent session before creating the new one
            doer.cleanPersistentSession()
            doer.auth_login(login, pwd, bPersist=options.persist)
            traceln("\t --> %s"%os.path.join(DoLogin._sSESSION_FOLDER, DoLogin._sSESSION_FILENAME))
        else:
            # no persistence requested: this only checks that the credentials work
            trace("- Checking Transkribus login as %s "%login)
            doer.auth_login(login, pwd, bPersist=options.persist)
            traceln(" OK!")
    except Exception as e:
        _exit("", 1, e)

    traceln("- Done")
class DoLogout(TranskribusClient):
    """
    Log out from Transkribus and remove any persistent session from disk.
    """
    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl: URL of the Transkribus server; the class default
                         (sDefaultServerUrl) is used when it is empty or None.
        sHttpProxy:      passed to TranskribusClient as its `proxies` argument.
        loggingLevel:    passed to TranskribusClient unchanged.
        """
        # The URL received from the command line used to be dropped, so the
        # -s/--server option had no effect.
        TranskribusClient.__init__(self,
                                   sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl,
                                   proxies=sHttpProxy,
                                   loggingLevel=loggingLevel)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS
    __Trnskrbs_basic_options(parser, DoLogout.sDefaultServerUrl)

    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy}
    # ------------------------------------------------------------------------------------------------
    doer = DoLogout(options.server, proxies, loggingLevel=logging.INFO)
    try:
        __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)
    except (Exception, SystemExit):
        # Best effort: a failed login must not prevent the clean-up below.
        # NOTE(review): SystemExit is still swallowed because the login helper
        # presumably terminates through _exit()/sys.exit() on failure - confirm.
        # KeyboardInterrupt is no longer swallowed.
        pass

    try:
        traceln('- cleaning any persistent session.')
        doer.auth_logout()
    except Exception as e:
        # still best effort (exit status unchanged), but say what went wrong
        traceln("- logout failed: %s"%e)

    traceln("- Done" )
15 | 16 | This program is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU General Public License for more details. 20 | 21 | You should have received a copy of the GNU General Public License 22 | along with this program. If not, see . 23 | 24 | 25 | Developed for the EU project READ. The READ project has received funding 26 | from the European Union’s Horizon 2020 research and innovation programme 27 | under grant agreement No 674943. 28 | 29 | 30 | see https://transkribus.eu/wiki/index.php/HTR 31 | """ 32 | from __future__ import absolute_import 33 | from __future__ import print_function 34 | from __future__ import unicode_literals 35 | 36 | # TranskribusCommands/do_htrTrainRnn model-name colId docid pages 37 | 38 | 39 | #optional: useful if you want to choose the logging level to something else than logging.WARN 40 | import sys, os, logging 41 | from optparse import OptionParser 42 | import json 43 | from lxml import etree 44 | 45 | try: #to ease the use without proper Python installation 46 | import TranskribusPyClient_version 47 | except ImportError: 48 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 49 | import TranskribusPyClient_version 50 | 51 | from TranskribusCommands import __Trnskrbs_basic_options, _Trnskrbs_description, __Trnskrbs_do_login_stuff, _exit 52 | 53 | from do_analyzeLayout import DoLAbatch 54 | from TranskribusPyClient.common.IntegerRange import IntegerRange 55 | from TranskribusPyClient.TRP_FullDoc import TRP_FullDoc 56 | from TranskribusPyClient.common.trace import traceln, trace 57 | 58 | 59 | DEBUG = 0 60 | 61 | description = """Apply a table template to a list of pages 62 | 63 | The syntax for specifying the page range is: 64 | - one or several specifiers separated by a comma 65 | - one separator is a page number, or a range of page number, e.g. 
class DoTableTemplate(DoLAbatch):
    """
    Apply a table template to a list of pages (job implementation given by the caller).
    """

    def run(self, templateID, colId, sDescription, sJobImpl):
        """
        Start the table matching job.

        sDescription is the XML job description built by jsonToXMLDescription.
        Return (ret, jobid): what tableMatching returned, and the job id(s)
        extracted from it by getJobIDsFromXMLStatuses.
        """
        ret = self.tableMatching(templateID, colId, sDescription, sJobImpl)
        jobid = self.getJobIDsFromXMLStatuses(ret)
        return ret, jobid

    def jsonToXMLDescription(self, jsonDesc):
        """
        Convert the JSON description produced by buildDescription into XML.

        jsonDesc is a JSON string with the keys "docId", "template" and
        "pageList" -> "pages" -> list of {"pageId", "tsId", ...}.

        The generated document has this structure:

            <jobParameters>
              <jobParameters/>
              <docList>
                <docs>
                  <docId>...</docId>
                  <pageList>
                    <pages><pageId>...</pageId><tsId>...</tsId></pages>
                    ...
                  </pageList>
                </docs>
              </docList>
              <params>
                <entry><key>templateId</key><value>...</value></entry>
              </params>
            </jobParameters>

        Return the serialized XML (utf-8 encoded, pretty-printed).
        """
        dDesc = json.loads(jsonDesc)

        root = etree.Element("jobParameters")
        xmldesc = etree.ElementTree(root)
        # NOTE(review): the original code nests an empty <jobParameters> in
        # the root; kept as is, presumably expected by the server - confirm.
        etree.SubElement(root, "jobParameters")

        docList = etree.SubElement(root, "docList")
        docs = etree.SubElement(docList, "docs")

        # docId
        etree.SubElement(docs, "docId").text = str(dDesc["docId"])

        # pageList: one <pages> element per page (region ids are not exported)
        nodelp = etree.SubElement(docs, "pageList")
        for page in dDesc["pageList"]['pages']:
            nodep = etree.SubElement(nodelp, "pages")
            etree.SubElement(nodep, "pageId").text = str(page['pageId'])
            etree.SubElement(nodep, "tsId").text = str(page['tsId'])

        # params: the template id, as a key/value entry
        params = etree.SubElement(root, 'params')
        entry = etree.SubElement(params, 'entry')
        etree.SubElement(entry, 'key').text = 'templateId'
        etree.SubElement(entry, 'value').text = str(dDesc['template'])

        return etree.tostring(xmldesc, encoding='utf-8', pretty_print=True)

    def buildDescription(self, colId, docpage, templateId, trp=None):
        """
        Build the JSON description of the pages to be processed.

        colId:      collection id
        docpage:    "docId" or "docId/page-range"; only used when trp is None
        templateId: id of the table template
        trp:        optional, already loaded TRP document (JSON data); when
                    given, the page list is read from it instead of being
                    obtained through self._trpMng.filter

        For each page, the tsId of the first transcript of its list is used.
        Return (docId, JSON string); the JSON holds the keys "docId",
        "template" and "pageList".
        """
        jsonDesc = {}

        if trp is None:
            try:
                docId, pageRange = docpage.split('/')
            except ValueError:
                # no page range given (or too many '/')
                docId = docpage
                pageRange = ""
            jsonDesc["docId"] = docId
            oPageRange = IntegerRange(pageRange)
            trpObj = self._trpMng.filter(colId, docId, page_filter=oPageRange, bLast=True)
        else:
            trpObj = TRP_FullDoc(trp)

        jsonDesc["pageList"] = {'pages': []}
        jsonDesc['template'] = str(templateId)
        for page in trpObj.getPageList():
            # the docId stored in the page data takes over the one from docpage
            jsonDesc["docId"] = page['docId']
            jsonDesc["pageList"]['pages'].append({"pageId": page['pageId'],
                                                  "tsId": page['tsList']['transcripts'][0]['tsId'],
                                                  "regionIds": []})

        return jsonDesc["docId"], json.dumps(jsonDesc)


if __name__ == '__main__':
    version = "v.01"
    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS
    __Trnskrbs_basic_options(parser, DoTableTemplate.sDefaultServerUrl)

    parser.add_option("--trp" , dest='trp_doc', action="store", type="string",default=None, help="use trp doc file")
    parser.add_option("--templateID" , dest='templateID' , action="store", type="string" , help="template id")

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy}

    # ---
    doer = DoTableTemplate(options.server, proxies, loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)
    doer._trpMng.setSessionId(doer._sessionID)

    # ---
    try: colId = int(args.pop(0))
    except Exception as e: _exit(usage, 1, e)
    try: docidpages = args.pop(0)
    except Exception as e: _exit(usage, 1, e)
    if args: _exit(usage, 2, Exception("Extra arguments to the command"))

    # ---
    # do the job...
    if options.trp_doc:
        # io.open accepts `encoding` on both Python 2 and 3 (the builtin open
        # of Python 2 does not); the file is now closed after loading.
        import io
        with io.open(options.trp_doc, "r", encoding='utf-8') as fd:
            trpdoc = json.load(fd)
        docId, sPageDesc = doer.buildDescription(colId, docidpages, options.templateID, trpdoc)
    else:
        docId, sPageDesc = doer.buildDescription(colId, docidpages, options.templateID)
    sPageDesc = doer.jsonToXMLDescription(sPageDesc)

    # example:  do_tableTemplate.py --temp 6078228 23017 87023/14

    # jobImpl = CvlTableJob
    status, jobid = doer.run(options.templateID, colId, sPageDesc, "CvlTableJob")
    traceln("job ID:", jobid)
    traceln("- Done")
class DoHtrRnn(TranskribusClient):
    """
    Upload a temporary (private) dictionary to be used for HTR.

    Notes from the server developers:
    - the response contains the dictionary filename to be used in the
      "tempDict" parameter of the HTR request; ".dict" is appended to the
      given name if it does not have that extension.
    - the body of the POST request is the dictionary data, as a UTF-8 string.
    - temporary dictionaries are bound to the user account: the uploaded
      files are in the "dictTmp" folder of the user's FTP space on
      transkribus.eu. Dictionaries put there via FTP can also be used with
      the tempDict parameter.

    Example of request:
        POST /TrpServerTesting/rest/recognition/tempDict?fileName=test.dict HTTP/1.1
        Host: transkribus.eu
        Content-Type: text/plain
        Cache-Control: no-cache

        er,124
        ...
    """
    sDefaultServerUrl = _Trnskrbs_default_url

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self, trnkbsServerUrl, sHttpProxy=None, loggingLevel=logging.WARN):
        """
        trnkbsServerUrl: URL of the Transkribus server; the class default
                         (sDefaultServerUrl) is used when it is empty or None.
        sHttpProxy:      passed to TranskribusClient as its `proxies` argument.
        loggingLevel:    passed to TranskribusClient unchanged.
        """
        # The URL received from the command line used to be dropped, so the
        # -s/--server option had no effect.
        TranskribusClient.__init__(self,
                                   sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl,
                                   proxies=sHttpProxy,
                                   loggingLevel=loggingLevel)

    def run(self, dictName, dictString):
        """Upload dictString under the name dictName; return what uploadDict returns."""
        return self.uploadDict(dictName, dictString)


if __name__ == '__main__':
    version = "v.01"

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS
    __Trnskrbs_basic_options(parser, DoHtrRnn.sDefaultServerUrl)

    parser.add_option("-d", "--dict" , dest='ldict', action="append", type="string", help="list of dictionaries")

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy}

    # ---
    doer = DoHtrRnn(options.server, proxies, loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)
    # ---
    try: dictName = args.pop(0)
    except Exception as e: _exit(usage, 1, e)

    # options.ldict is None when no -d option was given
    if not options.ldict:
        _exit(usage, 1, Exception("No dictionary file given (-d)"))

    # concatenate the dictionary files, using ',' instead of tab as separator
    lsDict = []
    for filename in options.ldict:
        try:
            with open(filename, 'r', encoding='utf-8') as fd:
                lsDict.append(fd.read().replace('\t', ','))
        except IOError as e:
            # do not upload a partial dictionary
            print('not possible to open file :%s'%(filename))
            _exit("", 1, e)
        traceln( "loaded %s"%(filename))
    sfullDict = "".join(lsDict)

    # need to normalize the weights when build this different dictionaries???
    response = doer.run(dictName, sfullDict)
    traceln(response)

    traceln()
    traceln("- Done")
class DocPageSet:
    '''
    The list of pages of interest of a document.

    Textual form:   <docID>=<page-range-set>
    a page-range-set takes the form:  <pageRange>[,<pageRange>]*
    with pageRange taking the form:   N|N-N   (inclusive bounds)
    For instance: 111=1  or  222=1-10  or  333=1,10-20,23,40-50

    NOTE: ranges must be in increasing order and must not overlap!!!
    '''
    def __init__(self, sSpec):
        """
        Parse sSpec, e.g. "333=1,10-20,23".
        Raise ValueError if the specification is malformed, if a range is
        reversed, if ranges are unordered or overlapping, or if the docID is
        empty.
        """
        try:
            sDocID, sPageRangeSet = sSpec.strip().split('=')
        except ValueError:
            # no '=' at all, or more than one
            raise ValueError("Malformed range: '%s'"%sSpec)

        self.sDocID = sDocID
        self._ltiRange = []    # list of (first, last) page numbers, inclusive
        prev_b = None          # upper bound of the previous range, if any
        for sPageRange in sPageRangeSet.split(","):
            lsN = sPageRange.split('-')
            if len(lsN) == 1:
                a = b = int(lsN[0])
            elif len(lsN) == 2:
                a, b = int(lsN[0]), int(lsN[1])
            else:
                raise ValueError("invalid range: '%s'"%sPageRange)
            if not(a<=b): raise ValueError("Invalid range: '%s'"%sPageRange)
            # None cannot be compared to an int in Python 3: test it explicitly
            if prev_b is not None and prev_b >= a:
                raise ValueError("unordered or overlapping ranges: '%d' >= '%d' '%s'"%(prev_b, a, sSpec))
            self._ltiRange.append( (a,b) ) #222=1-10
            prev_b = b
        if not self.sDocID: raise ValueError("missing docID: '%s'"%sSpec)
        if not self._ltiRange: raise ValueError("empty range: '%s'"%sSpec)

    # -----
    def getDocID(self, bSkipPath=False):
        """Return the docID, reduced to its last path component if bSkipPath is true."""
        if bSkipPath:
            return os.path.basename(self.sDocID)
        else:
            return self.sDocID

    def getRangeString(self):
        """Return the page ranges in textual form, e.g. "1,10-20,23"."""
        return ",".join( "%d-%d"%(a,b) if a != b else "%d"%a for (a,b) in self._ltiRange )

    def iterPageNumber(self):
        """
        Iterator returning each page number in turn
        """
        # A generator simply ends when the function returns: an explicit
        # 'raise StopIteration' is turned into a RuntimeError by PEP 479.
        for a,b in self._ltiRange:
            for n in range(a,b+1):
                yield n

    # -----
    def __str__(self):
        return "%s=%s"%(self.sDocID, self.getRangeString())


def testDocPageSet():
    import pytest

    for s in ["111=1", "222=1-10", "333=1,10-20,23,40-50"]:
        assert str(DocPageSet(s)) == s, s

    o = DocPageSet("111=1")
    assert o.getDocID() == "111"
    assert [i for i in o.iterPageNumber()] == [1]

    o = DocPageSet("a/b/c/111=1")
    assert o.getDocID() == "a/b/c/111"
    assert o.getDocID(True) == "111"
    assert [i for i in o.iterPageNumber()] == [1]

    o = DocPageSet("333=1,10-20,23,40-50")
    assert o.getDocID() == "333"
    # range() is not a list in Python 3
    assert [i for i in o.iterPageNumber()] == [1]+list(range(10,21))+[23]+list(range(40,51))

    with pytest.raises(ValueError): DocPageSet("aaa")
    with pytest.raises(ValueError): DocPageSet("aaa=")
    with pytest.raises(ValueError): DocPageSet("=1")
    with pytest.raises(ValueError): DocPageSet("=1-2")
    with pytest.raises(ValueError): DocPageSet("aaa=12=12")
    with pytest.raises(ValueError): DocPageSet("aaa=22-11")
    with pytest.raises(ValueError): DocPageSet("aaa=-11")
    with pytest.raises(ValueError): DocPageSet("aaa=-11-")
    with pytest.raises(ValueError): DocPageSet("aaa=-11-12")
    with pytest.raises(ValueError): DocPageSet("aaa=333=1,10-20,3,40-50")
    # unordered or overlapping ranges
    with pytest.raises(ValueError): DocPageSet("aaa=1,10-20,3,40-50")
    with pytest.raises(ValueError): DocPageSet("aaa=1-5,5-7")
class PageXmlExtractor:
    '''
    Utility to extract several pages from several documents to a folder or to a MultiPageXml file
    '''
    sColDir = 'col'

    @classmethod
    def getFilename(cls, sDocID, name):
        """Return the path of the file `name` located in the document folder sDocID."""
        return os.path.join(sDocID, name)

    @classmethod
    def extractPagesToDir(cls, lDocPageSet, sToDir):
        """
        extract the pages from the given list of PageSet and store them in the given folder.
        (typically to be packaged as a MultiPageXml using PageXml.py)
        The folder is created if needed; if it exists, it must be empty (ValueError otherwise).
        An "origin.json" file describing the origin of each page is written in the folder.
        return the number of copied files, and list of tuple (pnum, orig-docID, orig-pnum, orig-filename)
        """
        if os.path.isdir(sToDir):
            if os.listdir(sToDir): raise ValueError("Target folder (%s) must be empty."%sToDir)
        else:
            print(" - creating directory ", sToDir)
            os.mkdir(sToDir)

        jsonOriginFilename = os.path.join(sToDir, "origin.json")
        nbPage, ltOrigin = cls.getOriginTuple(lDocPageSet, jsonOriginFilename)

        print( " - total number of pages = %d"%nbPage)

        # zero-padded numbering, so that the name order is the page order.
        # (len(str(n)) also works for 0 pages, unlike log10)
        sFmt = "%%0%dd.pxml" % len(str(nbPage)) #e.g. %03d.pxml

        for (pnum, _docID, _n, sFilename) in ltOrigin:
            sToFilename = os.path.join(sToDir, sFmt%pnum)
            print(" copying %s --> %s"%(sFilename, sToFilename))
            shutil.copy(sFilename, sToFilename)

        return nbPage, ltOrigin

    @classmethod
    def extractPagesToFile(cls, lDocPageSet, sToFile, bIndent=True):
        """
        extract the pages from the given list of PageSet and store them in a MultiPageXml file
        A "<basename>_origin.json" file describing the origin of each page is written next to it.
        return the number of extracted pages, and list of tuple (pnum, orig-docID, orig-pnum, orig-filename)
        """
        sBaseName, _ = os.path.splitext(sToFile)
        jsonOriginFilename = sBaseName + "_origin.json"
        nbPage, ltOrigin = cls.getOriginTuple(lDocPageSet, jsonOriginFilename)

        print( " - total number of pages = %d"%nbPage)

        print( " Generating %s"%(sToFile))
        doc = PageXml.MultiPageXml.makeMultiPageXml([sFilename for (_pnum, _docID, _n, sFilename) in ltOrigin] )
        doc.write(sToFile, xml_declaration='UTF-8',encoding="utf-8", pretty_print=bIndent)

        return nbPage, ltOrigin

    @classmethod
    def getOriginTuple(cls, lDocPageSet, jsonOriginFilename=None):
        """
        prepare for extracting the pages from the given list of PageSet
        If jsonOriginFilename is given, the list of tuples is also stored in that file, in JSON.
        Raise IndexError if a page number is not in 1..number of pages of its document.
        return the number of files, and list of tuple (pnum, orig-docID, orig-pnum, orig-filename)
        """
        # at file level, sys is only imported when the module runs as a script
        import sys

        ltOrigin = list()
        for o in lDocPageSet:
            print( " - Processing doc %s, pages %s"%(o.getDocID(), o.getRangeString()))
            lsFilename = cls.getPageFilenameList(o.getDocID(), ".pxml")
            for n in o.iterPageNumber():
                # page numbers start at 1; without this check page 0 would
                # silently select the last page (lsFilename[-1])
                if not (1 <= n <= len(lsFilename)):
                    raise IndexError("page %d out of range for doc %s (%d pages)"%(n, o.getDocID(), len(lsFilename)))
                ltOrigin.append( (len(ltOrigin)+1, o.getDocID(True), n, lsFilename[n-1]) ) # new-PNum, docID, orig-PNum, orig-filename
        cnt = len(ltOrigin)

        if jsonOriginFilename:
            if sys.version_info[0] >= 3:
                # json.dump writes text in Python 3 (and a binary mode
                # does not accept an encoding)
                fd = open(jsonOriginFilename, "w", encoding='utf-8')
            else:
                fd = open(jsonOriginFilename, "wb")
            with fd:
                json.dump(ltOrigin, fd, indent=1)

            print( " (see %s)"%(jsonOriginFilename))

        return cnt, ltOrigin

    @classmethod
    def getPageFilenameList(cls, sDocID, sExt):
        """
        Return the list of page filenames of the document stored in folder sDocID, in the
        order of the pages listed in its trp.json file. Each name is the image filename of
        the page with its extension replaced by sExt (which must start with a dot).
        Raise ValueError if there is no trp.json file in the folder.
        """
        assert sExt.startswith('.')

        #Look in trp.json file
        trpFile = os.path.join(sDocID, 'trp.json')
        if not( os.path.exists(trpFile)): raise ValueError("Non-existing trp.json file %s" % trpFile)
        # the encoding must be passed by keyword: the 3rd positional parameter of open is the buffering
        with open(trpFile, "r", encoding='utf-8') as fd:
            jTrp = json.load(fd)

        lsFilename = []
        for i, page in enumerate(jTrp['pageList']['pages']):
            sBaseName, _ = os.path.splitext(page['imgFileName'])
            lsFilename.append( cls.getFilename(sDocID, sBaseName + sExt) )
            if page['pageNr'] != i+1: print( "\tWarning: expected page number %d , got %s"%(i+1, page['pageNr']))

        return lsFilename


if __name__ == "__main__":

    import sys, optparse
    usage = """
%s [--file filename] [--dir dirname] [<docID>=<page-range-set>]+

Utility to extract a set of PageXml files from a set of documents and either:
- store them into a target folder with simple numbering, with unambiguous order.
- generate a MultiPageXMl document. In case of empty filename or "-", the filename is automatically composed from the arguments.

a page-range-set takes the form: <pageRange>[,<pageRange>]*
with pageRange taking the form: N|N-N
Page ranges must be ordered, per document.
For instance: 111=1 222=1-10 333=1,10-20,23,40-50

JL Meunier - Aug. 2017
""" % sys.argv[0]

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--dir" , dest='dir' , action="store", type="string", help="Store the extracted PageXml pages into the specified directory.")
    parser.add_option("--file", dest='file', action="store", type="string", help="Store the extracted PageXml pages into the specified MultiPageXml document.")

    (options, args) = parser.parse_args()

    if args:
        lsDocPageSet = args
    else:
        parser.print_help()
        parser.exit(1, "")

    print("Parsing range(s)")
    lDocPageSet = [DocPageSet(s) for s in lsDocPageSet]

    if options.dir:
        print( "Extracting into folder: ", options.dir)
        PageXmlExtractor.extractPagesToDir(lDocPageSet, options.dir)

    if options.file is not None:
        if options.file in ["", "-"]: options.file = "extraction_" + "_".join(map(str, lDocPageSet)) #automatic filename
        sToFile = options.file if options.file.lower().endswith(".mpxml") else options.file+".mpxml" #automatic .mpxml extension
        print( "Extracting into file: ", sToFile)
        PageXmlExtractor.extractPagesToFile(lDocPageSet, sToFile)

    print( "DONE")
if __name__ == "__main__":

    usage = """
%s dir docid
Utility to create a set of pageXml XML files from a mpxml file.
""" % sys.argv[0]

    parser = optparse.OptionParser(usage=usage)

    parser.add_option("--format", dest='bIndent', action="store_true" , help="reformat/reindent the input")
    parser.add_option("--dir", dest='destdir', action="store", default='pxml' , help="directory ouptut")
    (options, args) = parser.parse_args()

    # two positional arguments: the collection folder and the document id
    try:
        sDir = args[0]      # (was named 'dir', shadowing the builtin)
        docid = args[1]
    except IndexError:
        parser.print_help()
        parser.exit(1, "")

    # input:  <dir>/col/<docid>.mpxml
    sDocFilename = os.path.join(sDir, "col", "%s.mpxml" % docid)

    doc = etree.parse(sDocFilename)

    # output: <dir>/<destdir>/<docid>_<page number>.pxml , one file per page
    sOutDir = os.path.join(sDir, options.destdir)
    if not os.path.isdir(sOutDir):
        # writing the pages fails if the output folder does not exist
        os.makedirs(sOutDir)

    for pnum, pageDoc in PageXml.MultiPageXml._iter_splitMultiPageXml(doc, bInPlace=False):
        outfilename = os.path.join(sOutDir, "%s_%03d.pxml" % (docid, pnum))
        print(outfilename)
        pageDoc.write(outfilename, xml_declaration ='UTF-8',encoding="utf-8", pretty_print = bool(options.bIndent))
    print ("DONE")
""" 3 | test DS2PageXml convertor 4 | @author:déjean 5 | """ 6 | import os.path 7 | from xml_formats.DS2PageXml import DS2PageXMLConvertor 8 | from xml_formats.PageXml import MultiPageXml 9 | 10 | sTESTS_DIR = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | def test_DS2PageXmlConversion(): 13 | filename = os.path.join(sTESTS_DIR, 14 | 'testDS2PageXml/RRB_MM_01_033_Jahr_1810.ds.xml') 15 | conv= DS2PageXMLConvertor() 16 | conv.inputFileName = filename 17 | doc = conv.loadDom(filename) 18 | lPageXmlDocs = conv.run(doc) 19 | mp = MultiPageXml() 20 | # newDoc = mp.makeMultiPageXmlMemory(map(lambda (x,y):x,lPageXmlDocs)) 21 | newDoc = mp.makeMultiPageXmlMemory([x for x,_y in lPageXmlDocs]) 22 | newDoc.write(os.path.join(sTESTS_DIR, 23 | "testDS2PageXml/RRB_MM_01_033_Jahr_1810.mpxml"), 24 | xml_declaration=True, 25 | encoding="UTF-8", 26 | pretty_print=True) 27 | 28 | 29 | # res= conv.storePageXmlSetofFiles(lPageXmlDocs) 30 | # print 'test:', True if res == 0 else False 31 | 32 | if __name__ == "__main__": 33 | # test_setMetadata() 34 | test_DS2PageXmlConversion() -------------------------------------------------------------------------------- /src/TranskribusDU/xml_formats/tests/test_PageXml.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ''' 4 | Created on 23 Nov 2016 5 | 6 | @author: meunier 7 | ''' 8 | import pytest 9 | from lxml import etree 10 | from io import BytesIO 11 | 12 | from xml_formats.PageXml import PageXml, PageXmlException 13 | 14 | 15 | def test_custom(): 16 | assert PageXml.parseCustomAttr("") == {} 17 | assert PageXml.parseCustomAttr(" ") == {} 18 | assert PageXml.parseCustomAttr(" ") == {} 19 | 20 | assert PageXml.parseCustomAttr("a {x:1;}") == { 'a': { 'x':'1' } } 21 | assert PageXml.parseCustomAttr(" a {x:1;}") == { 'a': { 'x':'1' } } 22 | assert PageXml.parseCustomAttr("a {x:1;} ") == { 'a': { 'x':'1' } } 23 | assert PageXml.parseCustomAttr(" a {x:1;} ") == { 'a': { 
'x':'1' } } 24 | assert PageXml.parseCustomAttr("a {x:1 ;}") == { 'a': { 'x':'1' } } 25 | assert PageXml.parseCustomAttr("a {x:1 ; }") == { 'a': { 'x':'1' } } 26 | assert PageXml.parseCustomAttr("a { x:1 ; }") == { 'a': { 'x':'1' } } 27 | 28 | assert PageXml.parseCustomAttr("a{x:1;}") == { 'a': { 'x':'1' } } 29 | assert PageXml.parseCustomAttr("a{x:1 ;}") == { 'a': { 'x':'1' } } 30 | assert PageXml.parseCustomAttr("a{x:1 ; }") == { 'a': { 'x':'1' } } 31 | assert PageXml.parseCustomAttr("a{ x:1 ; }") == { 'a': { 'x':'1' } } 32 | 33 | assert PageXml.parseCustomAttr("a,b{x:1;}") == { 'a': { 'x':'1' }, 'b': { 'x':'1' } } 34 | assert PageXml.parseCustomAttr("a, b{x:1 ;}") == { 'a': { 'x':'1' }, 'b': { 'x':'1' } } 35 | assert PageXml.parseCustomAttr("a , b{x:1 ; }") == { 'a': { 'x':'1' }, 'b': { 'x':'1' } } 36 | assert PageXml.parseCustomAttr("a ,b{ x:1 ; }") == { 'a': { 'x':'1' }, 'b': { 'x':'1' } } 37 | assert PageXml.parseCustomAttr("a ,b { x:1 ; }") == { 'a': { 'x':'1' }, 'b': { 'x':'1' } } 38 | 39 | assert PageXml.parseCustomAttr("a { x:1 ; y:2 }") == { 'a': { 'x':'1', 'y':'2'} } 40 | assert PageXml.parseCustomAttr("a,b { x:1 ; y:2 }") == { 'a': { 'x':'1', 'y':'2'}, 'b': { 'x':'1', 'y':'2'} } 41 | 42 | assert PageXml.parseCustomAttr("a {}") == { 'a': { } } 43 | 44 | assert PageXml.parseCustomAttr("readingOrder {index:4;} structure {type:catch-word;}") == { 'readingOrder': { 'index':'4' }, 'structure':{'type':'catch-word'} } 45 | 46 | def test_malformed_custom(): 47 | with pytest.raises(ValueError): PageXml.parseCustomAttr("a {x1;}") 48 | with pytest.raises(ValueError): PageXml.parseCustomAttr("a x1;}") 49 | with pytest.raises(ValueError): PageXml.parseCustomAttr("a { x1;") 50 | with pytest.raises(ValueError): PageXml.parseCustomAttr("a { x1 }") 51 | 52 | #with pytest.raises(ValueError): PageXml.parseCustomAttr("a { x:1 }") #should it fail? 
53 | assert PageXml.parseCustomAttr("a { x:1 2}") == {'a': {'x': '1 2'}} 54 | 55 | #with pytest.raises(ValueError): PageXml.parseCustomAttr("a { x:1 2}")#should it fail? (or do we allow spaces in names or values?) 56 | assert PageXml.parseCustomAttr(" a b { x y : 1 2 }") == {'a b': {'x y': '1 2'}} 57 | 58 | def test_getsetCustomAttr(): 59 | sXml = b""" 60 | 61 | 62 | 63 | """ 64 | doc = etree.parse(BytesIO(sXml)) 65 | nd = doc.getroot() 66 | assert PageXml.getCustomAttr(nd, "readingOrder", "index") == '9' 67 | assert PageXml.setCustomAttr(nd, "readingOrder", "index", 99) == 99 68 | assert PageXml.getCustomAttr(nd, "readingOrder", "index") == '99' 69 | 70 | assert PageXml.getCustomAttr(nd, "readingOrder") == {'index':'99'} 71 | 72 | assert PageXml.setCustomAttr(nd, "readingOrder", "toto", "zou") == "zou" 73 | assert PageXml.getCustomAttr(nd, "readingOrder", "toto") == 'zou' 74 | 75 | with pytest.raises(PageXmlException): PageXml.getCustomAttr(nd, "readingOrder", "axiste_pas") 76 | with pytest.raises(PageXmlException): PageXml.getCustomAttr(nd, "axiste_pas_non_plus", "axiste_pas") 77 | 78 | def getMetadataTestDOM(): 79 | sXml = b""" 80 | 81 | 82 | Tilla 83 | 2016-08-18T13:35:08.252+07:00 84 | 2016-12-01T09:53:39.610+01:00 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | """ 105 | doc = etree.parse(BytesIO(sXml)) 106 | return doc 107 | 108 | def test_getMetadata(): 109 | doc = getMetadataTestDOM() 110 | nd = doc.getroot() 111 | 112 | md = PageXml.getMetadata(doc) 113 | assert md.Creator == "Tilla" 114 | assert md.Created == "2016-08-18T13:35:08.252+07:00" 115 | assert md.LastChange == "2016-12-01T09:53:39.610+01:00" 116 | assert md.Comments == None 117 | 118 | md = PageXml.getMetadata(None, nd[0]) 119 | assert md.Creator == "Tilla" 120 | assert md.Created == "2016-08-18T13:35:08.252+07:00" 121 | assert md.LastChange == "2016-12-01T09:53:39.610+01:00" 122 | 123 | def test_setMetadata(): 124 | import datetime 125 | 
doc = getMetadataTestDOM() 126 | 127 | nd = doc.getroot() 128 | _sutc = datetime.datetime.utcnow().isoformat() 129 | PageXml.setMetadata(doc, None, "Tigrette") 130 | 131 | sutc = datetime.datetime.utcnow().isoformat() 132 | md = PageXml.getMetadata(doc) 133 | assert md.Creator == "Tigrette" 134 | assert md.Created == "2016-08-18T13:35:08.252+07:00" 135 | assert md.LastChange.startswith(sutc[:15]) 136 | assert md.Comments == None 137 | print(doc) 138 | 139 | sutc = datetime.datetime.utcnow().isoformat() 140 | PageXml.setMetadata(doc, None, "Bijoux", "Le chat de Martine") 141 | md = PageXml.getMetadata(None, nd[0]) 142 | assert md.Creator == "Bijoux" 143 | assert md.Created == "2016-08-18T13:35:08.252+07:00" 144 | assert md.LastChange.startswith(sutc[:15]) 145 | assert md.Comments == "Le chat de Martine" 146 | print(doc) 147 | 148 | def test_CreationPageXmlDocument(): 149 | doc= PageXml.createPageXmlDocument(creatorName='HerveforTest', filename='hervefortest.jpg', imgW=100, imgH=100) 150 | print(doc) 151 | 152 | if __name__ == "__main__": 153 | test_setMetadata() 154 | test_CreationPageXmlDocument() -------------------------------------------------------------------------------- /src/TranskribusPyClient/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Transkribus/TranskribusPyClient/f3b0208751a553257ddf313b73278477aab1ffef/src/TranskribusPyClient/__init__.py -------------------------------------------------------------------------------- /src/TranskribusPyClient/common/IntegerRange.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Integer range specification for Python clients 5 | 6 | A class to deal with integer range specifications like 1-5,8 7 | 8 | Copyright Naver(C) 2017, JL. 
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

from builtins import int


class IntegerRange:
    """
    An integer range object

    - at creation, pass a range specification of the form: 1 or 1-3 or 1,3 or 1,5-7,8
        IntegerRange = RANGE [, RANGE]+
        where RANGE is either an integer or 2 integers separated by a '-'
            RANGE = N
            RANGE = N-N
      Spaces are ignored, apart between digits.
    - the object is a container that supports:
        - iteration
        - len()
        - reversed()
        - contains test   (if n in o: ...)

    Internally the ranges are kept in self._ltAB, a sorted list of
    (a, b) pairs with a <= b, that never overlap nor touch each other.
    """
    def __init__(self, sRange=""):
        """
        Build the object from a range specification (empty by default).
        Raise ValueError if the specification is malformed, unordered or overlapping.
        """
        self._ltAB = self.parseSpec(sRange)
        # sanity check: the parsed ranges must print back as the space-free specification
        assert str(self) == "".join(sRange.split())

    def initFromEnumeration(self, lN):
        """
        create the list of ranges that exactly cover the enumeration.
        Duplicated values are tolerated. Return self.
        """
        if not lN:
            return self
        lN = sorted(lN)
        A = lN[0]
        Nprev = A
        for N in lN[1:]:
            if Nprev+1 < N:
                #hole in sequence, create an interval!
                self.addRange(A, Nprev)
                A = N
            Nprev = N
        self.addRange(A, Nprev)
        return self

    @classmethod
    def parseSpec(cls, sSpec):
        """
        parse a range specification of positive integers and return a list of pair of indices
        Raise ValueError if a range is malformed, or if the ranges are unordered or overlapping.
        """
        ltAB = list()
        prev_b = -1
        for sRange in sSpec.split(","):
            if not sRange.split(): continue #empty spec!
            a,b = cls._getAB(sRange)
            ltAB.append( (a,b) )
            if prev_b < a:
                prev_b = b
            else:
                raise ValueError("unordered or overlapping ranges: '%s' >= '%s' '%s'"%(prev_b, a, sSpec))
        return ltAB

    def addRange(self, a, b=None):
        """
        Add the range a-b (or the single value a if b is not given).
        Raise ValueError, leaving the object unchanged, if the new range
        overlaps or touches an existing one.
        """
        if b is None: b = a
        assert a <= b
        self._ltAB.append( (a,b) )
        self._ltAB.sort()
        if not self._check():
            self._ltAB.remove( (a,b) )
            raise ValueError("Overlapping range")

    def len(self):
        """
        For some subclass, this method can be useful as it is not forced by Python to return an int (like for return float('inf'))
        """
        return sum(b-a+1 for a,b in self._ltAB)

    @classmethod
    def _getAB(cls, sRange):
        """
        parse one RANGE (either N or N-N) and return the pair (a, b)
        Raise ValueError if it is malformed or if a > b.
        """
        lsN = sRange.split('-')
        if len(lsN) == 1:
            a = int(lsN[0])
            b = a
        elif len(lsN) == 2:
            sA, sB = lsN
            a,b = int(sA), int(sB)
            if not(a<=b): raise ValueError("Invalid range: '%s'"%sRange)
        else:
            raise ValueError("invalid range: '%s'"%sRange)
        return a, b

    def _check(self):
        """
        checking things are in order: each range must start strictly after
        the end of the previous one (same rule as in parseSpec)
        """
        prevB = -float('inf')
        for a,b in self._ltAB:
            # '>=' and not '>': a range starting where the previous one ends overlaps it
            if prevB >= a: return False
            prevB = b
        return True

    def __str__(self):
        return ",".join( "%s-%s"%(a,b) if a != b else "%s"%a for (a,b) in self._ltAB )

    def __bool__(self):
        return bool(self._ltAB)

    #Python 2 name of __bool__
    __nonzero__ = __bool__

    #--- Emulating Container type...
    def __iter__(self):
        """
        Iterator returning each number in turn
        """
        # no final 'raise StopIteration': since PEP 479 it turns into a RuntimeError
        for a,b in self._ltAB:
            for n in range(a,b+1): yield n

    def __reversed__(self):
        """
        Reversed iterator
        If we do not provide it, we must provide a __getitem__ (boring to code and how useful??)
        """
        for a,b in reversed(self._ltAB):
            for n in range(b,a-1,-1): yield n

    def __len__(self):
        return sum(b-a+1 for a,b in self._ltAB)

    def __contains__(self, item):
        """
        Membership test. The item is converted to an int first.
        Raise ValueError if the item cannot be converted.
        """
        #All integers are long in python3 and call to convert is just int
        try:
            item = int(item)
        except TypeError:
            raise ValueError("A range contains numeric values not %s"%type(item))
        # an empty range contains nothing (and must not compare None with an int)
        return any(a <= item <= b for a,b in self._ltAB)


# ------  TESTS  ----------------------------------------------------------------------------------
def _container_test(o, lref):
    """
    check that the container o enumerates exactly the values of lref
    """
    lref = list(lref)   #lref may be a range object
    assert list(o) == lref
    assert list(reversed(o)) == list(reversed(lref))
    for item in lref: assert item in o
    assert -99 not in o

def test_good_spec(capsys):
    o = IntegerRange("1")
    _container_test(o, [1])

    o = IntegerRange("99")
    _container_test(o, [99])

    o = IntegerRange("1,99")
    _container_test(o, [1, 99])

    o = IntegerRange("1-5")
    _container_test(o, range(1, 6))

    o = IntegerRange("1-5,6-88")
    _container_test(o, list(range(1, 6))+list(range(6, 89)))

    o = IntegerRange("1-3,4-8")
    _container_test(o, range(1, 9))
    assert len(o) == len(range(1, 9))

def test_spaced_good_spec():
    o = IntegerRange(" 1\t\t")
    _container_test(o, [1])

    o = IntegerRange("99 ")
    _container_test(o, [99])

    o = IntegerRange("1 , 99")
    _container_test(o, [1, 99])

    o = IntegerRange(" 1\t- 5\t")
    _container_test(o, range(1, 6))

    o = IntegerRange("1-5, 6-88")
    _container_test(o, list(range(1, 6))+list(range(6, 89)))

    o = IntegerRange("1 -3\t,4- 8")
    _container_test(o, range(1, 9))
    assert len(o) == len(range(1, 9))

def test_errors():
    import pytest
    with pytest.raises(ValueError): IntegerRange("1 3")
    with pytest.raises(ValueError): IntegerRange("3-1")
    with pytest.raises(ValueError): IntegerRange("3,1")
    with pytest.raises(ValueError): IntegerRange("1-3,2")
    with pytest.raises(ValueError): IntegerRange("3,1-2")
    with pytest.raises(ValueError): IntegerRange("1-3,3-8")
    with pytest.raises(ValueError): IntegerRange("1-3 3,3-8")
    with pytest.raises(ValueError): IntegerRange("1-3,3-8 8")

def test_limit():
    o = IntegerRange("")
    assert list(o) == []
    assert len(o) == 0
    o = IntegerRange("\t  \t   ")
    assert list(o) == []
    assert len(o) == 0

def test_add():
    import pytest

    o = IntegerRange()
    _container_test(o, [])

    o.addRange(1)
    _container_test(o, [1])

    o.addRange(0)
    _container_test(o, [0, 1])

    with pytest.raises(ValueError): o.addRange(1)
    with pytest.raises(ValueError): o.addRange(0,1)
    with pytest.raises(ValueError): o.addRange(-3,0)
    with pytest.raises(ValueError): o.addRange(-3,3)
    with pytest.raises(ValueError): o.addRange(1,3)
    with pytest.raises(ValueError): o.addRange(0,3)

    o.addRange(90, 99)
    _container_test(o, [0, 1]+list(range(90, 100)))

    o.addRange(60, 66)
    _container_test(o, [0, 1]+list(range(60, 67))+list(range(90, 100)))

    with pytest.raises(ValueError): o.addRange(0,1000)
    with pytest.raises(ValueError): o.addRange(10,60)
    with pytest.raises(ValueError): o.addRange(70,95)
    with pytest.raises(ValueError): o.addRange(95)
    o.addRange(80, 88)
    _container_test(o, [0, 1]+list(range(60, 67))+list(range(80, 89))+list(range(90, 100)))

    assert 1 in o
    assert 0 in o
    assert 90 in o
    assert 80 in o
    assert 60 in o
    assert 66 in o
    assert 99 in o
    assert 88 in o

    assert 50 not in o

def test_enum():
    def check_enum(l):
        ll = set(l)
        o = IntegerRange()
        o.initFromEnumeration(l)
        assert set(o) == ll

    check_enum([])
    check_enum([2])
    check_enum([-2])
    check_enum([2,1])
    check_enum([1,2])
    check_enum([1,2,2]) #bad case that we cover anyway
    check_enum([1,2,4,2,5])
    check_enum([7,4,6,1])
    check_enum([0])
#
# A simple trace module
#
# JL Meunier - May 2004
# Copyright XRCE, 2004
#

import sys

# stream receiving the traces; change it with setTraceFD
traceFD = sys.stderr

def setTraceFD(fd):
    """
    Set the stream (any object with write and flush methods) receiving the traces
    """
    global traceFD
    traceFD = fd

def trace(*msg):
    """
    Write each argument, converted to text, on the trace stream (no separator, no newline)
    """
    for i in msg:
        try:
            traceFD.write(str(i))
        except UnicodeEncodeError:
            # the text cannot be encoded: write it on the same stream, with
            # the offending characters backslash-escaped, so nothing is lost
            sText = "%s" % (i,)
            traceFD.write(sText.encode("ascii", "backslashreplace").decode("ascii"))

def traceln(*msg):
    """
    Same as trace, then write a newline and flush the trace stream
    """
    trace(*msg)
    trace("\n")
    traceFD.flush()

def flush():
    """
    Flush the trace stream
    """
    traceFD.flush()



#SELF-TEST
if __name__=="__main__":

    trace(1)
    trace(" aut")
    trace("o")
    traceln("-test")
    trace("2 auto", "-", "test")
    trace()
    traceln()
    traceln("Done")
5 | 6 | Created on 25 Nov 2016 7 | 8 | @author: meunier 9 | """ 10 | 11 | 12 | # an existing collection A 13 | _colId_A = 3571 14 | 15 | #some existing documents in collection A 16 | _docId_a = 7749 17 | _docId_b = 7750 18 | _docId_c = 8251 19 | _docId_d = 8252 20 | 21 | 22 | #A different collection where you can do whatever you want 23 | _coldId_Sandbox = 3820 24 | 25 | -------------------------------------------------------------------------------- /src/TranskribusPyClient/test/test_collections_addDocToCollection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | #optional: useful if you want to choose the logging level to something else than logging.WARN 4 | import sys, os 5 | import logging 6 | 7 | try: #to ease the use without proper Python installation 8 | import TranskribusPyClient_version 9 | except ImportError: 10 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 11 | import TranskribusPyClient_version 12 | 13 | from TranskribusPyClient.test import _coldId_Sandbox, _docId_a 14 | from TranskribusPyClient.client import TranskribusClient, getStoredCredentials 15 | 16 | login, pwd = getStoredCredentials() 17 | 18 | conn = TranskribusClient(proxies={'https':'http://cornillon:8000'}, loggingLevel=logging.INFO) 19 | sessionID = conn.auth_login(login, pwd) 20 | 21 | data = conn.addDocToCollection(_coldId_Sandbox, _docId_a) 22 | """ 23 | True or Exception 24 | """ 25 | 26 | print conn.auth_logout() 27 | 28 | -------------------------------------------------------------------------------- /src/TranskribusPyClient/test/test_collections_copyDocToCollection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | #optional: useful if you want to choose the logging level to something else than logging.WARN 4 | import sys, os 5 | import logging 6 | 7 | try: #to ease the use without proper Python 
installation 8 | import TranskribusPyClient_version 9 | except ImportError: 10 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 11 | import TranskribusPyClient_version 12 | 13 | from TranskribusPyClient.test import _colId_A, _coldId_Sandbox, _docId_c, _docId_d 14 | from TranskribusPyClient.client import TranskribusClient, getStoredCredentials 15 | 16 | login, pwd = getStoredCredentials() 17 | 18 | conn = TranskribusClient(proxies={'https':'http://cornillon:8000'}, loggingLevel=logging.INFO) 19 | sessionID = conn.auth_login(login, pwd) 20 | 21 | data = conn.duplicateDoc(_colId_A, _docId_c, _coldId_Sandbox, "named_by_JL") 22 | data = conn.duplicateDoc(_colId_A, _docId_d, _coldId_Sandbox) 23 | """ 24 | True or Exception 25 | """ 26 | 27 | print conn.auth_logout() 28 | 29 | -------------------------------------------------------------------------------- /src/TranskribusPyClient/test/test_collections_fulldoc_xml.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | #optional: useful if you want to choose the logging level to something else than logging.WARN 4 | import sys, os 5 | import logging 6 | 7 | try: #to ease the use without proper Python installation 8 | import TranskribusPyClient_version 9 | except ImportError: 10 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 11 | import TranskribusPyClient_version 12 | 13 | from TranskribusPyClient.test import _colId_A, _docId_a 14 | from TranskribusPyClient.client import TranskribusClient, getStoredCredentials 15 | 16 | 17 | login, pwd = getStoredCredentials() 18 | 19 | conn = TranskribusClient(proxies={'https':'http://cornillon:8000'} 20 | , loggingLevel=logging.INFO) 21 | print conn 22 | 23 | #print conn.auth_logout() 24 | 25 | sessionID = conn.auth_login(login, pwd) 26 | print sessionID 27 | 28 | #sessionID = conn.auth_login("jean-luc.meunier@xrce.xerox.com", "trnjluc", 
sHttpsProxyUrl='http://cornillon:8000') 29 | 30 | 31 | 32 | # ret = conn.getDocumentFromServer(colid, docid) 33 | #ret = conn.getDocumentFromServer("3571", "7750") 34 | data = conn.getDocByIdAsXml(_colId_A, str(_docId_a)) #str just to stress-test 35 | #data = conn.getDocByIdAsXml(3571, "7750") 36 | print data 37 | """ 38 | 39 | """ 40 | 41 | conn.setProxies({'https':'http://cornillon:8000'}) 42 | 43 | print conn.auth_logout() 44 | 45 | -------------------------------------------------------------------------------- /src/TranskribusPyClient/test/test_collections_list.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys, os 4 | import logging 5 | 6 | try: #to ease the use without proper Python installation 7 | import TranskribusPyClient_version 8 | except ImportError: 9 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 10 | import TranskribusPyClient_version 11 | 12 | from TranskribusPyClient.test import _colId_A, _docId_a 13 | from TranskribusPyClient.client import TranskribusClient, getStoredCredentials 14 | 15 | login, pwd = getStoredCredentials() 16 | 17 | conn = TranskribusClient(proxies={'https':'http://cornillon:8000'} 18 | , loggingLevel=logging.INFO) 19 | 20 | sessionID = conn.auth_login(login, pwd) 21 | data = conn.listDocsByCollectionId(_colId_A) 22 | import pprint 23 | pprint.pprint(data) 24 | 25 | print conn.auth_logout() 26 | 27 | """ 28 | 29 | [{u'collectionList': {u'colList': [{u'colId': 3571, 30 | u'colName': u'READDU', 31 | u'description': u'created by herve.dejean@xrce.xerox.com'}]}, 32 | u'createdFromTimestamp': 33175290, 33 | u'createdToTimestamp': 33175290, 34 | u'docId': 7749, 35 | u'fimgStoreColl': u'TrpDoc_DEA_7749', 36 | u'nrOfPages': 10, 37 | u'scriptType': u'HANDWRITTEN', 38 | u'status': 0, 39 | u'title': u'MM_1_001', 40 | u'uploadTimestamp': 1478161395893L, 41 | u'uploader': u'herve.dejean@xrce.xerox.com', 42 | u'uploaderId': 
275}, 43 | {u'collectionList': {u'colList': [{u'colId': 3571, 44 | u'colName': u'READDU', 45 | u'description': u'created by herve.dejean@xrce.xerox.com'}]}, 46 | u'createdFromTimestamp': 0, 47 | u'createdToTimestamp': 0, 48 | u'docId': 7750, 49 | u'fimgStoreColl': u'TrpDoc_DEA_7750', 50 | u'nrOfPages': 10, 51 | u'scriptType': u'HANDWRITTEN', 52 | u'status': 0, 53 | u'title': u'MM_1_005', 54 | u'uploadTimestamp': 1478161451242L, 55 | u'uploader': u'herve.dejean@xrce.xerox.com', 56 | u'uploaderId': 275}] 57 | 58 | """ -------------------------------------------------------------------------------- /src/TranskribusPyClient/test/test_collections_listEditDeclFeatures.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys, os 4 | import logging 5 | 6 | try: #to ease the use without proper Python installation 7 | import TranskribusPyClient_version 8 | except ImportError: 9 | sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) 10 | import TranskribusPyClient_version 11 | 12 | from TranskribusPyClient.test import _colId_A 13 | from TranskribusPyClient.client import TranskribusClient, getStoredCredentials 14 | 15 | 16 | login, pwd = getStoredCredentials() 17 | 18 | conn = TranskribusClient(proxies={'https':'http://cornillon:8000'} 19 | , loggingLevel=logging.INFO) 20 | 21 | sessionID = conn.auth_login(login, pwd) 22 | doc = conn.listEditDeclFeatures(_colId_A) 23 | doc.saveFormatFileEnc("-", "UTF-8", True) 24 | conn.xmlFreeDoc(doc) 25 | 26 | print conn.auth_logout() 27 | 28 | """ 29 | 30 | 31 | 32 | 1 33 | Long S 34 | Source uses long "s" 35 | 36 | 37 | 1 38 | 1 39 | Long s is normalized to "s" 40 | false 41 | 42 | 43 | 2 44 | 1 45 | Long s is transcribed as "ſ" U+017F "Latin small letter long s" 46 | false 47 | 48 | 49 | 50 | 51 | 2 52 | u and v 53 | Source uses v for u 54 | 55 | 56 | 3 57 | 2 58 | Transcribed as in source 59 | false 60 | 61 | 62 | 4 63 | 2 64 | 
Transcribed according to modern spelling 65 | false 66 | 67 | 68 | 69 | 70 | 3 71 | i and j 72 | Source uses "i" and "j" differently to modern spelling 73 | 74 | 75 | 7 76 | 3 77 | Normalized according to modern lexicon 78 | false 79 | 80 | 81 | 5 82 | 3 83 | Transcribed as in source 84 | false 85 | 86 | 87 | 279 88 | 3 89 | Capital letter "J" is normalized to "I" at the beginning of a word 90 | false 91 | 92 | 93 | 94 | 95 | 5 96 | Printspace 97 | The printspace indicates the overall text region. 98 | 99 | 100 | 9 101 | 5 102 | Created by FineReader 103 | false 104 | 105 | 106 | 8 107 | 5 108 | Manually corrected 109 | false 110 | 111 | 112 | 113 | 114 | 6 115 | Ligature "sz" 116 | "sz" is set as ligature 117 | 118 | 119 | 10 120 | 6 121 | Transcribed as "sz" 122 | false 123 | 124 | 125 | 11 126 | 6 127 | Normalized to "ß" 128 | false 129 | 130 | 131 | 132 | 133 | 28 134 | Text regions 135 | Regions which contain handwritten text 136 | 137 | 138 | 34 139 | 28 140 | Manually corrected 141 | false 142 | 143 | 144 | 33 145 | 28 146 | Automatically created 147 | false 148 | 149 | 150 | 151 | 152 | 29 153 | Line Regions 154 | Contain the text of line 155 | 156 | 157 | 35 158 | 29 159 | Automatically created 160 | false 161 | 162 | 163 | 36 164 | 29 165 | Manually corrected 166 | false 167 | 168 | 169 | 170 | 171 | 30 172 | Baselines 173 | The baseline is defined as in Wikipedia - characters are "sitting" on the baseline 174 | 175 | 176 | 38 177 | 30 178 | Manually corrected 179 | false 180 | 181 | 182 | 37 183 | 30 184 | Automatically created 185 | false 186 | 187 | 188 | 189 | 190 | 47 191 | Omitted text 192 | Even in diplomatic transcriptions the editor may decide to not transcribe specific notes or marginalia which do not contribute to the overall objective of the transcription 193 | 194 | 195 | 59 196 | 47 197 | Some text was omitted, e.g. 
marginalia, notes of librarians 198 | false 199 | 200 | 201 | 60 202 | 47 203 | No text was omitted 204 | false 205 | 206 | 207 | 208 | 209 | 48 210 | Person names 211 | Tagging of person names 212 | 213 | 214 | 61 215 | 48 216 | Person names were tagged 217 | false 218 | 219 | 220 | 62 221 | 48 222 | Person names were not tagged 223 | false 224 | 225 | 226 | 227 | 228 | 49 229 | Geo-Names 230 | Tagging of geo-names 231 | 232 | 233 | 63 234 | 49 235 | Geo-names were tagged 236 | false 237 | 238 | 239 | 64 240 | 49 241 | Geo-names wer not tagged 242 | false 243 | 244 | 245 | 246 | 247 | 50 248 | Abbreviations - common 249 | Common abbreviations are usually known to most readers of a text, for example: e.g., i.e., &, etc. 250 | 251 | 252 | 65 253 | 50 254 | Common abbreviations were not expanded 255 | false 256 | 257 | 258 | 66 259 | 50 260 | Common abbreviations were expanded 261 | false 262 | 263 | 264 | 265 | 266 | 51 267 | Abbreviations 268 | Especially in medieval texts and early modern handwritting many words are abbreviated, or even characters are left out in the middle of a word. These abbreviations often need deep grammatical understanding to be correctly expanded. 
269 | 270 | 271 | 68 272 | 51 273 | Abbreviations were not marked 274 | false 275 | 276 | 277 | 67 278 | 51 279 | Abbreviations were marked, but not explanded 280 | false 281 | 282 | 283 | 399 284 | 51 285 | Abbreviations were marked and expanded 286 | false 287 | 288 | 289 | 290 | 291 | 52 292 | Blackening 293 | Sensible text can be marked as "blackened" and can be suppressed when exporting the text and the images 294 | 295 | 296 | 70 297 | 52 298 | Blackeing was not applied 299 | false 300 | 301 | 302 | 69 303 | 52 304 | Blackening was applied to names of persons and companies 305 | false 306 | 307 | 308 | 309 | 310 | """ -------------------------------------------------------------------------------- /src/TranskribusPyClient_version.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 29 Nov 2016 3 | 4 | @author: meunier 5 | ''' 6 | version="0.3" 7 | -------------------------------------------------------------------------------- /src/Transkribus_credential.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Store in this file your transkribus credentials. 5 | Change acess right to protect this information. 6 | 7 | Alternatively, use do_login --persist to make a persistent session usable by next commands. 
8 | 9 | Created on 15 Nov 2016 10 | 11 | @author: meunier 12 | """ 13 | 14 | # Either you store your credentials here, or you use the --persist options 15 | 16 | login = "herve.dejean@naverlabs.com" 17 | password = "" 18 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | /trnskrbs_3571/ 2 | -------------------------------------------------------------------------------- /tests/test_commands.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Tests of the TranskribusPyClient command-line utilities 4 | # 5 | # JL Meunier - Nov 29th 2016 6 | # 7 | # Copyright Xerox(C) 2016 H. Déjean, JL. Meunier 8 | # 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program. If not, see . 17 | # 18 | # Developed for the EU project READ. The READ project has received funding 19 | # from the European Union's Horizon 2020 research and innovation programme 20 | # under grant agreement No 674943. 
# ------------------------------------------------------------------------------------------------------------------------
# --- CONFIGURATION SECTION
# ------------------------------------------------------------------------------------------------------------------------

# A valid Transkribus login
login="herve.dejean@naverlabs.com"
passwd=""

# Some existing collection with read access for you
colId=3571
# Two existing documents, forming a small range
docId_A=7749
docId_B=7750
# File in which the trpdoc is saved, then reused by the download/upload steps
TRP=tst.trp

# Python interpreter used to run every command
#PYTHON=python
PYTHON=/drives/c/Local/anaconda3/envs/py36/python.exe

# ------------------------------------------------------------------------------------------------------------------------
# --- GENERIC STUFF BELOW
# ------------------------------------------------------------------------------------------------------------------------

# Source tree, located relative to this script
SRC=$(dirname "$0")/../src
CMD="$SRC/TranskribusCommands"

# Name of the temporary collection; $$ (the shell PID) makes it unique per run
tmp_col_name="toto_$$"

# Separator lines printed between test sections
SEP="= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = ="
BAR="==================================================================="

# ------------------------------------------------------------------------------------------------------------------------

# Print an error message and abort the whole test run with exit status 1.
#   $1: the message to display
function error {
    echo "ERROR: $1"
    exit 1
}

# ------------------------------------------------------------------------------------------------------------------------
# NOTE: $PYTHON is deliberately left unquoted so that it may hold an
# interpreter plus options; every other expansion is quoted.

#---------------------------------------------------
# Clean any persistent login info (output captured and discarded, status ignored)
echo "$BAR"
echo "--- logout"
out=$($PYTHON "$CMD/do_logout.py" --persist)
echo "OK"

# Test a bad login: the command is expected to FAIL
echo
echo "--- login"
out=$($PYTHON "$CMD/do_login.py" --persist -l "tilla" -p "miaouuuu") && error "login should have failed"
echo
echo "OK"

# Make a login and persist the session token
echo
echo "--- login"
out=$($PYTHON "$CMD/do_login.py" --persist -l "$login" -p "$passwd") || error "login error"
echo "OK"

#---------------------------------------------------
echo "$SEP"

# The standard output of do_createCollec.py is kept as the new collection id
echo
echo "--- creating a collection $tmp_col_name"
tmp_col_id=$($PYTHON "$CMD/do_createCollec.py" --persist "$tmp_col_name") || error "collection creation error"
echo "--> $tmp_col_id"
echo "OK"

#---------------------------------------------------
echo "$SEP"
# Add a single document ...
echo
echo "--- adding doc $docId_A to the new collection"
$PYTHON "$CMD/do_addDocToCollec.py" --persist "$tmp_col_id" "$docId_A" || error "collection add error 1"
echo "OK"

# ... then a range of documents
echo
echo "--- adding doc $docId_A - $docId_B to the new collection"
$PYTHON "$CMD/do_addDocToCollec.py" --persist "$tmp_col_id" "$docId_A-$docId_B" || error "collection add error 2"
echo "OK"

echo "$SEP"
echo
echo "--- copying doc $docId_A from collection $colId to the new collection"
$PYTHON "$CMD/do_duplicateDoc.py" --persist "$colId" "$tmp_col_id" "$docId_A" || error "collection copy error 1"
echo "OK"

#---------------------------------------------------
echo "$SEP"
echo
echo "--- deleting it ( $tmp_col_id ) "
out=$($PYTHON "$CMD/do_deleteCollec.py" --persist "$tmp_col_id") || error "collection deletion error"
echo "OK"

echo "$SEP"
echo
echo "--- display trpdoc of the first page of $docId_A from collection $colId "
$PYTHON "$CMD/do_getDocTrp.py" --persist "$colId" "$docId_A" 1 || error "getDocTrp error 1"
echo "OK"



#---------------------------------------------------
echo "$SEP"
echo
echo "--- listing collection $colId "
$PYTHON "$CMD/do_listCollec.py" --persist "$colId" || error "collection list error"
echo "OK"

#---------------------------------------------------
echo "$SEP"
echo
echo "--- Layout Analysis in collection $colId "
$PYTHON "$CMD/do_analyzeLayout.py" "$colId" "$docId_A/1" || error "layout analysis error"
echo "OK"

#---------------------------------------------------
echo "$SEP"
echo
# The banner used to reference the undefined $colid / $docid and printed blanks
echo "--- delete last transcript $colId / $docId_A / 1 "
$PYTHON "$CMD/do_transcript.py" "$colId" "$docId_A" 1 --last --rm || error " delete last transcript error"
echo "OK"

#---------------------------------------------------
echo "$SEP"
echo
echo "--- list of locked pages for $docId_A in $colId "
$PYTHON "$CMD/do_listPageLocks.py" "$colId" "$docId_A" || error "locked pages error"
echo "OK"

#---------------------------------------------------
echo "$SEP"
echo
echo "--- list HTR models in collection $colId "
$PYTHON "$CMD/do_listHtrRnn.py" --colid="$colId" || error "list HTR models error"
echo "OK"

#---------------------------------------------------
echo "$SEP"
echo
echo "--- list trpdoc for document $docId_A in $colId "
$PYTHON "$CMD/do_transcript.py" "$colId" "$docId_A" || error " transcript list models error"
echo "OK"


#---------------------------------------------------
echo "$SEP"
echo
echo "--- save trpdoc for document $docId_A in $TRP "
$PYTHON "$CMD/do_transcript.py" "$colId" "$docId_A" 2 --trp="$TRP" || error " save trpdoc error"

echo "OK"


#---------------------------------------------------
echo "$SEP"
echo
echo "--- download as per trp ---"
# Start from a clean download folder
rm -rf "trnskrbs_$colId"
echo "--- download using $TRP "
$PYTHON "$CMD/Transkribus_downloader.py" "$colId" --trp="$TRP" || error " download error"
echo "OK"
#---------------------------------------------------
echo "$SEP"
echo
echo "--- download trnskrbs_$colId document $docId_A ---"
rm -rf "trnskrbs_$colId"
echo "--- download document $docId_A ($colId) "
$PYTHON "$CMD/Transkribus_downloader.py" "$colId" --docid="$docId_A" --noimage || error " download error"
echo "OK"

#---------------------------------------------------
echo "$SEP"
echo
echo "--- upload document $docId_A ($colId ) "
$PYTHON "$CMD/TranskribusDU_transcriptUploader.py" "trnskrbs_$colId" "$colId" "$docId_A" --nodu || error " TranskribusDU_transcriptUploaderupload error"
echo "OK"

#---------------------------------------------------
echo "$SEP"
echo
echo "--- upload as per trp $TRP "
$PYTHON "$CMD/Transkribus_uploader.py" "trnskrbs_$colId" "$colId" "$docId_A" --trp="$TRP" || error " Transkribus_uploader upload error"
echo "OK"
echo "--- rm $TRP"
rm "$TRP"

#---------------------------------------------------
echo "$SEP"
echo
echo "--- test only --help"
# No status check here: this step only exercises the --help option
$PYTHON "$CMD/do_htrTrainRnn.py" --help

echo "$BAR"
echo "TESTs done"