├── .gitignore ├── CREDITS ├── LICENCE ├── README.md ├── core ├── 3rd-party │ ├── README │ └── sscdocapi.html ├── __init__.py ├── document.py ├── document_type.py ├── docvert.py ├── docvert_exception.py ├── docvert_html.py ├── docvert_libreoffice.py ├── docvert_pipeline.py ├── docvert_storage.py ├── docvert_url.py ├── docvert_xml.py ├── linux │ └── debian │ │ ├── README │ │ └── docvert-webserver.sh ├── opendocument.py ├── pipeline_type │ ├── __init__.py │ ├── compare.py │ ├── convertimages.py │ ├── debug.py │ ├── docbooktoxhtml.py │ ├── generate.py │ ├── generatepostconversioneditorfiles.py │ ├── getpreface.py │ ├── loop.py │ ├── normalizeopendocument.py │ ├── pipeline_item.py │ ├── serialize.py │ ├── serializeopendocument.py │ ├── splitpages.py │ ├── test.py │ ├── transform.py │ ├── transformopendocumenttodocbook.py │ └── writemetadata.py ├── transform │ ├── docbook-to-html.xsl │ ├── each-page.xsl │ ├── extract-metadata.xsl │ ├── normalize-docbook.xsl │ ├── normalize-opendocument.xsl │ ├── opendocument-to-docbook.xsl │ └── turn-document-into-test.xsl └── web_service_themes │ ├── README │ └── default │ ├── docvertedges2-small.gif │ ├── favicon.ico │ ├── index.js │ ├── index.tpl │ ├── jquery.dropp.js │ ├── loading.gif │ ├── preview.css │ ├── screen.css │ ├── tests.js │ ├── tests.tpl │ ├── upload_computer.png │ ├── web-service.js │ └── web-service.tpl ├── doc ├── how-to-write-themes.txt └── sample │ ├── sample-document.doc │ └── sample-document.odt ├── docvert-cli.py ├── docvert-web.py ├── lib ├── README ├── __init__.py ├── bottle │ ├── __init__.py │ ├── bottle.py │ └── bottle_session_file_problem.html ├── bottlesession │ ├── __init__.py │ └── bottlesession.py ├── fonts │ ├── COPYRIGHT │ ├── README │ └── reenie-beanie.woff ├── jquery │ └── jquery-1.5.min.js └── workerpool │ ├── LICENSE │ ├── QueueWrapper.py │ ├── README │ ├── __init__.py │ ├── exceptions.py │ ├── jobs.py │ ├── pools.py │ └── workers.py ├── logo.gif └── pipelines ├── auto_pipelines ├── Break up over Heading 1.default │ └── pipeline.xml └── Nothing (one long page) │ └── pipeline.xml ├── html_to_opendocument └── default │ └── pipeline.xml ├── pipelines ├── basic │ ├── onepage.xsl │ └── pipeline.xml ├── docbook │ └── pipeline.xml ├── open document │ └── pipeline.xml ├── pretty-lists │ ├── pipeline.xml │ └── pretty-lists.xsl ├── sanitised open document │ └── pipeline.xml ├── ssc │ ├── pipeline.xml │ └── pretty-lists.xsl └── web standards │ ├── mytheme2.xsl │ └── pipeline.xml └── tests ├── bold-italics ├── bold-italics.odt ├── bold-italics.txt ├── bold-italics.xsl └── pipeline.xml ├── footnotes ├── footnotes.doc ├── footnotes.txt ├── footnotes.xsl └── pipeline.xml ├── headings-and-paragraphs ├── opendocument.rng ├── pipeline.xml ├── sample-document-docbook-headings-and-paragraphs.txt ├── sample-document-html-headings-and-paragraphs.txt ├── sample-document.doc └── sample-document.txt ├── images ├── emf-copyright.txt ├── emf-sample.doc ├── emf-sample.odt ├── pipeline.xml ├── wmf-copyright.txt ├── wmf-sample.doc └── wmf-sample.odt ├── invalid-odf ├── invalid-page.odt ├── invalid-page.xsl └── pipeline.xml ├── links ├── links.odt ├── links.txt ├── links.xsl └── pipeline.xml ├── lists ├── continuation.odt ├── continuation.xsl ├── docvert-test-five-new.odt ├── docvert-test-five-new.txt ├── docvert-test-five.odt ├── docvert-test-five.txt ├── pipeline.xml ├── sample-document-docbook-lists.txt ├── sample-document-html-lists.txt ├── sample-document.doc ├── sample-document.txt ├── sample-document.xsl ├── table-list.odt └── table-list.txt └── tables ├── pipeline.xml ├── table-list.odt ├── table-list.txt ├── table-list.xsl └── table-span.doc /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | -------------------------------------------------------------------------------- /CREDITS: -------------------------------------------------------------------------------- 1 | Matthew Holloway (ex-Matthew Cruickshank) 2 | Mark Rickerby 3 | Cyrille Bonnet 4 | Francois Marier 5 | Sol Quimpo 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Docvert 3 |

4 | 5 | 6 | Converts Word Processor office files (e.g. .DOC files) to OpenDocument, DocBook, and structured HTML. 7 | 8 | This is Docvert for Python 3. To find Docvert for Python 2.x see http://github.com/holloway/docvert/ 9 | 10 | Web Service 11 | ----------- 12 | 13 | python3 ./docvert-web.py [-p PORT] [-H host] 14 | 15 | Command Line 16 | ------------ 17 | 18 | python3 ./docvert-cli.py 19 | 20 | usage: docvert-cli.py [-h] [--version] --pipeline PIPELINE 21 | [--response {auto,path,stdout}] 22 | [--autopipeline {Break up over Heading 1.default,Nothing one long page}] 23 | [--url URL] 24 | [--list-pipelines] 25 | [--pipelinetype {tests,auto_pipelines,pipelines}] 26 | infile [infile ...] 27 | 28 | Community 29 | --------- 30 | 31 | http://lists.catalyst.net.nz/mailman/listinfo/docvert 32 | 33 | Requirements 34 | ------------ 35 | 36 | Python 3 37 | libreoffice 38 | python3-uno 39 | python3-lxml 40 | python3-imaging 41 | pdf2svg 42 | librsvg2-2 43 | 44 | Quickstart Guide 45 | ---------------- 46 | 47 | sudo apt-get install libreoffice python3-uno python3-lxml python3-imaging pdf2svg librsvg2-2 librsvg2-bin 48 | 49 | /usr/bin/soffice --headless --norestore --nologo --norestore --nofirststartwizard --accept="socket,port=2002;urp;" 50 | 51 | then in another terminal 52 | 53 | cd ~ 54 | 55 | git clone git://github.com/holloway/docvert-python3.git 56 | 57 | cd docvert-python3 58 | 59 | python3 ./docvert-web.py 60 | 61 | and browse to http://localhost:8080 62 | 63 | 64 | LICENCE 65 | ------- 66 | Released under the GPL3 see LICENCE 67 | -------------------------------------------------------------------------------- /core/3rd-party/README: -------------------------------------------------------------------------------- 1 | Mimics 3rd-party bindings to the web service api for manual testing purposes. 2 | -------------------------------------------------------------------------------- /core/3rd-party/sscdocapi.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 |
12 | 13 |
16 | 17 |
21 | 22 | 25 | 26 |
27 | 28 | 29 | 30 |
31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holloway/docvert-python3/a16d9ff0921186355e80c6f4905a00f20c12cf5f/core/__init__.py -------------------------------------------------------------------------------- /core/document.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import libreoffice 4 | 5 | class document(object): 6 | def __init__(self, data): 7 | self.data = data 8 | 9 | def get_opendocument(self): 10 | raise NotImplemented() 11 | 12 | class opendocument(document): 13 | def get_open_document(self): 14 | return self.data 15 | 16 | def set_dimensions(self, width, height): 17 | raise NotImplemented() 18 | 19 | class binary_office_file(document) 20 | def get_opendocument(self): 21 | client = libreoffice.libreoffice_client() 22 | return client.convert_by_stream(self.data) 23 | 24 | 25 | -------------------------------------------------------------------------------- /core/document_type.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import zipfile 3 | import io 4 | 5 | class types(object): 6 | oasis_open_document = "oasis_open_document (any version)" 7 | pdf = "portable document format (any version)" 8 | xml = "xml" 9 | html = "html" 10 | exception = "exception" 11 | unknown_type = "unknown file type" 12 | 13 | def detect_document_type(data): 14 | if isinstance(data, Exception): 15 | return types.exception 16 | if isinstance(data, str): 17 | data = io.BytesIO(data) 18 | 19 | # 1. Sniff for OpenDocument 20 | try: 21 | magic_bytes_open_document = 'PK' 22 | data.seek(0) 23 | first_bytes = data.read(len(magic_bytes_open_document)).decode('latin-1') 24 | if first_bytes == magic_bytes_open_document: # Ok it's a ZIP but... 25 | archive = zipfile.ZipFile(data) 26 | if 'mimetype' in archive.namelist() and archive.read('mimetype').decode('utf-8') == 'application/vnd.oasis.opendocument.text': # ...if it doesn't have these files it's not an OpenDocument 27 | return types.oasis_open_document 28 | except UnicodeDecodeError as e: 29 | pass 30 | except Exception as e: 31 | print(e) 32 | # 2. Sniff for PDF 33 | try: 34 | magic_bytes_pdf = '%PDF' 35 | data.seek(0) 36 | first_bytes = data.read(len(magic_bytes_pdf)).decode('latin-1') 37 | if first_bytes == magic_bytes_pdf: 38 | return types.pdf 39 | except UnicodeDecodeError as e: 40 | pass 41 | except Exception as e: 42 | print(e) 43 | # 3. Sniff for HTML and XML 44 | data.seek(0) 45 | try: 46 | first_bytes = data.read(200).decode('latin-1') #200 bytes in, because sometimes there's a really long doctype 47 | #print first_bytes 48 | data.seek(0) 49 | if first_bytes.count(" 0: 50 | return types.html 51 | if first_bytes.count(" 0: 52 | return types.xml 53 | except UnicodeDecodeError as e: 54 | pass 55 | except Exception as e: 56 | print(e) 57 | 58 | return types.unknown_type 59 | -------------------------------------------------------------------------------- /core/docvert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tempfile 3 | import io 4 | import os.path 5 | from . import document_type 6 | from . import docvert_exception 7 | from . import docvert_pipeline 8 | from . import docvert_storage 9 | from . import docvert_libreoffice 10 | from . import docvert_xml 11 | from . import opendocument 12 | import urllib.request, urllib.error, urllib.parse 13 | 14 | docvert_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | version = '6' 16 | http_timeout = 10 17 | 18 | class converter_type(object): 19 | python_streaming_to_libreoffice = "python streaming to libreoffice" 20 | 21 | def process_conversion(files=None, urls=None, pipeline_id=None, pipeline_type="pipelines", auto_pipeline_id=None, storage_type_name=docvert_storage.storage_type.memory_based, converter=converter_type.python_streaming_to_libreoffice, suppress_errors=False): 22 | if files is None and urls is None: 23 | raise docvert_exception.needs_files_or_urls() 24 | if pipeline_id is None: 25 | raise docvert_exception.unrecognised_pipeline("Unknown pipeline '%s'" % pipeline_id) 26 | storage = docvert_storage.get_storage(storage_type_name) 27 | 28 | def _title(name, files, data): 29 | filename = os.path.basename(name).replace('\\','-').replace('/','-').replace(':','-') 30 | if len(filename) == 0: 31 | filename = "document.odt" 32 | if filename in files: 33 | if data and hasattr(files[filename], 'read') and files[filename].getvalue() == data: 34 | return filename 35 | unique = 1 36 | potential_filename = filename 37 | while potential_filename in files: 38 | unique += 1 39 | if filename.count("."): 40 | potential_filename = filename.replace(".", "%i." % unique, 1) 41 | else: 42 | potential_filename = filename + str(unique) 43 | filename = potential_filename 44 | return filename 45 | 46 | for filename, data in files.items(): 47 | storage.set_friendly_name(filename, filename) 48 | 49 | for url in urls: 50 | try: 51 | data = urllib.request.urlopen(url, None, http_timeout).read() 52 | doc_type = document_type.detect_document_type(data) 53 | if doc_type == document_type.types.html: 54 | data = html_to_opendocument(data, url) 55 | filename = _title(url, files, data) 56 | storage.set_friendly_name(filename, "%s (%s)" % (filename, url)) 57 | files[filename] = io.StringIO(data) 58 | except IOError as e: 59 | filename = _title(url, files, None) 60 | storage.set_friendly_name(filename, "%s (%s)" % (filename, url)) 61 | files[filename] = Exception("Download error from %s: %s" % (url, e)) 62 | 63 | for filename, data in files.items(): 64 | if storage.default_document is None: 65 | storage.default_document = filename 66 | doc_type = document_type.detect_document_type(data) 67 | if doc_type == document_type.types.exception: 68 | storage.add("%s/index.txt" % filename, str(data)) 69 | elif doc_type != document_type.types.oasis_open_document: 70 | try: 71 | data = generate_open_document(data, converter) 72 | doc_type = document_type.types.oasis_open_document 73 | except Exception as e: 74 | if not suppress_errors: 75 | raise e 76 | storage.add("%s/index.txt" % filename, str(e)) 77 | if doc_type == document_type.types.oasis_open_document: 78 | if pipeline_id == "open document": #reserved term, for when people want the Open Document file back directly. Don't bother loading pipeline. 79 | storage.add("%s/index.odt" % filename, data) 80 | thumbnail = opendocument.extract_thumbnail(data) 81 | if thumbnail: 82 | storage.add("%s/thumbnail.png" % filename, thumbnail) 83 | else: 84 | document_xml = opendocument.extract_useful_open_document_files(data, storage, filename) 85 | storage.add("%s/opendocument.xml" % filename, document_xml) 86 | process_pipeline(document_xml, pipeline_id, pipeline_type, auto_pipeline_id, storage, filename) 87 | storage.remove("%s/opendocument.xml" % filename) 88 | return storage 89 | 90 | def process_pipeline(initial_pipeline_value, pipeline_id, pipeline_type, auto_pipeline_id, storage, storage_prefix=None): 91 | pipeline_definition = docvert_pipeline.get_pipeline_definition(pipeline_type, pipeline_id, auto_pipeline_id) 92 | pipeline = docvert_pipeline.pipeline_processor(storage, pipeline_definition['stages'], pipeline_definition['pipeline_directory'], storage_prefix) 93 | return pipeline.start(initial_pipeline_value) 94 | 95 | def generate_open_document(data, converter=converter_type.python_streaming_to_libreoffice): 96 | if converter == converter_type.python_streaming_to_libreoffice: 97 | return docvert_libreoffice.get_client().convert_by_stream(data, docvert_libreoffice.LIBREOFFICE_OPEN_DOCUMENT) 98 | raise docvert_exception.unrecognised_converter("Unknown converter '%s'" % converter) 99 | 100 | def html_to_opendocument(html, url): 101 | from BeautifulSoup import BeautifulSoup 102 | import html.entities 103 | import re 104 | 105 | def to_ncr(match): 106 | text = match.group(0) 107 | entity_string = text[1:-1] 108 | entity = html.entities.entitydefs.get(entity_string) 109 | if entity: 110 | if len(entity) > 1: 111 | return entity 112 | try: 113 | return "&#%s;" % ord(entity) 114 | except ValueError: 115 | pass 116 | except TypeError as e: 117 | print("TypeError on '%s'?" % entity) 118 | raise 119 | return text 120 | 121 | soup = BeautifulSoup(html, convertEntities=BeautifulSoup.XML_ENTITIES) 122 | to_extract = soup.findAll('script') 123 | for item in to_extract: 124 | item.extract() 125 | pretty_xml = soup.html.prettify() 126 | pretty_xml = re.sub("&?\w+;", to_ncr, pretty_xml) 127 | pretty_xml = re.sub('&(\w+);', '&\\1', pretty_xml) 128 | pretty_xml = pretty_xml.replace("& ", "& ") 129 | #display_lines(pretty_xml, 5, 15) 130 | xml = docvert_xml.get_document(pretty_xml) 131 | storage = docvert_storage.get_storage(docvert_storage.storage_type.memory_based) 132 | result = process_pipeline(xml, 'default', 'html_to_opendocument', None, storage) 133 | #print result 134 | #print storage 135 | return result 136 | 137 | def display_lines(data, start_line, end_line): 138 | data = data.split("\n") 139 | segment = data[start_line:end_line] 140 | for line in segment: 141 | print("%s%s" % (start_line, line)) 142 | start_line += 1 143 | 144 | 145 | def get_all_pipelines(include_default_autopipeline = True): 146 | def _title(name): 147 | if name.endswith('.default'): 148 | name = name[0:-len('.default')] 149 | return name.replace('_',' ').replace('-',' ').title() 150 | 151 | pipeline_types_path = os.path.join(docvert_root, "pipelines") 152 | pipeline_types = dict() 153 | for pipeline_type in os.listdir(pipeline_types_path): 154 | pipeline_types[pipeline_type] = list() 155 | for pipeline_directory in os.listdir(os.path.join(pipeline_types_path, pipeline_type)): 156 | if pipeline_directory == 'ssc': #don't show this pipeline publicly. it's not important. 157 | pass 158 | elif include_default_autopipeline is False and pipeline_type == "auto_pipelines" and "nothing" in pipeline_directory.lower(): 159 | pass #print "Skipping?" 160 | else: 161 | pipeline_types[pipeline_type].append(dict(id=pipeline_directory, name=_title(pipeline_directory))) 162 | return pipeline_types 163 | 164 | 165 | -------------------------------------------------------------------------------- /core/docvert_exception.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class docvert_exception(Exception): 4 | pass 5 | 6 | class needs_files_or_urls(docvert_exception): 7 | pass 8 | 9 | class unrecognised_pipeline(docvert_exception): 10 | pass 11 | 12 | class unrecognised_auto_pipeline(docvert_exception): 13 | pass 14 | 15 | class unrecognised_converter(docvert_exception): 16 | pass 17 | 18 | class converter_unable_to_generate_open_document(docvert_exception): 19 | pass 20 | 21 | class converter_unable_to_generate_pdf(docvert_exception): 22 | pass 23 | 24 | class unknown_docvert_process(docvert_exception): 25 | pass 26 | 27 | class unable_to_serialize_opendocument(docvert_exception): 28 | pass 29 | 30 | class unrecognised_pipeline_item(docvert_exception): 31 | pass 32 | 33 | class unrecognised_storage_type(docvert_exception): 34 | pass 35 | 36 | class unknown_pipeline_node(docvert_exception): 37 | pass 38 | 39 | class unknown_docvert_process(docvert_exception): 40 | pass 41 | 42 | class tests_disabled(docvert_exception): 43 | pass 44 | 45 | class unable_to_generate_xml_document(docvert_exception): 46 | pass 47 | 48 | class invalid_test_root_node(docvert_exception): 49 | pass 50 | 51 | class invalid_test_child_node(docvert_exception): 52 | pass 53 | 54 | class debug_exception(docvert_exception): 55 | def __init__(self, message, data, content_type): 56 | self.data = data 57 | self.content_type = content_type 58 | super(docvert_exception, self).__init__(message) 59 | 60 | class debug_xml_exception(debug_exception): 61 | pass 62 | -------------------------------------------------------------------------------- /core/docvert_html.py: -------------------------------------------------------------------------------- 1 | # Based on code from 2 | # http://stackoverflow.com/questions/257409/download-image-file-from-the-html-page-source-using-python 3 | # but multithreaded, etc. 4 | from BeautifulSoup import BeautifulSoup as bs 5 | import urllib.parse 6 | from urllib.request import urlopen 7 | from urllib.request import urlretrieve 8 | import os 9 | import sys 10 | 11 | def get_urls(url, storage, storage_prefix): 12 | """Downloads all the images at 'url' to /test/""" 13 | soup = bs(urlopen(url)) 14 | parsed = list(urllib.parse.urlparse(url)) 15 | 16 | for image in soup.findAll("img"): 17 | print("Image: %(src)s" % image) 18 | filename = image["src"].split("/")[-1] 19 | parsed[2] = image["src"] 20 | storage_path = os.path.join(storage_prefix, filename) 21 | 22 | url = urllib.parse.urlunparse(parsed) 23 | if url.lower().startswith("http"): 24 | url = image["src"] 25 | data = urllib.request.urlopen(url) 26 | -------------------------------------------------------------------------------- /core/docvert_libreoffice.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from os.path import abspath 3 | from os.path import isfile 4 | from os.path import splitext 5 | import sys 6 | import io 7 | from . import document_type 8 | from . import docvert_exception 9 | import socket 10 | 11 | DEFAULT_LIBREOFFICE_PORT = 2002 12 | LIBREOFFICE_OPEN_DOCUMENT = 'writer8' 13 | LIBREOFFICE_PDF = 'writer_pdf_Export' 14 | 15 | client = None 16 | 17 | try: 18 | import uno 19 | except ImportError: 20 | sys.path.append('/opt/libreoffice/program/') 21 | sys.path.append('/usr/lib/libreoffice/program/') 22 | sys.path.append('/usr/share/libreoffice/program/') 23 | sys.path.append('/usr/lib/openoffice.org/program/') 24 | sys.path.append('/usr/lib/openoffice.org2.0/program/') 25 | try: 26 | import uno 27 | except ImportError: 28 | python_version_info = sys.version_info 29 | python_version = "%s.%s.%s" % (python_version_info[0], python_version_info[1], python_version_info[2]) 30 | sys.stderr.write("Error: Unable to find Python UNO libraries in %s.\nAre Python UNO libraries somewhere else?\nAlternatively, Docvert is currently running Python %s. Maybe the libraries are available under a different version of Python?)\nExiting...\n" % (sys.path, python_version)) 31 | sys.exit(0) 32 | 33 | import unohelper 34 | from com.sun.star.beans import PropertyValue 35 | from com.sun.star.task import ErrorCodeIOException 36 | from com.sun.star.uno import Exception as UnoException 37 | from com.sun.star.connection import NoConnectException 38 | from com.sun.star.io import XOutputStream 39 | 40 | class output_stream_wrapper(unohelper.Base, XOutputStream): 41 | def __init__(self): 42 | self.data = io.BytesIO() 43 | self.position = 0 44 | 45 | def writeBytes(self, bytes): 46 | self.data.write(bytes.value) 47 | self.position += len(bytes.value) 48 | 49 | def close(self): 50 | self.data.close() 51 | 52 | def flush(self): 53 | pass 54 | 55 | 56 | class libreoffice_client(object): 57 | def __init__(self, port=DEFAULT_LIBREOFFICE_PORT): 58 | self._local_context = uno.getComponentContext() 59 | self._service_manager = self._local_context.ServiceManager 60 | resolver = self._service_manager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", self._local_context) 61 | try: 62 | context = resolver.resolve("uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port) 63 | except NoConnectException as exception: 64 | raise Exception("Failed to connect to LibreOffice on port %s. Python 3 UNO library said: \n\n\t%s\n\nIf you don't have a server then read README for 'OPTIONAL LIBRARIES' to see how to set one up." % (port, exception)) 65 | self._desktop = context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", context) 66 | 67 | def convert_by_stream(self, data, format=LIBREOFFICE_OPEN_DOCUMENT): 68 | input_stream = self._service_manager.createInstanceWithContext("com.sun.star.io.SequenceInputStream", self._local_context) 69 | 70 | 71 | data.seek(0) 72 | print(data, format) 73 | print(data.read(100)) 74 | data.seek(0) 75 | handle = open('/tmp/what', 'wb') 76 | handle.write(data.read()) 77 | 78 | 79 | data.seek(0) 80 | input_stream.initialize((uno.ByteSequence(data.read()),)) 81 | #try: 82 | document = self._desktop.loadComponentFromURL('private:stream', "_blank", 8, self._to_properties(InputStream=input_stream)) 83 | 84 | try: 85 | document.refresh() 86 | except AttributeError: 87 | pass 88 | 89 | #except Exception as e: 90 | # print("Lost connection to LibreOffice. Trying to reconnect...") 91 | # self.__init__(); #try to reconnect 92 | # document = self._desktop.loadComponentFromURL('private:stream', "_blank", 0, self._to_properties(InputStream=input_stream)) 93 | if not document: 94 | raise Exception("Error making document") 95 | 96 | input_stream.closeInput() 97 | output_stream = output_stream_wrapper() 98 | #try: 99 | document.storeToURL('private:stream', self._to_properties(OutputStream=output_stream, FilterName=format, Hidden=True)) 100 | #except Exception as e: #ignore any error, verify the output before complaining 101 | #print(e) 102 | # pass 103 | #finally: 104 | # document.close(True) 105 | # pass 106 | if format == LIBREOFFICE_OPEN_DOCUMENT or format == LIBREOFFICE_PDF: 107 | doc_type = document_type.detect_document_type(output_stream.data) 108 | output_stream.data.seek(0) 109 | if format == LIBREOFFICE_OPEN_DOCUMENT and doc_type != document_type.types.oasis_open_document: 110 | raise docvert_exception.converter_unable_to_generate_open_document("Unable to generate OpenDocument, was detected as %s.\n\nAre you sure you tried to convert an office document? If so then it\nmight be a bug, so please contact docvert@holloway.co.nz and we'll see\nif we can fix it. Thanks!" % doc_type) 111 | elif format == LIBREOFFICE_PDF and doc_type != document_type.types.pdf: 112 | raise docvert_exception.converter_unable_to_generate_pdf("Unable to generate PDF, was detected as %s. First 4 bytes = %s" % (doc_type, output_stream.data.read(4))) 113 | return output_stream.data 114 | 115 | def _to_properties(self, **args): 116 | props = [] 117 | for key in args: 118 | prop = PropertyValue() 119 | prop.Name = key 120 | prop.Value = args[key] 121 | props.append(prop) 122 | return tuple(props) 123 | 124 | def checkLibreOfficeStatus(): 125 | try: 126 | libreoffice_client() 127 | return True 128 | except Exception: 129 | return False 130 | 131 | def get_client(): 132 | global client 133 | if client is None: 134 | client = libreoffice_client() 135 | return client 136 | 137 | -------------------------------------------------------------------------------- /core/docvert_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import lxml.etree 4 | from . import docvert_exception 5 | from . import docvert_xml 6 | 7 | docvert_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | 9 | def get_pipeline_definition(pipeline_type, pipeline_id, auto_pipeline_id): 10 | pipeline = get_pipeline_xml(pipeline_type, pipeline_id, auto_pipeline_id) 11 | pipeline['stages'] = process_stage_level( pipeline['xml'].getroot() ) 12 | return pipeline 13 | 14 | def process_stage_level(nodes): 15 | stages = list() 16 | for child_node in nodes: 17 | if child_node.tag != "stage": 18 | continue 19 | child = dict() 20 | child['attributes'] = child_node.attrib 21 | child['children'] = None 22 | if(len(child_node) > 0): 23 | child['children'] = process_stage_level(child_node) 24 | stages.append(child) 25 | return stages 26 | 27 | def get_pipeline_xml(pipeline_type, pipeline_id, auto_pipeline_id): 28 | path = os.path.join(docvert_root, "pipelines", pipeline_type, pipeline_id, "pipeline.xml") 29 | if not os.path.exists(path): 30 | raise docvert_exception.unrecognised_pipeline("Unknown pipeline_id '%s' (checked %s)" % (pipeline_id, path)) 31 | autopipeline_path = None 32 | xml = lxml.etree.parse(path) 33 | if xml.getroot().tag == "autopipeline": 34 | if auto_pipeline_id is None: 35 | raise docvert_exception.unrecognised_auto_pipeline("Unknown auto pipeline '%s'" % auto_pipeline_id) 36 | autopipeline_path = os.path.join(docvert_root, "pipelines", "auto_pipelines", auto_pipeline_id, "pipeline.xml") 37 | if not os.path.exists(path): 38 | raise docvert_exception.unrecognised_auto_pipeline("Unknown auto pipeline '%s'" % auto_pipeline_id) 39 | custom_stages = lxml.etree.tostring(xml.getroot()).decode('utf-8') 40 | autopipeline = "" 41 | try: 42 | autopipeline_handle = open(autopipeline_path) 43 | except IOError as e: 44 | autopipeline_path_with_default = os.path.join(docvert_root, "pipelines", "auto_pipelines", "%s.default" % auto_pipeline_id, "pipeline.xml") 45 | autopipeline_handle = open(autopipeline_path_with_default) 46 | autopipeline = autopipeline_handle.read().replace('{{custom-stages}}', custom_stages) 47 | autopipeline = docvert_xml.strip_encoding_declaration(autopipeline) 48 | xml = lxml.etree.fromstring(autopipeline) 49 | xml = xml.getroottree() 50 | return dict(xml=xml, pipeline_directory=os.path.dirname(path), path=path, autopath=autopipeline_path) 51 | 52 | class pipeline_processor(object): 53 | """ Processes through a list() of pipeline_item(s) """ 54 | def __init__(self, storage, pipeline_items, pipeline_directory, pipeline_storage_prefix=None, depth=None): 55 | self.storage = storage 56 | self.pipeline_items = pipeline_items 57 | self.pipeline_directory = pipeline_directory 58 | self.pipeline_storage_prefix = pipeline_storage_prefix 59 | self.depth = list() if depth is None else depth 60 | 61 | def start(self, pipeline_value): 62 | for item in self.pipeline_items: 63 | process = item['attributes']['process'] 64 | namespace = 'core.pipeline_type' 65 | full_pipeline_type = "%s.%s" % (namespace, process.lower()) 66 | #try: 67 | stage_module = __import__(full_pipeline_type, {}, {}, [full_pipeline_type.rsplit(".", 1)[-1]]) 68 | stage_class = getattr(stage_module, process) 69 | stage_instance = stage_class(self.storage, self.pipeline_directory, item['attributes'], self.pipeline_storage_prefix, item['children'], self.depth) 70 | pipeline_value = stage_instance.stage(pipeline_value) 71 | #except ImportError, exception: 72 | # raise exception 73 | # raise docvert_exception.unknown_docvert_process('Unknown pipeline process of "%s" (at %s)' % (process, "%s.%s" % (namespace, process.lower()) )) 74 | return pipeline_value 75 | 76 | -------------------------------------------------------------------------------- /core/docvert_storage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import zipfile 3 | import io 4 | import time 5 | from . import docvert_exception 6 | import core.docvert_xml 7 | import core.docvert_exception 8 | 9 | 10 | class storage_type(object): 11 | file_based = "file based storage" 12 | memory_based = "memory based storage" 13 | 14 | def get_storage(name): 15 | if name == storage_type.file_based: 16 | return storage_file_based() 17 | elif name == storage_type.memory_based: 18 | return storage_memory_based() 19 | raise docvert_exception.unrecognised_storage_type("Unknown storage type '%s'" % name) 20 | 21 | class storage(object): 22 | _docvert_xml_namespace = '{docvert:5}' 23 | 24 | def __init__(self, *args, **kargs): 25 | raise NotImplemented() 26 | 27 | def __setitem__(self, key, value): 28 | self.add(key, value) 29 | 30 | def __contains__(self, key): 31 | raise NotImplemented() 32 | 33 | def keys(self): 34 | return list(self.storage.keys()) 35 | 36 | def has_key(self, key): 37 | return key in self.storage 38 | 39 | def __getitem__(self, key): 40 | return self.get(key) 41 | 42 | def set_friendly_name(self, filename, friendly_name): 43 | raise NotImplemented() 44 | 45 | def add_tests(self, tests): 46 | if not hasattr(self, 'tests'): 47 | self.tests = [] 48 | if type(tests) == type([]): #assume correctly formatted list 49 | return self.tests.extend(tests) 50 | document = core.docvert_xml.get_document(tests) 51 | if hasattr(document, 'getroottree'): 52 | document = document.getroottree() 53 | root = document.getroot() 54 | if root.tag != "%sgroup" % self._docvert_xml_namespace: 55 | raise docvert_exception.invalid_test_root_node("Error parsing test results. Expected a root node of 'group' but got '%s'" % root.tag) 56 | for child in root: 57 | if child.tag == "%spass" % self._docvert_xml_namespace: 58 | self.tests.append( {"status":"pass", "message":str(child.text)} ) 59 | elif child.tag == "%sfail" % self._docvert_xml_namespace: 60 | self.tests.append(dict(status="fail", message=str(child.text))) 61 | else: 62 | raise invalid_test_child_node("Error parsing test results. Unexpected child element of '%s' %s" % (child.tag, child)) 63 | 64 | def get_tests(self): 65 | if hasattr(self, 'tests'): 66 | return self.tests 67 | return list() 68 | 69 | def get_zip_name(self): 70 | raise NotImplemented("No implemented, yet...") 71 | 72 | class storage_file_based(storage): 73 | def __init__(self): 74 | self.working_directory = tempfile.mkdtemp() 75 | self.created_at = time.time() 76 | self.default_document = None 77 | 78 | def add(self, path, data): 79 | handler = open(os.path.join(self.working_directory, path), 'w') 80 | handler.write(data) 81 | handler.close() 82 | 83 | def set_friendly_name(self, filename, friendly_name): 84 | raise NotImplemented() 85 | 86 | def get(self, path): 87 | handler = open(os.path.join(self.working_directory, path), 'r') 88 | return handler.read() 89 | 90 | def _dispose(self): 91 | os.removedirs(self.working_directory) 92 | 93 | def get_zip_name(self): 94 | raise NotImplemented("No implemented, yet...") 95 | 96 | def to_zip(self): 97 | raise NotImplemented("Not implemented, yet...") 98 | 99 | def __str__(self): 100 | return '' % self.working_directory 101 | 102 | 103 | class storage_memory_based(storage): 104 | def __init__(self): 105 | self.storage = dict() 106 | self.created_at = time.time() 107 | self.default_document = None 108 | self.friendly_names = dict() 109 | 110 | def add(self, path, data): 111 | self.storage[path] = data 112 | 113 | def set_friendly_name(self, filename, friendly_name): 114 | self.friendly_names[filename] = friendly_name 115 | 116 | def get_friendly_name_if_available(self, filename): 117 | if filename in self.friendly_names: 118 | return self.friendly_names[filename] 119 | return filename 120 | 121 | def keys(self): 122 | return list(self.storage.keys()) 123 | 124 | def get(self, path): 125 | return self.storage[path] 126 | 127 | def remove(self, path): 128 | del self.storage[path] 129 | 130 | def __delitem__(self, path): 131 | del self.storage[path] 132 | 133 | def __contains__(self, key): 134 | return key in self.storage 135 | 136 | def to_zip(self): 137 | zipdata = io.BytesIO() 138 | archive = zipfile.ZipFile(zipdata, 'w') 139 | for key, value in self.storage.items(): 140 | data = value 141 | if hasattr(value, "read"): 142 | data = value.seek(0) 143 | data = value.read() 144 | if not key.startswith("__"): #if it's not internal data 145 | archive.writestr(key.replace("\\", "/"), data) 146 | archive.close() 147 | return zipdata 148 | 149 | def get_zip_name(self): 150 | friendly_names = ", ".join(list(self.friendly_names.keys())) 151 | if friendly_names != "": 152 | friendly_names = "-%s" % friendly_names 153 | zip_name = "%s%s" % (time.strftime("docvert-%Y%m%d%H%M"), friendly_names) 154 | return zip_name.replace("\"","").replace("\n","").replace("\r","").replace("\\","") 155 | 156 | def _dispose(self): 157 | pass 158 | 159 | def __str__(self): 160 | return '' % list(self.storage.keys()) 161 | -------------------------------------------------------------------------------- /core/docvert_url.py: -------------------------------------------------------------------------------- 1 | #based on https://github.com/shazow/workerpool/wiki/Mass-Downloader 2 | import os 3 | import urllib.request, urllib.error, urllib.parse 4 | import lib.workerpool 5 | 6 | class DownloadUrl(lib.workerpool.Job): 7 | def __init__(self, url, http_timeout=10): 8 | self.url = url 9 | self.http_timeout = http_timeout 10 | 11 | def run(self): 12 | try: 13 | self.response = urllib.request.urlopen(self.url, None, self.http_timeout).read() 14 | except urllib.error.URLError as e: 15 | self.response = e 16 | 17 | def download(urls, workerpool_size=5): 18 | pool = lib.workerpool.WorkerPool(size=workerpool_size) 19 | for url in urls: 20 | pool.put(DownloadUrl(url)) 21 | pool.shutdown() 22 | pool.wait() 23 | print(dir(pool)) 24 | 25 | def demo(): 26 | download([ 27 | 'https://github.com/shazow/workerpool/wiki/Mass-Downloader', 28 | 'http://yahoo.com', 29 | 'http://twitter.com/', 30 | 'http://www.google.com/', 31 | 'http://www.stuff.co.nz/', 32 | 'http://trademe.co.nz/', 33 | 'http://av.com/', 34 | 'http://reddit.com/', 35 | 'http://slashdot.org/' 36 | ]) 37 | -------------------------------------------------------------------------------- /core/docvert_xml.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from . import docvert_exception 3 | import lxml.etree 4 | import xml.sax.saxutils 5 | import re 6 | 7 | def transform(data, xslt, params=None): 8 | if params is None: 9 | params = dict() 10 | xslt_document = get_document(xslt) 11 | xslt_processor = lxml.etree.XSLT(xslt_document) 12 | xml_document = get_document(data) 13 | params = convert_dict_to_params(params) 14 | return xslt_processor(xml_document, **params) 15 | 16 | def relaxng(data, relaxng_path): 17 | relaxng_document = get_document(relaxng_path) 18 | xml_document = get_document(data) 19 | relaxng_processor = lxml.etree.RelaxNG(relaxng_document) 20 | is_valid = relaxng_processor.validate(xml_document) 21 | return dict(valid=is_valid, log=relaxng_processor.error_log) 22 | 23 | def escape_text(text): 24 | return xml.sax.saxutils.escape(text) 25 | 26 | def get_document(data): 27 | if isinstance(data, lxml.etree._Element): 28 | return data 29 | elif isinstance(data, lxml.etree._XSLTResultTree): 30 | return data 31 | elif hasattr(data, 'read'): 32 | data.seek(0) 33 | return lxml.etree.XML(data.read()) 34 | elif data[0:1] == "/" or data[0:1] == "\\": #path 35 | return lxml.etree.XML(strip_encoding_declaration(open(data).read())) 36 | elif data[0:1] == "<": #xml 37 | return lxml.etree.XML(data) 38 | else: #last ditch attempt... 39 | return lxml.etree.XML(data) 40 | raise docvert_exception.unable_to_generate_xml_document() 41 | 42 | def convert_dict_to_params(params): 43 | for key in list(params.keys()): 44 | params[key] = "'%s'" % params[key] 45 | return params 46 | 47 | def file_as_string(path): 48 | pass 49 | 50 | def strip_encoding_declaration(xml_string): 51 | return re.sub('<\?.*?\?>','', xml_string) -------------------------------------------------------------------------------- /core/linux/debian/README: -------------------------------------------------------------------------------- 1 | docvert-webserver.sh -- A start/stop daemon for Docvert 2 | -------------------------------------------------------------------------------- /core/linux/debian/docvert-webserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | ### BEGIN INIT INFO 3 | # Provides: docvert 4 | # Default-Start: 2 3 4 5 5 | # Default-Stop: 0 1 6 6 | # Required-Start: $local_fs $network_fs $network $syslog 7 | # Required-Stop: $local_fs $network_fs $network $syslog 8 | # Short-Description: Docvert web server 9 | # Description: Start/stop Docvert in server mode 10 | ### END INIT INFO 11 | 12 | scriptDirectory=$( dirname "$0" ) 13 | scriptDirectory=$( cd $scriptDirectory; pwd ) 14 | 15 | docvertcommand="/opt/docvert/docvert-web.py" 16 | kill="/bin/kill" 17 | pidfile="/tmp/docvert-webserver.pid" 18 | startStopDaemon="/sbin/start-stop-daemon" 19 | rpmDaemon="/usr/bin/daemon" 20 | whoami=$( whoami ) 21 | 22 | export HOME=/tmp/ 23 | 24 | if [ -e "$startStopDaemon" ] 25 | then 26 | if [ -n "$username" ] 27 | then 28 | username="-c $username " 29 | fi 30 | if [ -n "$groupname" ] 31 | then 32 | groupname="-g $groupname " 33 | fi 34 | elif [ -e "$rpmDaemon" ] 35 | then 36 | if [ -n "$username" ] 37 | then 38 | username="--user=$username" 39 | if [ -n "$groupname" ] 40 | then 41 | username="$username.$groupname" 42 | fi 43 | fi 44 | fi 45 | 46 | docvert_start() 47 | { 48 | if [ -s "$pidfile" ] 49 | then 50 | pid=$( cat "$pidfile" ) 51 | echo "Already running? pid file exists at $pidfile that contains process #$pid" 52 | else 53 | rm -f "$pidfile" 54 | if [ -e "$startStopDaemon" ] 55 | then 56 | $startStopDaemon -b -m --pidfile $pidfile --start --exec "$docvertcommand" 57 | elif [ -e "$rpmDaemon" ] 58 | then 59 | $rpmDaemon --pidfile=$pidfile $docvertcommand 60 | else 61 | echo "Warning: Unable to find $startStopDaemon or $rpmDaemon so instead I'll daemonize by forking as the current user, $whoami." 62 | $docvertcommand & 63 | fi 64 | return 0 65 | fi 66 | } 67 | 68 | docvert_stop() 69 | { 70 | if [ -s "$pidfile" ] 71 | then 72 | pid=$( cat "$pidfile" ) 73 | $kill $pid 74 | sleep 1 75 | $kill -s 9 "$pid" > /dev/null 2>&1 & 76 | remainingProcess=$( ps "$pid" | grep $pid ) 77 | if [ -n $remainingProcess ] 78 | then 79 | rm -f "$pidfile" 80 | echo "Successfully killed process #$pid" 81 | return 0 82 | else 83 | echo "Unable to kill process #$pid. Check permissions? (remaining processes '$remainingProcess')" 84 | return 0 85 | fi 86 | else 87 | echo "Stopped. Warning: No pid file found at $pidfile so I'm assuming it was never running." 88 | fi 89 | } 90 | 91 | case "$1" in 92 | start) 93 | docvert_start 94 | ;; 95 | stop) 96 | docvert_stop 97 | ;; 98 | restart) 99 | docvert_stop 100 | sleep 1 101 | docvert_start 102 | ;; 103 | *) 104 | echo "Usage: docvert-webserver.init.sh {start|stop|restart}" 105 | exit 1 106 | ;; 107 | esac 108 | 109 | -------------------------------------------------------------------------------- /core/opendocument.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import zipfile 3 | import io 4 | import lxml.etree 5 | import os.path 6 | 7 | def extract_useful_open_document_files(data, storage=None, prefix=None): 8 | archive = zipfile.ZipFile(data) 9 | archive_files = archive.namelist() 10 | xml_string = extract_xml(archive, archive_files) 11 | if storage is None: #we can't extract binaries 12 | return xml_string 13 | return extract_useful_binaries(archive, archive_files, storage, prefix, xml_string) 14 | 15 | def extract_thumbnail(data): 16 | archive = zipfile.ZipFile(data) 17 | thumbnail_path = 'Thumbnails/thumbnail.png' 18 | archive_files = archive.namelist() 19 | if thumbnail_path in archive_files: 20 | return archive.open(thumbnail_path).read() 21 | return None 22 | 23 | def extract_useful_binaries(archive, archive_files, storage, prefix, xml_string): 24 | xlink_namespace = "http://www.w3.org/1999/xlink" 25 | xpath_template = '//*[@{%s}href="%s"]' % (xlink_namespace, '%s') 26 | document = lxml.etree.fromstring(xml_string.getvalue()) 27 | extensions = [".wmf", ".emf", ".svg", ".png", ".gif", ".bmp", ".jpg", ".jpe", ".jpeg"] 28 | index = 0 29 | for archive_path in archive_files: 30 | path_minus_extension, extension = os.path.splitext(archive_path) 31 | if extension in extensions: 32 | storage_path = "%s/file%i.%s" % (prefix, index, extension) 33 | try: 34 | storage_path = "%s/%s" % (prefix, os.path.basename(archive_path)) 35 | except UnicodeDecodeError as e: 36 | pass 37 | #step 1. extract binaries 38 | storage[storage_path] = archive.open(archive_path).read() 39 | #step 2. update XML references 40 | path_relative_to_xml = os.path.basename(archive_path) 41 | xpath = lxml.etree.ETXPath(xpath_template % archive_path) 42 | for match in xpath(document): 43 | match.attrib['{%s}href' % xlink_namespace] = storage_path 44 | index += 1 45 | return io.StringIO(lxml.etree.tostring(document).decode('utf-8')) 46 | 47 | def extract_xml(archive, archive_files): 48 | xml_files_to_extract = ["content.xml", "meta.xml", "settings.xml", "styles.xml"] 49 | xml_string = io.StringIO() 50 | xml_string.write('') 51 | for xml_file_to_extract in xml_files_to_extract: 52 | if xml_file_to_extract in archive_files: 53 | xml_string.write('' % xml_file_to_extract) 54 | xml_document_string = archive.open(xml_file_to_extract).read().decode('utf-8').replace('', '') 55 | document = lxml.etree.fromstring(xml_document_string) #parsing as XML to remove any doctype 56 | xml_string.write(lxml.etree.tostring(document).decode('utf-8')) 57 | xml_string.write('') 58 | xml_string.write('') 59 | return xml_string 60 | 61 | def generate_single_image_document(image_data, width, height): 62 | content_xml = """ 63 | 64 | 65 | 66 | %s 67 | 68 | 69 | """ 70 | mimetype = 'application/vnd.oasis.opendocument.text' 71 | image_xml = """ 72 | 73 | 74 | """ 75 | image_path = "Pictures/image.png" 76 | manifest = """ 77 | 78 | 79 | 80 | 81 | 82 | 83 | """ 84 | styles_xml = """ 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | """ 97 | image_xml = image_xml % (width, height, image_path) #filename doesn't matter 98 | zipio = io.BytesIO() 99 | archive = zipfile.ZipFile(zipio, 'w') 100 | archive.writestr('mimetype', mimetype) 101 | archive.writestr('content.xml', content_xml % image_xml) 102 | archive.writestr('styles.xml', styles_xml % (width, height)) 103 | archive.writestr('META-INF/manifest.xml', manifest % image_path) 104 | archive.writestr(image_path, image_data) 105 | archive.close() 106 | zipio.seek(0) 107 | #pointer = open('/tmp/doc.odt', 'wb') 108 | #pointer.write(zipio.read()) 109 | #pointer.close() 110 | return zipio 111 | 112 | -------------------------------------------------------------------------------- /core/pipeline_type/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holloway/docvert-python3/a16d9ff0921186355e80c6f4905a00f20c12cf5f/core/pipeline_type/__init__.py -------------------------------------------------------------------------------- /core/pipeline_type/compare.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import os.path 4 | import lxml.etree 5 | from . import pipeline_item 6 | import core.docvert 7 | import core.docvert_exception 8 | import core.docvert_xml 9 | import core.document_type 10 | import core.opendocument 11 | 12 | class Compare(pipeline_item.pipeline_stage): 13 | def stage(self, pipeline_value): 14 | if pipeline_value is None: 15 | raise pipeline_value_not_empty("A process type of Compare needs pipeline_value to compare with.") 16 | if 'withFile' not in self.attributes: 17 | raise needs_with_file_attribute("A process type of Compare needs a withFile attribute containing a filename/path.") 18 | compare_path = self.resolve_pipeline_resource(self.attributes['withFile']) 19 | if not os.path.exists(compare_path): 20 | raise generation_file_not_found("A process type of Compare couldn't find a file at %s" % compare_path) 21 | compare_data = file(compare_path) 22 | compare_xml = None 23 | doc_type = core.document_type.detect_document_type(compare_data) 24 | if doc_type == core.document_type.types.oasis_open_document: 25 | compare_xml = core.opendocument.extract_useful_open_document_files(compare_data) 26 | elif doc_type == core.document_type.types.xml: 27 | compare_xml = compare_data 28 | else: 29 | raise cannot_compare_with_non_xml_or_non_opendocument("Cannot compare withFile=%s with detected type of %s" % (compare_path, doc_type)) 30 | turn_document_into_test_filename = "internal://turn-document-into-test.xsl" 31 | xslt_path = self.resolve_pipeline_resource(turn_document_into_test_filename) 32 | test_xslt = core.docvert_xml.transform(compare_data, xslt_path) 33 | storage_filename = "comparision-to-%s.xhtml" % self.attributes['withFile'] 34 | storage_path = "%s/%s" % (self.pipeline_storage_prefix, storage_filename) 35 | if self.pipeline_storage_prefix is None: 36 | storage_path = storage_filename 37 | print(storage_path) 38 | storage[storage_path] = core.docvert_xml.transform(pipeline_value, test_as_xslt) 39 | return pipeline_value 40 | 41 | class pipeline_value_not_empty(core.docvert_exception.docvert_exception): 42 | pass 43 | 44 | class needs_with_file_attribute(core.docvert_exception.docvert_exception): 45 | pass 46 | 47 | class generation_file_not_found(core.docvert_exception.docvert_exception): 48 | pass 49 | 50 | class cannot_compare_with_non_xml_or_non_opendocument(core.docvert_exception.docvert_exception): 51 | pass 52 | 53 | -------------------------------------------------------------------------------- /core/pipeline_type/convertimages.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import os.path 4 | import tempfile 5 | import io 6 | import subprocess 7 | from PIL import Image # Python PIL 8 | from . import pipeline_item 9 | import core.docvert_xml 10 | import core.opendocument 11 | import core.docvert_libreoffice 12 | import lxml.etree 13 | 14 | class ConvertImages(pipeline_item.pipeline_stage): 15 | synonym_formats = dict( #Not just synonyms, but types of files that are converted using the same code (eg, emf=wmf) 16 | emf='wmf',wmf='wmf',#horrible old vector 17 | pdf='pdf', ps='pdf', #moderately horrible vector 18 | svg='svg',#vector 19 | ani='png',apng='png',art='png',bef='png',bmf='png',bmp='png',cgm='png',cin='png',cpc='png',dpx='png',ecw='png',exr='png',fits='png',flic='png',fpx='png',gif='png',icer='png',ics='png',iff='png',iges='png',ilbm='png',jbig='png',jbig2='png',jng='png',jpe='png',jpg='png',jpeg='png',jp2='png',mng='png',miff='png',pbm='png',pcx='png',pgf='png',pgm='png',png='png',ppm='png',psp='png',raw='png',rad='png',rgbe='png',sgi='png',tga='png',tif='png',tiff='png',webp='png',xar='png',xbm='png',xcf='png',xpm='png' #bitmap 20 | ) 21 | 22 | def stage(self, pipeline_value): 23 | self.intermediate_files = list() 24 | intermediate_file_extensions_to_retain = list() 25 | #TODO add format sniffing code 26 | conversions = dict() 27 | if '__convertimages' not in self.storage: 28 | self.storage['__convertimages'] = dict() 29 | # 1. Parse conversion requests 30 | formats = ("%s," % self.attributes["formats"]).split(",") 31 | for format in formats: 32 | conversion = format.strip(" ._-\n\r").lower() 33 | if len(conversion) == 0: continue 34 | from_format, to_format = conversion.split("2") 35 | if from_format in self.synonym_formats: 36 | from_format = self.synonym_formats[from_format] 37 | if from_format not in conversions: 38 | conversions[from_format] = list() 39 | intermediate_file_extensions_to_retain.append(str(to_format)) 40 | conversions[str(from_format)].append(str(to_format)) 41 | 42 | # 2. Convert images 43 | # 44 | storage_paths = list(self.storage.keys()) 45 | for storage_path in storage_paths: 46 | if self.pipeline_storage_prefix and not storage_path.startswith(self.pipeline_storage_prefix): 47 | continue 48 | path, extension = os.path.splitext(storage_path) 49 | extension_minus_dot = str(extension[1:]) 50 | for from_format, to_formats in conversions.items(): 51 | from_format_method = "convert_%s" % extension_minus_dot 52 | if extension_minus_dot == from_format and hasattr(self, from_format_method): 53 | for to_format in to_formats: 54 | pipeline_value = getattr(self, from_format_method)(storage_path, to_format, pipeline_value) 55 | 56 | # 3. Delete original images 57 | if "deleteOriginals" in self.attributes and not self.attributes["deleteOriginals"].strip().lower() in ['false','f','n','0','']: 58 | for storage_path in storage_paths: 59 | if not storage_path.startswith(self.pipeline_storage_prefix): 60 | continue 61 | extension = os.path.splitext(storage_path)[1][1:] 62 | #if conversions.has_key(extension): 63 | # self.storage.remove(storage_path) 64 | 65 | for intermediate_file in self.intermediate_files: 66 | path, extension = os.path.splitext(intermediate_file) 67 | extension_minus_dot = str(extension[1:]) 68 | if not extension_minus_dot in intermediate_file_extensions_to_retain: 69 | try: 70 | del self.storage[intermediate_file] 71 | except KeyError as e: 72 | pass 73 | 74 | return pipeline_value 75 | 76 | def convert_wmf(self, storage_path, to_format, pipeline_value, width=None, height=None): 77 | # We can't reliably parse wmf/emf here so use LibreOffice to generate PDF no matter the to_format 78 | path, extension = os.path.splitext(storage_path) 79 | pdf_path = "%s.pdf" % path 80 | if pdf_path not in self.storage: 81 | if width is None or height is None: 82 | width, height, pipeline_value = self.get_dimensions_from_xml(storage_path, pipeline_value, to_format) 83 | #print "Generate document for %s because %s doesn't exist\n%s\n\n" % (storage_path, pdf_path, self.storage.keys()) 84 | opendocument = core.opendocument.generate_single_image_document(self.storage[storage_path], width, height) 85 | self.storage[pdf_path] = core.docvert_libreoffice.get_client().convert_by_stream(opendocument, core.docvert_libreoffice.LIBREOFFICE_PDF) 86 | else: 87 | #print "Cache hit! No need to generate %s" % pdf_path 88 | pass 89 | if to_format == 'pdf': 90 | return pipeline_value 91 | self.intermediate_files.append(pdf_path) 92 | from_format = 'pdf' 93 | if from_format in self.synonym_formats: 94 | from_format = self.synonym_formats[from_format] 95 | from_format_method = "convert_%s" % from_format 96 | return getattr(self, from_format_method)(pdf_path, to_format, pipeline_value, width, height) 97 | 98 | def convert_pdf(self, storage_path, to_format, pipeline_value, width=None, height=None): 99 | path, extension = os.path.splitext(storage_path) 100 | svg_path = "%s.svg" % path 101 | if svg_path not in self.storage: 102 | if width is None or height is None: 103 | width, height, pipeline_value = self.get_dimensions_from_xml(storage_path, pipeline_value) 104 | from_format = str(extension[1:]) 105 | synonym_from_format = from_format 106 | if synonym_from_format in self.synonym_formats: 107 | synonym_from_format = self.synonym_formats[synonym_from_format] 108 | self.storage[svg_path] = self.run_conversion_command_with_temporary_files(storage_path, "pdf2svg %s %s") 109 | else: 110 | #print "Cache hit! No need to generate %s" % svg_path 111 | pass 112 | if to_format == 'svg': 113 | return pipeline_value 114 | self.intermediate_files.append(svg_path) 115 | from_format = 'svg' 116 | if from_format in self.synonym_formats: 117 | from_format = self.synonym_formats[from_format] 118 | from_format_method = "convert_%s" % from_format 119 | return getattr(self, from_format_method)(svg_path, to_format, pipeline_value, width, height) 120 | 121 | def convert_svg(self, storage_path, to_format, pipeline_value, width=None, height=None): 122 | path, extension = os.path.splitext(storage_path) 123 | png_path = "%s.png" % path 124 | if png_path not in self.storage: 125 | if width is None or height is None: 126 | width, height, pipeline_value = self.get_dimensions_from_xml(storage_path, pipeline_value) 127 | from_format = str(extension[1:]) 128 | synonym_from_format = from_format 129 | if synonym_from_format in self.synonym_formats: 130 | synonym_from_format = self.synonym_formats[synonym_from_format] 131 | self.storage[png_path] = self.run_conversion_command_with_temporary_files(storage_path, "rsvg %s %s") 132 | else: 133 | #print "Cache hit! No need to generate %s" % png_path 134 | pass 135 | if to_format == 'png': 136 | return pipeline_value 137 | self.intermediate_files.append(png_path) 138 | from_format = 'svg' 139 | if from_format in self.synonym_formats: 140 | from_format = self.synonym_formats[from_format] 141 | from_format_method = "convert_%s" % from_format 142 | return getattr(self, from_format_method)(png_path, to_format, pipeline_value, width, height) 143 | 144 | def convert_png(self, storage_path, to_format, pipeline_value, width=None, height=None): 145 | #im = Image.open('icon.gif') 146 | #transparency = im.info['transparency'] 147 | #im .save('icon.png', transparency=transparency) 148 | #print dir(Image) 149 | return pipeline_value 150 | 151 | def get_dimensions_from_xml(self, storage_path, pipeline_value, change_image_path_extension_to=None): 152 | def get_value(data): 153 | if hasattr(data, 'read'): 154 | data.seek(0) 155 | return data.read() 156 | return data 157 | path, extension = os.path.splitext(storage_path) 158 | if path in self.storage['__convertimages']: #intentionally extensionless because all formats of this single image are considered to have the same dimensions 159 | return (self.storage['__convertimages'][path]['width'], self.storage['__convertimages'][path]['height'], pipeline_value) 160 | 161 | default_dimensions = ('10cm', '10cm') #we had to choose something 162 | #if self.pipeline_storage_prefix: 163 | # storage_path = storage_path[len(self.pipeline_storage_prefix) + 1:] 164 | xml = self.get_document(pipeline_value) 165 | namespaces = {'xlink':'http://www.w3.org/1999/xlink'} 166 | xpath = '//*[@xlink:href="%s"]/parent::*' % storage_path 167 | image_nodes = xml.xpath(xpath, namespaces=namespaces) 168 | if len(image_nodes) == 0: #can't do anything, might have been a thumbnail or unlinked image, but either way return 10cm square 169 | #images = xml.xpath('//*[@xlink:href]', namespaces=namespaces) 170 | #print "Could not find image node with %s. Document contains: \n%s\n%s. Prefix was %s" % (xpath, images[0], images[0].attrib, self.pipeline_storage_prefix) 171 | return default_dimensions[0], default_dimensions[1], pipeline_value 172 | #print "FOUND IMAGE!" 173 | image_node = image_nodes[0] #first image will be do fine. It's possible to have multiple tags with different width/height pointing at the same image but for now we'll discount that possibility 174 | oasis_opendocument_svg_namespace = 'urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0' 175 | width_attribute = "{%s}%s" % (oasis_opendocument_svg_namespace, 'width') 176 | height_attribute = "{%s}%s" % (oasis_opendocument_svg_namespace, 'height') 177 | #print "about to read width/height" 178 | try: 179 | width = image_node.attrib[width_attribute] 180 | height = image_node.attrib[height_attribute] 181 | #print "success... and %s" % change_image_path_extension_to 182 | if change_image_path_extension_to: 183 | path, extension = os.path.splitext(storage_path) 184 | xlink_href_attribute = "{%s}%s" % (namespaces['xlink'], 'href') 185 | change_image_path = '%s.%s' % (path, change_image_path_extension_to) 186 | #print "New image path is %s" % change_image_path 187 | image_nodes = image_node.xpath('*[@xlink:href="%s"]' % storage_path, namespaces=namespaces) 188 | for image_node in image_nodes: 189 | #print "Value was %s" % image_node.attrib[xlink_href_attribute] 190 | image_node.attrib[xlink_href_attribute] = change_image_path 191 | #print "Value is %s" % image_node.attrib[xlink_href_attribute] 192 | self.storage['__convertimages'][path] = dict(width=width, height=height) #intentionally extensionless because all formats of this single image are considered to have the same dimensions 193 | return (width, height, lxml.etree.tostring(xml)) 194 | except KeyError as e: 195 | pass 196 | return default_dimensions[0], default_dimensions[1], pipeline_value 197 | 198 | def get_document(self, pipeline_value): 199 | xml = core.docvert_xml.get_document(pipeline_value) 200 | if hasattr(xml, "getroottree"): 201 | xml = xml.getroottree() 202 | elif hasattr(xml, 'getroot'): 203 | xml = xml.getroot() 204 | return xml 205 | 206 | def run_conversion_command_with_temporary_files(self, from_storage_path, command_template): 207 | def get_value(data): 208 | if hasattr(data, 'read'): 209 | data.seek(0) 210 | return data.read() 211 | return data 212 | temporary_from_path = None 213 | temporary_to_path = None 214 | try: 215 | os_handle, temporary_from_path = tempfile.mkstemp() 216 | temporary_from_file = open(temporary_from_path, 'wb') 217 | the_value = get_value(self.storage[from_storage_path]) 218 | if hasattr(the_value, 'encode'): 219 | the_value = the_value.encode('utf-8') 220 | temporary_from_file.write(the_value); 221 | temporary_from_file.flush() 222 | temporary_from_file.close() 223 | os_handle, temporary_to_path = tempfile.mkstemp() 224 | command = command_template % (temporary_from_path, temporary_to_path) 225 | std_response = subprocess.getstatusoutput(command) 226 | if os.path.getsize(temporary_to_path) == 0: 227 | raise Exception('Error in convertimages.py: No output data created. Command was "%s" which returned "%s"' % (command_template, std_response)) 228 | temporary_to = open(temporary_to_path, 'rb') 229 | to_data = temporary_to.read() 230 | temporary_to.close() 231 | return to_data 232 | finally: 233 | if temporary_from_path: os.remove(temporary_from_path) 234 | if temporary_to_path: os.remove(temporary_to_path) 235 | 236 | 237 | """ 238 | #NOTE: Poppler doesn't work on my [Matthew Holloway's] Ubuntu 10.10 machine. It seg faults so that's why I'm shelling out 239 | #import cairo 240 | #import poppler 241 | os_handle, temporary_file_path = tempfile.mkstemp() 242 | temporary_file = open(temporary_file_path, 'w') 243 | temporary_file.write(get_value(self.storage[storage_path])) 244 | temporary_file.flush() 245 | print temporary_file_path 246 | pdf = poppler.document_new_from_file( 247 | "file://%s" % temporary_file_path, 248 | password=None) 249 | first_page = pdf.get_page(0) 250 | surface = cairo.PDFSurface(surface_storage, width_float, height_float) 251 | cairo_context = cairo.Context(surface) 252 | 253 | first_page.render(cairo_context) 254 | surface.write_to_png("/tmp/page0.png") 255 | print dir(first_page) 256 | temporary_file.close() 257 | """ 258 | 259 | -------------------------------------------------------------------------------- /core/pipeline_type/debug.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import lxml.etree 3 | from . import pipeline_item 4 | import core.docvert_exception 5 | 6 | class Debug(pipeline_item.pipeline_stage): 7 | def stage(self, pipeline_value): 8 | def get_value(data): 9 | if hasattr(data, "read"): 10 | data.seek(0) 11 | return data.read() 12 | return data 13 | if isinstance(pipeline_value, lxml.etree._Element) or isinstance(pipeline_value, lxml.etree._XSLTResultTree): 14 | pipeline_value = lxml.etree.tostring(pipeline_value) 15 | elif hasattr(pipeline_value, 'read'): 16 | pipeline_value.seek(0) 17 | pipeline_value = pipeline_value.read() 18 | if get_value(pipeline_value) is None: 19 | raise core.docvert_exception.debug_exception("Current contents of pipeline", "Debug: pipeline_value is %s" % get_value(pipeline_value), "text/plain; charset=UTF-8") 20 | try: 21 | document = lxml.etree.fromstring(get_value(pipeline_value)) 22 | except lxml.etree.XMLSyntaxError as exception: 23 | raise core.docvert_exception.debug_exception("Current contents of pipeline", "Error parsing as XML, here it is as plain text: %s\n%s" % (exception, pipeline_value), "text/plain; charset=UTF-8") 24 | help_text = "In debug mode we want to display an XML tree but if the root node is or there's an HTML namespace then popular browsers will\nrender it as HTML so these have been changed. See core/pipeline_type/debug.py for the details." 25 | unit_tests = self.get_tests() 26 | if unit_tests: 27 | #help_text += "\n\nUnit tests so far in the pipeline:" 28 | help_text += "\n\nFailed unit tests so far in the pipeline:" 29 | for value in self.get_tests(): 30 | #help_text += "\n\t%s:%s" % (value["status"], value["message"]) 31 | if value["status"] == "fail": 32 | help_text += "\n\tFail: %s" % (value["message"]) 33 | 34 | content_type = 'text/xml' 35 | if "contentType" in self.attributes: 36 | content_type = self.attributes['contentType'] 37 | if "zip" in self.attributes: 38 | content_type = 'application/zip' 39 | pipeline_value = self.storage.to_zip().getvalue() 40 | if content_type == 'text/xml': 41 | help_text += "\n\nConversion files:\n\t" + "\n\t".join(list(self.storage.keys())) 42 | if hasattr(document, 'getroottree'): 43 | document = document.getroottree() 44 | if document.getroot().tag == "{http://www.w3.org/1999/xhtml}html": 45 | pipeline_value = "%s" % (help_text, lxml.etree.tostring(document.getroot())) 46 | else: 47 | pipeline_value = "%s" % (help_text, lxml.etree.tostring(document.getroot()).decode('utf-8') ) 48 | pipeline_value = pipeline_value.replace('"http://www.w3.org/1999/xhtml"', '"XHTML_NAMESPACE_REPLACED_BY_DOCVERT_DURING_DEBUG_MODE"') 49 | xml_declaration = '' 50 | if pipeline_value[0:5] != xml_declaration[0:5]: 51 | pipeline_value = xml_declaration + "\n" + pipeline_value 52 | raise core.docvert_exception.debug_xml_exception("Current contents of pipeline", pipeline_value, content_type) 53 | -------------------------------------------------------------------------------- /core/pipeline_type/docbooktoxhtml.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from . import pipeline_item 3 | import core.docvert_xml 4 | 5 | class DocBookToXHTML(pipeline_item.pipeline_stage): 6 | 7 | def stage(self, pipeline_value): 8 | docbook_to_html_path = self.resolve_pipeline_resource('internal://docbook-to-html.xsl') 9 | return core.docvert_xml.transform(pipeline_value, docbook_to_html_path) 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /core/pipeline_type/generate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import os.path 4 | import io 5 | from . import pipeline_item 6 | import core.docvert 7 | import core.opendocument 8 | import core.document_type 9 | import core.docvert_exception 10 | 11 | class Generate(pipeline_item.pipeline_stage): 12 | def stage(self, pipeline_value): 13 | if 'withFile' not in self.attributes: 14 | raise needs_with_file_attribute("A process type of Generate needs a withFile attribute containing a filename/path.") 15 | path = self.resolve_pipeline_resource(self.attributes['withFile']) 16 | if not os.path.exists(path): 17 | raise generation_file_not_found("A process type of Generate couldn't find a file at %s" % path) 18 | #print(path) 19 | data = open(path, mode='rb') 20 | doc_type = core.document_type.detect_document_type(data) 21 | #print(doc_type) 22 | if doc_type != core.document_type.types.oasis_open_document: 23 | data = core.docvert.generate_open_document(data) 24 | document_xml = core.opendocument.extract_useful_open_document_files(data, self.storage, os.path.basename(path)) 25 | return document_xml 26 | 27 | class needs_with_file_attribute(core.docvert_exception.docvert_exception): 28 | pass 29 | 30 | class generation_file_not_found(core.docvert_exception.docvert_exception): 31 | pass 32 | -------------------------------------------------------------------------------- /core/pipeline_type/generatepostconversioneditorfiles.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import lxml.etree 4 | import io 5 | from . import pipeline_item 6 | import core.docvert_exception 7 | 8 | 9 | class GeneratePostConversionEditorFiles(pipeline_item.pipeline_stage): 10 | def stage(self, pipeline_value): 11 | return pipeline_value 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /core/pipeline_type/getpreface.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import lxml.etree 3 | from . import pipeline_item 4 | import core.docvert_exception 5 | import core.docvert_xml 6 | 7 | class GetPreface(pipeline_item.pipeline_stage): 8 | def stage(self, pipeline_value): 9 | params = dict( 10 | loopDepth = 0, 11 | process = self.attributes['process'], 12 | customFilenameIndex = 'index.html', 13 | customFilenameSection = 'section#.html' 14 | ) 15 | xslt_path = self.resolve_pipeline_resource('internal://each-page.xsl') 16 | return core.docvert_xml.transform(pipeline_value, xslt_path, params) 17 | 18 | 19 | -------------------------------------------------------------------------------- /core/pipeline_type/loop.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | from . import pipeline_item 4 | import copy 5 | import core.docvert_exception 6 | import core.docvert_pipeline 7 | import core.docvert_xml 8 | import lxml.etree 9 | 10 | class Loop(pipeline_item.pipeline_stage): 11 | def stage(self, pipeline_value): 12 | if 'numberOfTimes' not in self.attributes: 13 | raise no_number_of_times_attribute("In process Loop there wasn't a numberOfTimes attribute.") 14 | numberOfTimes = self.attributes['numberOfTimes'] 15 | if numberOfTimes.startswith('xpathCount:'): 16 | xpath = numberOfTimes[len('xpathCount:'):] 17 | xml = core.docvert_xml.get_document(pipeline_value) 18 | namespaces = { 19 | 'xlink':'http://www.w3.org/1999/xlink', 20 | 'db':'http://docbook.org/ns/docbook', 21 | 'text':'urn:oasis:names:tc:opendocument:xmlns:text:1.0', 22 | 'office':'urn:oasis:names:tc:opendocument:xmlns:office:1.0', 23 | 'html':'http://www.w3.org/1999/xhtml', 24 | 'xhtml':'http://www.w3.org/1999/xhtml'} 25 | nodes = xml.xpath(xpath, namespaces=namespaces) 26 | index = 0 27 | for node in nodes: 28 | index += 1 29 | child_depth = copy.copy(self.depth) 30 | child_depth.append(str(index)) 31 | pipeline = core.docvert_pipeline.pipeline_processor(self.storage, self.child_stages, self.pipeline_directory, self.pipeline_storage_prefix, child_depth) 32 | child_pipeline_value = lxml.etree.tostring(pipeline_value).decode('utf-8') 33 | pipeline.start(child_pipeline_value) #discard return value 34 | elif numberOfTimes.startswith('substring:'): 35 | number = int(numberOfTimes[len('substring:'):]) 36 | for index in range(1, number): 37 | pass 38 | elif numberOfTimes.startswith('number:'): 39 | number = int(numberOfTimes[len('number:'):]) 40 | for index in range(1, number): 41 | pass 42 | return pipeline_value 43 | 44 | class no_number_of_times_attribute(core.docvert_exception.docvert_exception): 45 | pass 46 | 47 | -------------------------------------------------------------------------------- /core/pipeline_type/normalizeopendocument.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import lxml.etree 3 | from . import pipeline_item 4 | import core.docvert_xml 5 | 6 | class NormalizeOpenDocument(pipeline_item.pipeline_stage): 7 | 8 | def stage(self, pipeline_value): 9 | normalize_opendocument_path = self.resolve_pipeline_resource('internal://normalize-opendocument.xsl') 10 | return core.docvert_xml.transform(pipeline_value, normalize_opendocument_path) 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /core/pipeline_type/pipeline_item.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os.path 3 | 4 | docvert_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 5 | 6 | class pipeline_stage(object): 7 | def __init__(self, storage, pipeline_directory, attributes, pipeline_storage_prefix=None, child_stages=None, depth=None): 8 | self.storage = storage 9 | self.pipeline_directory = pipeline_directory 10 | self.pipeline_id = os.path.basename(pipeline_directory) 11 | self.pipeline_id_namespace = os.path.basename(os.path.dirname(pipeline_directory)) 12 | self.attributes = attributes 13 | self.pipeline_storage_prefix = pipeline_storage_prefix 14 | self.child_stages = child_stages 15 | self.depth = list() if depth is None else depth 16 | 17 | def resolve_pipeline_resource(self, resource_path): 18 | internal_prefix = 'internal://' 19 | if resource_path.startswith(internal_prefix): 20 | return os.path.join(docvert_root, 'core', 'transform', resource_path[len(internal_prefix):]) 21 | return os.path.join(docvert_root, "pipelines", self.pipeline_id_namespace, self.pipeline_id, resource_path) 22 | 23 | def log(self, message, log_type='error'): 24 | log_filename = '%s.log' % log_type 25 | storage_path = log_filename 26 | if self.pipeline_storage_prefix is not None: 27 | storage_path = "%s/%s" % (self.pipeline_storage_prefix, log_filename) 28 | self.storage[storage_path] = message 29 | 30 | def add_tests(self, tests): 31 | self.storage.add_tests(tests) 32 | 33 | def get_tests(self): 34 | return self.storage.get_tests() 35 | 36 | -------------------------------------------------------------------------------- /core/pipeline_type/serialize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | from . import pipeline_item 4 | import lxml.etree 5 | 6 | class Serialize(pipeline_item.pipeline_stage): 7 | def stage(self, pipeline_value): 8 | storage_path = "%s/%s" % (self.pipeline_storage_prefix, self.attributes['toFile']) 9 | if self.pipeline_storage_prefix is None: 10 | storage_path = self.attributes['toFile'] 11 | if '{customSection}' in storage_path: 12 | depth_string = 'section' 13 | depth_string += "-".join(self.depth) 14 | depth_string += ".html" 15 | storage_path = storage_path.replace('{customSection}', depth_string) 16 | if hasattr(pipeline_value, 'read'): 17 | self.storage[storage_path] = str(pipeline_value) 18 | elif isinstance(pipeline_value, lxml.etree._Element) or isinstance(pipeline_value, lxml.etree._XSLTResultTree): 19 | self.storage[storage_path] = lxml.etree.tostring(pipeline_value) 20 | else: 21 | self.storage[storage_path] = str(pipeline_value) 22 | return pipeline_value 23 | 24 | 25 | -------------------------------------------------------------------------------- /core/pipeline_type/serializeopendocument.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import cgi 3 | import os 4 | import zipfile 5 | import lxml.etree 6 | import io 7 | from . import pipeline_item 8 | import core.docvert_exception 9 | 10 | class SerializeOpenDocument(pipeline_item.pipeline_stage): 11 | def stage(self, pipeline_value): 12 | storage_path = "%s/%s" % (self.pipeline_storage_prefix, self.attributes['toFile']) 13 | if self.pipeline_storage_prefix is None: 14 | storage_path = self.attributes['toFile'] 15 | if '{customSection}' in storage_path: 16 | depth_string = 'section' 17 | depth_string += "-".join(self.depth) 18 | depth_string += ".odt" 19 | storage_path = storage_path.replace('{customSection}', depth_string) 20 | if not isinstance(pipeline_value, lxml.etree._Element) and not isinstance(pipeline_value, lxml.etree._XSLTResultTree): 21 | return pipeline_value 22 | zipdata = io.StringIO() 23 | archive = zipfile.ZipFile(zipdata, 'w') 24 | archive.writestr('mimetype', 'application/vnd.oasis.opendocument.text') 25 | manifest_xml = '\n\n\t\n' 26 | root = pipeline_value.getroot() 27 | expected_lxml_root = '{docvert:5}root' 28 | if str(root.tag) != expected_lxml_root: 29 | raise core.docvert_exception.unable_to_serialize_opendocument("Can't serialize OpenDocument with a pipeline_value root node of '%s'." % root.tag) 30 | expected_lxml_child = '{docvert:5}external-file' 31 | for child in root.iterchildren(): 32 | if str(child.tag) != expected_lxml_child: 33 | raise core.docvert_exception.unable_to_serialize_opendocument("Can't serialize OpenDocument with a pipeline_value child node of '%s'." % child.tag) 34 | filename = str(child.attrib['{docvert:5}name']) 35 | xml = "".join(map(lxml.etree.tostring, child.getchildren())) 36 | print("{%s]" % filename) 37 | archive.writestr(filename, xml) 38 | manifest_xml += '\t\n' % (cgi.escape('text/xml'), cgi.escape(filename)) 39 | manifest_xml += '\t\n' 40 | imagetypes = {".svg":"image/svg+xml", ".png":"image/png", ".gif":"image/gif", ".bmp":"image/x-ms-bmp", ".jpg":"image/jpeg", ".jpe":"image/jpeg", ".jpeg":"image/jpeg"} 41 | for storage_key in list(self.storage.keys()): 42 | if storage_key.startswith(self.pipeline_storage_prefix): 43 | extension = os.path.splitext(storage_key)[1] 44 | if extension in list(imagetypes.keys()): 45 | odt_path = "Pictures/%s" % os.path.basename(storage_key) 46 | manifest_xml += '\t\n' % (cgi.escape(imagetypes[extension]), cgi.escape(odt_path) ) 47 | archive.writestr(odt_path, self.storage[storage_key]) 48 | manifest_xml += '' 49 | archive.writestr('META-INF/manifest.xml', manifest_xml.encode("utf-8") ) 50 | archive.close() 51 | zipdata.seek(0) 52 | self.storage.add(storage_path, zipdata.read()) 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /core/pipeline_type/splitpages.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import lxml.etree 3 | from . import pipeline_item 4 | import core.docvert_exception 5 | import core.docvert_xml 6 | 7 | class SplitPages(pipeline_item.pipeline_stage): 8 | def stage(self, pipeline_value): 9 | depth_string = '-'.join(self.depth) 10 | params = dict( 11 | loopDepth = depth_string, 12 | process = self.attributes['process'], 13 | customFilenameIndex = 'index.html', 14 | customFilenameSection = 'section#.html' 15 | ) 16 | xslt_path = self.resolve_pipeline_resource('internal://each-page.xsl') 17 | 18 | return core.docvert_xml.transform(pipeline_value, xslt_path, params) 19 | 20 | 21 | -------------------------------------------------------------------------------- /core/pipeline_type/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import os.path 4 | import lxml.etree 5 | import io 6 | from . import pipeline_item 7 | import core.docvert_exception 8 | import core.docvert 9 | import core.docvert_xml 10 | 11 | 12 | class Test(pipeline_item.pipeline_stage): 13 | def stage(self, pipeline_value): 14 | def get_size(data): 15 | if hasattr(data, 'read'): 16 | data.seek(0, os.SEEK_END) 17 | return data.tell() 18 | return len(data) 19 | 20 | if not ("withFile" in self.attributes or "extensionExist" in self.attributes): 21 | raise no_with_file_attribute("In process Test there wasn't a withFile or extensionExist attribute.") 22 | if pipeline_value is None: 23 | raise xml_empty("Cannot Test with %s because pipeline_value is None." % self.attributes['withFile']) 24 | test_result = None 25 | if "withFile" in self.attributes: 26 | test_path = self.resolve_pipeline_resource(self.attributes['withFile']) 27 | if not os.path.exists(test_path): 28 | raise file_not_found("Test file not found at %s" % test_path) 29 | prefix = "" 30 | if "prefix" in self.attributes: 31 | prefix = "%s: " % self.attributes["prefix"] 32 | if test_path.endswith(".rng"): # RelaxNG test 33 | relaxng_response = core.docvert_xml.relaxng(pipeline_value, test_path) 34 | node_name = "pass" 35 | if not relaxng_response["valid"]: 36 | node_name = "fail" 37 | test_result = '<%s>%s%s' % (node_name, prefix, core.docvert_xml.escape_text(str(relaxng_response["log"])), node_name) 38 | elif test_path.endswith(".txt"): # Substring test (new substring on each line) 39 | document_string = str(pipeline_value) 40 | if hasattr(pipeline_value, "read"): 41 | document_string = pipeline_value.read() 42 | pipeline_value.seek(0) 43 | test_result = '' 44 | for line in open(test_path, 'r').readlines(): 45 | test_string = line[0:-1].strip() 46 | if len(test_string) == 0: continue 47 | node_name = "fail" 48 | description = "doesn't contain" 49 | occurences = document_string.count(test_string) 50 | if occurences == 1: 51 | node_name = "pass" 52 | description = "contains one of" 53 | elif occurences > 1: 54 | node_name = "fail" 55 | description = "contains %i of" % occurences 56 | test_result += '<%s>%s%s' % (node_name, prefix, core.docvert_xml.escape_text('Document %s the string "%s"' % (description, test_string)), node_name) 57 | test_result += '' 58 | else: #XSLT 59 | test_result = core.docvert_xml.transform(pipeline_value, test_path, dict(**self.attributes)) 60 | elif "extensionExist" in self.attributes: 61 | extension = self.attributes["extensionExist"] 62 | extension_exist_count = 1 63 | if "extensionExistCount" in self.attributes: 64 | extension_exist_count = int(self.attributes["extensionExistCount"]) 65 | original_extension_exist_count = extension_exist_count 66 | for key in list(self.storage.keys()): 67 | if key.endswith('thumbnail.png'): #ignore any inbuilt thumbnails 68 | continue 69 | if key.endswith(extension): 70 | if self.pipeline_storage_prefix is None or (self.pipeline_storage_prefix and key.startswith(self.pipeline_storage_prefix)): 71 | if get_size(self.storage[key]) > 0: 72 | extension_exist_count -= 1 73 | test_result = "pass" 74 | text = 'There were %i files with the extension "%s" as expected.' % (original_extension_exist_count, extension) 75 | if extension_exist_count != 0: 76 | test_result = "fail" 77 | text = 'There were only %i (%i-%i) files instead of %i with the extension "%s". ' % (original_extension_exist_count - extension_exist_count, original_extension_exist_count, extension_exist_count, original_extension_exist_count, extension) 78 | test_result = '<%s>%s' % (test_result, core.docvert_xml.escape_text(text), test_result) 79 | if "debug" in self.attributes: 80 | raise core.docvert_exception.debug_xml_exception("Test Results", str(test_result), "text/xml; charset=UTF-8") 81 | self.add_tests(test_result) 82 | return pipeline_value 83 | 84 | 85 | class no_with_file_attribute(core.docvert_exception.docvert_exception): 86 | pass 87 | 88 | class file_not_found(core.docvert_exception.docvert_exception): 89 | pass 90 | 91 | class xml_empty(core.docvert_exception.docvert_exception): 92 | pass 93 | -------------------------------------------------------------------------------- /core/pipeline_type/transform.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import io 4 | from . import pipeline_item 5 | import core.docvert_exception 6 | import core.docvert_xml 7 | 8 | class Transform(pipeline_item.pipeline_stage): 9 | def stage(self, pipeline_value): 10 | if "withFile" not in self.attributes: 11 | raise no_with_file_attribute("In process Transform there wasn't a withFile attribute.") 12 | if pipeline_value is None: 13 | raise xml_empty("Cannot Transform with %s because pipeline_value is None." % self.attributes['withFile']) 14 | xslt_path = self.resolve_pipeline_resource(self.attributes['withFile']) 15 | if not os.path.exists(xslt_path): 16 | raise xslt_not_found("XSLT file not found at %s" % xslt_path) 17 | return core.docvert_xml.transform(pipeline_value, xslt_path) 18 | 19 | class no_with_file_attribute(core.docvert_exception.docvert_exception): 20 | pass 21 | 22 | class xslt_not_found(core.docvert_exception.docvert_exception): 23 | pass 24 | 25 | class xml_empty(core.docvert_exception.docvert_exception): 26 | pass 27 | -------------------------------------------------------------------------------- /core/pipeline_type/transformopendocumenttodocbook.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import lxml.etree 3 | from . import pipeline_item 4 | import core.docvert_xml 5 | import core.docvert_exception 6 | 7 | class TransformOpenDocumentToDocBook(pipeline_item.pipeline_stage): 8 | 9 | def stage(self, pipeline_value): 10 | normalize_opendocument_path = self.resolve_pipeline_resource('internal://normalize-opendocument.xsl') 11 | pipeline_value = core.docvert_xml.transform(pipeline_value, normalize_opendocument_path) 12 | if "debugAfterOpenDocumentNormalization" in self.attributes: 13 | pipeline_value = lxml.etree.tostring(pipeline_value) 14 | raise core.docvert_exception.debug_xml_exception("Current contents of pipeline", pipeline_value, 'text/xml') 15 | opendocument_to_docbook_path = self.resolve_pipeline_resource('internal://opendocument-to-docbook.xsl') 16 | pipeline_value = core.docvert_xml.transform(pipeline_value, opendocument_to_docbook_path) 17 | normalize_docbook_path = self.resolve_pipeline_resource('internal://normalize-docbook.xsl') 18 | pipeline_value = core.docvert_xml.transform(pipeline_value, normalize_docbook_path) 19 | if "debugAfterDocBookNormalization" in self.attributes: 20 | pipeline_value = lxml.etree.tostring(pipeline_value) 21 | raise core.docvert_exception.debug_xml_exception("Current contents of pipeline", pipeline_value, 'text/xml') 22 | return pipeline_value 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /core/pipeline_type/writemetadata.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | from . import pipeline_item 4 | import core.docvert_exception 5 | import core.docvert_xml 6 | import lxml.etree 7 | 8 | class WriteMetaData(pipeline_item.pipeline_stage): 9 | def stage(self, pipeline_value): 10 | opendocument_xml_path = "%s/%s" % (self.pipeline_storage_prefix, 'opendocument.xml') 11 | xslt_path = self.resolve_pipeline_resource('internal://extract-metadata.xsl') 12 | if not os.path.exists(xslt_path): 13 | raise xslt_not_found("XSLT file not found at %s" % xslt_path) 14 | metadata_xml_path = "%s/%s" % (self.pipeline_storage_prefix, 'docvert-meta.xml') 15 | metadata_xml = core.docvert_xml.transform(self.storage.get(opendocument_xml_path), xslt_path) 16 | if isinstance(metadata_xml, lxml.etree._Element) or isinstance(metadata_xml, lxml.etree._XSLTResultTree): 17 | metadata_xml = lxml.etree.tostring(metadata_xml) 18 | self.storage[metadata_xml_path] = metadata_xml 19 | return pipeline_value 20 | 21 | class xslt_not_found(core.docvert_exception.docvert_exception): 22 | pass 23 | 24 | 25 | -------------------------------------------------------------------------------- /core/transform/extract-metadata.xsl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /core/transform/normalize-docbook.xsl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /core/transform/turn-document-into-test.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | <xsl:template> 21 | 22 | 23 | 24 | 25 | 26 | </xsl:template> 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /core/web_service_themes/README: -------------------------------------------------------------------------------- 1 | Warning this directory is available via the web server. Do not store sensitive files here or in subdirectories. 2 | -------------------------------------------------------------------------------- /core/web_service_themes/default/docvertedges2-small.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holloway/docvert-python3/a16d9ff0921186355e80c6f4905a00f20c12cf5f/core/web_service_themes/default/docvertedges2-small.gif -------------------------------------------------------------------------------- /core/web_service_themes/default/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holloway/docvert-python3/a16d9ff0921186355e80c6f4905a00f20c12cf5f/core/web_service_themes/default/favicon.ico -------------------------------------------------------------------------------- /core/web_service_themes/default/index.js: -------------------------------------------------------------------------------- 1 | if(history) history.navigationMode = 'compatible'; 2 | 3 | if(location && location.hash.toString().indexOf("back-reload") != -1){ //see http://stackoverflow.com/questions/158319/cross-browser-onload-event-and-the-back-button 4 | window.location.href = (window.location.href.toString().indexOf("#") != -1) ? window.location.href.toString().substring(0, window.location.href.toString().indexOf("#")) : window.location.href.toString() 5 | } 6 | 7 | var docvert = { 8 | slide_in: function(){ 9 | if(location && location.hash.toString().indexOf("slide-in") != -1){ 10 | location.hash = "" 11 | var form_element = $("form") 12 | form_element.css({"top":-form_element.height(), position:"relative"}) 13 | form_element.animate({top:0}, "slow") 14 | } 15 | }, 16 | 17 | upload_file_change: function(event){ 18 | $(event.target).parent().append($(event.target).clone()) 19 | var text = $(event.target).val() 20 | var full_text = "" 21 | if(text.length > 25) { 22 | full_text = text 23 | text = text.substring(0,15) + "\u2026" + text.substring(text.length-10) 24 | } 25 | var list_item = $("
  • ").attr("title",full_text).append($(event.target).attr("id","").css("display","none")).text(text).append(' ×').hide() 26 | $("#upload_list").append(list_item) 27 | list_item.slideDown() 28 | $("#upload_submit").removeClass("disabled").addClass("enabled") 29 | $("#submit_error").slideUp() 30 | $(".upload_list").slideDown() 31 | }, 32 | 33 | upload_file_delete: function(event){ 34 | var container = $(event.target).parent() 35 | if(container.parent().children().length === 1) { 36 | $("#upload_submit").addClass("disabled").removeClass("enabled") 37 | $(".upload_list").slideUp() 38 | } 39 | container.slideUp('slow',function(){ 40 | container.remove() 41 | }) 42 | return false 43 | }, 44 | 45 | upload_file_mouseover: function(event) { 46 | $("#upload_from_file").addClass("upload_button_hover") 47 | }, 48 | 49 | upload_file_mouseout: function(event) { 50 | $("#upload_from_file").removeClass("upload_button_hover") 51 | }, 52 | 53 | reveal_upload_web_dialog: function(event){ 54 | var sender = $(event.target) 55 | sender_offset = sender.offset() 56 | $("#upload_from_web_dialog").show().css({"position":"absolute","left":sender_offset.left+"px","top":(sender_offset.top+sender.height())+"px"}) 57 | $("#upload_from_web_dialog input").val("http://\u2026").select() 58 | }, 59 | 60 | is_url: function(value){ 61 | //an intentionally rather liberal url detector 62 | var url_pattern = /^(ftp|http|https):\/\/.*?\//i 63 | return url_pattern.test(value) 64 | }, 65 | 66 | hide_upload_web_dialog: function(event){ 67 | $("#upload_from_web_dialog").hide() 68 | var url = $("#upload_from_web_dialog input").val() 69 | if(docvert.is_url(url)) { 70 | docvert.upload_file_change(event) 71 | } 72 | }, 73 | 74 | replace_select: function(select, width){ 75 | }, 76 | 77 | check_submit: function(event){ 78 | docvert.hide_upload_web_dialog() 79 | var should_submit = ($("#upload_list li").length > 0) 80 | if(!should_submit) { 81 | $("#submit_error").slideDown().find("span").animate({"marginLeft": "50px"}, function(){ 82 | $("#submit_error span").animate({"marginLeft": "-50px"}, function(){ 83 | $("#submit_error span").animate({"marginLeft": "50px"}, function(){ 84 | $("#submit_error").slideDown().find("span").animate({"marginLeft": "-50px"}, function(){ 85 | $("#submit_error").slideDown().find("span").animate({"marginLeft": "0px"}) 86 | }) 87 | }) 88 | }) 89 | }) 90 | return false 91 | } 92 | if($("#after_conversion_preview").is(":checked")){ 93 | location.hash = "back-reload" //see http://stackoverflow.com/questions/158319/cross-browser-onload-event-and-the-back-button 94 | var form_element = $("form") 95 | form_element.css({"position":"relative"}).animate({"top": -(form_element.offset().top + form_element.height() + 50)},"slow", function(){ 96 | $("form").submit() 97 | }) 98 | return false 99 | } 100 | }, 101 | 102 | click_advanced: function(){ 103 | var inner = $(this).parents("fieldset").find(".inner") 104 | if(inner.hasClass("closed")) { 105 | $("span", this).html("▼") 106 | inner.removeClass("closed").slideDown() 107 | } else { 108 | $("span", this).html("▶") 109 | inner.addClass("closed").slideUp() 110 | } 111 | return false 112 | }, 113 | 114 | keydown: function(event){ 115 | var escape_key = 27 116 | if (event.keyCode == escape_key) { 117 | $("#upload_from_web_dialog").hide() 118 | } 119 | }, 120 | 121 | reset_check_libreoffice_status: function(){ 122 | docvert.number_of_libreoffice_checks_remaining = 10 123 | }, 124 | 125 | check_libreoffice_status: function(event) { 126 | $.ajax({ 127 | url: '/libreoffice-status', 128 | dataType: 'json', 129 | success: function(data, textStatus, jqXHR){ 130 | if(data['libreoffice-status']) { 131 | $("#libreOfficeStatus").removeClass("libreOfficeStatus_False").addClass("libreOfficeStatus_True") 132 | } else { 133 | $("#libreOfficeStatus").removeClass("libreOfficeStatus_True").addClass("libreOfficeStatus_False") 134 | } 135 | docvert.number_of_libreoffice_checks_remaining -= 1 136 | if(docvert.number_of_libreoffice_checks_remaining > 0){ 137 | docvert.libreoffice_status_timer = setTimeout(docvert.check_libreoffice_status, 1000) 138 | } 139 | //$("#advanced").text(docvert.number_of_libreoffice_checks_remaining) 140 | } 141 | }) 142 | } 143 | } 144 | 145 | $(document).ready(function(){ 146 | docvert.slide_in() 147 | $("#upload_submit").addClass("disabled").removeClass("enabled") 148 | $(".upload_list").hide() 149 | $(".delete").live("click", docvert.upload_file_delete) 150 | var upload_file = $("#upload_file") 151 | upload_file.change(docvert.upload_file_change) 152 | .mouseover(docvert.upload_file_mouseover) 153 | .mouseout(docvert.upload_file_mouseout) 154 | $("#upload_documents label").css({ 155 | "width":upload_file.width() + "px", 156 | "height": upload_file.height() + "px", 157 | "margin-right": - upload_file.width() + "px"}) 158 | $("#upload_from_web label").click(docvert.reveal_upload_web_dialog) 159 | $("#upload_from_web_dialog").hide() 160 | $("#upload_from_web_dialog input").blur(docvert.hide_upload_web_dialog) 161 | $("fieldset,#button_tray").width((upload_file.width() * 2) + 30) 162 | $("#page,form").width((upload_file.width() * 2) + 53) 163 | $("#upload_submit").click(docvert.check_submit) 164 | $("select").dropp() 165 | $("#advanced .inner").addClass("closed").hide() 166 | $("#advanced legend a").click(docvert.click_advanced) 167 | docvert.reset_check_libreoffice_status() 168 | docvert.libreoffice_status_timer = setTimeout(docvert.check_libreoffice_status, 1000) 169 | $("*").live("focus click", docvert.reset_check_libreoffice_status) 170 | $("#break_up_pages").change(function(){ 171 | if($(this).is(":checked")){ 172 | $("#autopipelines_options").slideDown().parent() 173 | } else { 174 | $("#autopipelines_options").slideUp() 175 | } 176 | $("#autopipeline").nextAll(".dropp_dropdown_list").width(upload_file.width() * 2 + 30).css("clear","both") 177 | }).change() 178 | }).keydown(docvert.keydown) 179 | 180 | -------------------------------------------------------------------------------- /core/web_service_themes/default/index.tpl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Docvert - Web Service 6 | 7 | 8 | 9 | 10 | 11 | 12 | 16 |

    Docvert 6 MSWord to Open Standards

    17 |
    18 |
    19 |
    20 | Upload Documents 21 |
    22 |
    23 | 24 | 25 |
    26 |
    27 | 28 |
    29 |
    30 |

    Documents to Convert

    31 |
      32 |
    33 |
    34 |
    35 | Theme (XML Pipeline) 36 | 41 |
    42 |
    43 | 44 | 45 | 46 | 47 |
    48 |

    Please note that some pipelines don't support multiple pages.

    49 | 54 |
    55 |
    56 |
    57 | 58 |
    59 |
    60 | Advanced 61 |
    62 |

    63 |   64 | 65 |

    66 |
    67 |
    68 |
    69 | Please choose a file or web URL to convert 70 |
    71 |
    72 | 73 |
    74 |
    75 |
    LibreOffice
    76 |
    77 |
    78 |

    Dear programmers,

    79 |

    this form sends files via HTTP POST so do the same in your software to build upon this web service.

    80 |
    81 | 82 | 83 | -------------------------------------------------------------------------------- /core/web_service_themes/default/jquery.dropp.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Dropp 3 | * http://github.com/matrushka/Dropp 4 | * @requires jQuery v1.3 or later 5 | * 6 | * Dropp is a jQuery plugin which replaces regular droprown menus (