├── VERSION
├── version.h
├── 3rdparty
├── unzip101e.zip
├── htmlcxx-0.85-x86_64-linux.tar.bz2
├── mimetic-0.9.7-x86_64-linux.tar.bz2
├── wv2-0.2.3_patched_4-x86_64-linux.tar.bz2
├── libcharsetdetect-1.0-x86_64-linux.tar.bz2
├── wv2-0.2.3_patched_4-private_headers.tar.bz2
├── pdf_font_metrics.txt
└── resources
│ ├── pdf_font_metrics.txt
│ ├── CNS2-V
│ ├── Roman
│ ├── Katakana
│ ├── ETenms-B5-H
│ ├── Hiragana
│ ├── UniJIS-UCS2-HW-H
│ ├── Hankaku
│ ├── B5-V
│ ├── CNS1-V
│ ├── B5pc-V
│ ├── HKdla-B5-V
│ ├── HKdlb-B5-V
│ ├── HKgccs-B5-V
│ ├── HKm314-B5-V
│ ├── HKm471-B5-V
│ ├── ETHK-B5-V
│ ├── HKscs-B5-V
│ ├── UniCNS-UCS2-V
│ ├── ETen-B5-V
│ ├── UniCNS-UTF16-V
│ ├── UniCNS-UTF8-V
│ ├── KSC-V
│ ├── KSC-EUC-V
│ ├── UniCNS-UTF32-V
│ ├── KSCms-UHC-HW-V
│ ├── KSCpc-EUC-V
│ ├── KSC-Johab-V
│ ├── KSCms-UHC-V
│ ├── UniKS-UCS2-V
│ ├── GB-V
│ ├── GBT-V
│ ├── GBK-EUC-V
│ ├── GBKp-EUC-V
│ ├── GB-EUC-V
│ ├── GBT-EUC-V
│ ├── GBpc-EUC-V
│ ├── GBTpc-EUC-V
│ ├── UniKS-UTF16-V
│ ├── ETenms-B5-V
│ ├── UniKS-UTF8-V
│ ├── UniGB-UTF16-V
│ ├── WP-Symbol
│ ├── UniKS-UTF32-V
│ ├── UniGB-UTF8-V
│ ├── UniGB-UCS2-V
│ ├── UniGB-UTF32-V
│ ├── 78-V
│ ├── V
│ └── 78-EUC-V
├── COPYING.COM
├── doc
├── index.html
└── Doxyfile
├── src
├── tracing.h
├── xml_fixer.h
├── entities.h
├── txt_parser.h
├── oshared.h
├── xml_stream.h
├── odfxml_parser.h
├── odf_ooxml_parser.h
├── rtf_parser.h
├── doc_parser.h
├── pdf_parser.h
├── ppt_parser.h
├── xlsb_parser.h
├── iwork_parser.h
├── eml_parser.h
├── xls_parser.h
├── html_parser.h
├── formatting_style.h
├── thread_safe_ole_storage.h
├── doctotext_unzip.h
├── thread_safe_ole_stream_reader.h
├── link.h
├── attachment.h
├── data_stream.h
├── metadata.h
├── exception.h
├── variant.h
├── list_style.cpp
├── tracing.cpp
└── exception.cpp
├── update_version.sh
├── tests
├── cpp_api_test.cpp
├── multithreading_test.cpp
└── read_from_buffer_test.cpp
└── README
/VERSION:
--------------------------------------------------------------------------------
1 | .33537
--------------------------------------------------------------------------------
/version.h:
--------------------------------------------------------------------------------
1 | #define VERSION ".33537"
2 |
--------------------------------------------------------------------------------
/3rdparty/unzip101e.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokgolich/doctotext/HEAD/3rdparty/unzip101e.zip
--------------------------------------------------------------------------------
/COPYING.COM:
--------------------------------------------------------------------------------
1 | Please contact SILVERCODERS (http://silvercoders.com) for a commercial license for DocToText.
2 |
--------------------------------------------------------------------------------
/doc/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/3rdparty/htmlcxx-0.85-x86_64-linux.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokgolich/doctotext/HEAD/3rdparty/htmlcxx-0.85-x86_64-linux.tar.bz2
--------------------------------------------------------------------------------
/3rdparty/mimetic-0.9.7-x86_64-linux.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokgolich/doctotext/HEAD/3rdparty/mimetic-0.9.7-x86_64-linux.tar.bz2
--------------------------------------------------------------------------------
/3rdparty/wv2-0.2.3_patched_4-x86_64-linux.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokgolich/doctotext/HEAD/3rdparty/wv2-0.2.3_patched_4-x86_64-linux.tar.bz2
--------------------------------------------------------------------------------
/src/tracing.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_TRACING_H
2 | #define DOCTOTEXT_TRACING_H
3 | // Tracing initialisation entry point. NOTE(review): `filename` presumably names the trace output file — confirm in tracing.cpp.
4 | void doctotext_init_tracing(const char* filename);
5 |
6 | #endif
7 |
--------------------------------------------------------------------------------
/3rdparty/libcharsetdetect-1.0-x86_64-linux.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokgolich/doctotext/HEAD/3rdparty/libcharsetdetect-1.0-x86_64-linux.tar.bz2
--------------------------------------------------------------------------------
/3rdparty/wv2-0.2.3_patched_4-private_headers.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokgolich/doctotext/HEAD/3rdparty/wv2-0.2.3_patched_4-private_headers.tar.bz2
--------------------------------------------------------------------------------
/src/xml_fixer.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_XML_FIXER_H
2 | #define DOCTOTEXT_XML_FIXER_H
3 |
4 | #include
5 | // Declares DocToTextXmlFixer; its state sits behind an opaque Implementation pointer whose definition is not in this header.
6 | class DocToTextXmlFixer
7 | {
8 | private:
9 | struct Implementation;
10 | Implementation* Impl; // NOTE(review): raw pointer and no copy constructor/assignment declared — confirm instances are never copied.
11 |
12 | public:
13 | DocToTextXmlFixer();
14 | ~DocToTextXmlFixer();
15 | std::string fix(const std::string& xml) const; // Takes XML text and returns a string; presumably the repaired XML — confirm in the implementation.
16 | };
17 |
18 | #endif
19 |
--------------------------------------------------------------------------------
/src/entities.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2012 Christoph Gärtner
2 | Distributed under the Boost Software License, Version 1.0
3 | */
4 |
5 | #ifndef DECODE_HTML_ENTITIES_UTF8_
6 | #define DECODE_HTML_ENTITIES_UTF8_
7 |
8 | #include
9 | #include
10 |
11 | extern size_t decode_html_entities_utf8(char* dest, const char* src);
12 | /*
13 | Takes input from `src` and decodes it into `dest` (NOTE(review): `dest` presumably must be large enough for the result — confirm).
14 | The function returns the length of the decoded string.
15 | */
16 |
17 | #endif
18 |
--------------------------------------------------------------------------------
/src/txt_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_TXT_PARSER_H
2 | #define DOCTOTEXT_TXT_PARSER_H
3 |
4 | #include
5 | // Declares the parser for TXT input; its state sits behind an opaque Implementation pointer defined elsewhere.
6 | class TXTParser
7 | {
8 | private:
9 | struct Implementation;
10 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
11 |
12 | public:
13 | TXTParser(const std::string& file_name); // Input given as a file name.
14 | TXTParser(const char* buffer, size_t size); // Input given as a memory buffer; NOTE(review): buffer ownership/lifetime is not visible here.
15 | ~TXTParser();
16 | void setVerboseLogging(bool verbose);
17 | void setLogStream(std::ostream& log_stream); // Stream is taken by reference; presumably it must outlive the parser — confirm.
18 | std::string plainText(); // Returns the text content as a string.
19 | bool error(); // Presumably true after a failed operation — confirm in the implementation.
20 | };
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/doc/Doxyfile:
--------------------------------------------------------------------------------
1 | PROJECT_NAME = "SILVERCODERS DocToText"
2 | PROJECT_NUMBER = VERSION
3 | PROJECT_BRIEF = "Converts DOC, XLS, XLSB, PPT, RTF, ODF (ODT, ODS, ODP), OOXML (DOCX, XLSX, PPTX), iWork (PAGES, NUMBERS, KEYNOTE), ODFXML (FODP, FODS, FODT), PDF, EML and HTML documents to plain text. Extracts metadata and annotations."
4 | INPUT = ./mainpage.dox ../src/plain_text_extractor.h ../src/metadata.h ../src/link.h ../src/formatting_style.h ../src/exception.h ../src/doctotext_c_api.h ../src/attachment.h ../src/variant.h
5 | STRIP_FROM_PATH = ../src
6 | GENERATE_LATEX = NO
--------------------------------------------------------------------------------
/src/oshared.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_OSHARED_H
2 | #define DOCTOTEXT_OSHARED_H
3 |
4 | #include
5 |
6 | namespace doctotext
7 | {
8 | class Metadata; // Forward declaration; only used as a reference parameter below.
9 | }
10 | class ThreadSafeOLEStorage;
11 |
12 | using namespace doctotext; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
13 | // Each function takes an OLE storage, a log stream and a non-const reference (meta / slide_count / codepage) and returns bool; presumably the reference is an output and the bool signals success — confirm.
14 | bool parse_oshared_summary_info(ThreadSafeOLEStorage& storage, std::ostream& log_stream, Metadata& meta);
15 | bool parse_oshared_document_summary_info(ThreadSafeOLEStorage& storage, std::ostream& log_stream, int& slide_count);
16 | bool get_codepage_from_document_summary_info(ThreadSafeOLEStorage& storage, std::ostream& log_stream, std::string& codepage);
17 |
18 | #endif
19 |
--------------------------------------------------------------------------------
/src/xml_stream.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_XML_STREAM_H
2 | #define DOCTOTEXT_XML_STREAM_H
3 |
4 | #include
5 | // Declares XmlStream. NOTE(review): the method names suggest a cursor over an XML tree (next / levelDown / levelUp) — confirm in the implementation.
6 | class XmlStream
7 | {
8 | private:
9 | struct Implementation;
10 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
11 |
12 | public:
13 | XmlStream(const std::string& xml, bool manage_xml_parser, int xml_parse_options = 0); // xml_parse_options defaults to 0 when omitted.
14 | ~XmlStream();
15 | operator bool(); // Non-explicit conversion, so an XmlStream also converts implicitly outside of conditions.
16 | void next();
17 | void levelDown();
18 | void levelUp();
19 | char* content(); // NOTE(review): returns a raw, mutable pointer; who owns the memory is not visible here.
20 | std::string name();
21 | std::string fullName();
22 | std::string stringValue();
23 | std::string attribute(const std::string& attr_name); // Looks up an attribute by name; behaviour for a missing attribute is not visible here.
24 | };
25 |
26 | #endif
27 |
28 |
29 |
--------------------------------------------------------------------------------
/update_version.sh:
--------------------------------------------------------------------------------
1 | # Update VERSION file if needed.
2 | #
3 | # VERSION holds "<main>.<build>": <main> is "0.0" on trunk (or when the svn
4 | # branch cannot be determined) and the branch name otherwise; <build> is the
5 | # number of whole hours since 2013-12-01. The file is rewritten only when the
6 | # value changes.
7 |
8 | # Last path component of the svn URL; empty when `svn info` prints no URL line.
9 | branch=`svn info 2>/dev/null | grep '^URL:' | sed 's|^URL: .*/\(.*\)|\1|'`
10 | # $branch is quoted and checked for emptiness: unquoted, an empty value makes
11 | # `test` fail with a syntax error and leaves ver_main empty (e.g. ".33537").
12 | if test -z "$branch" || test "$branch" = "trunk"; then
13 | ver_main="0.0"
14 | else
15 | ver_main=$branch
16 | fi
17 | # BSD date (macOS) and GNU date need different options to parse the base date.
18 | if test "`uname`" = "Darwin"; then
19 | base_secs=`date -j -f "%Y %m %d %H %M %S" "2013 12 01 00 00 00" +%s`
20 | else
21 | base_secs=`date -d 20131201 +%s`
22 | fi
23 | now_secs=`date +%s`
24 | ver_build=`expr \( $now_secs - $base_secs \) / 60 / 60`
25 | new_ver="$ver_main.$ver_build"
26 | # "?" can never equal a generated version, so a missing file is always written.
27 | old_ver="?"
28 | if test -f VERSION; then
29 | old_ver=`cat VERSION`
30 | fi
31 | if test "$old_ver" != "$new_ver"; then
32 | # '%s' keeps the version from being interpreted as a printf format string.
33 | printf '%s' "$new_ver" > VERSION
34 | fi
35 |
--------------------------------------------------------------------------------
/src/odfxml_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_ODFXML_PARSER_H
2 | #define DOCTOTEXT_ODFXML_PARSER_H
3 |
4 | #include "common_xml_document_parser.h"
5 | // Declares ODFXMLParser, derived from CommonXMLDocumentParser (declared in common_xml_document_parser.h, not shown here).
6 | class ODFXMLParser : public CommonXMLDocumentParser
7 | {
8 | private:
9 | struct ExtendedImplementation;
10 | ExtendedImplementation* extended_impl; // Opaque subclass state; NOTE(review): raw pointer and no copy operations declared.
11 | class CommandHandlersSet;
12 |
13 | public:
14 | ODFXMLParser(const std::string &file_name); // Input given as a file name.
15 | ODFXMLParser(const char* buffer, size_t size); // Input given as a memory buffer.
16 | ~ODFXMLParser();
17 | bool isODFXML(); // Presumably a format check on the input — confirm.
18 | std::string plainText(XmlParseMode mode, FormattingStyle& formatting_style); // Note: formatting_style is a non-const reference here, unlike the const reference taken by DOCParser, PDFParser and the other non-XML parsers.
19 | Metadata metaData();
20 | };
21 |
22 | #endif
23 |
24 |
25 |
--------------------------------------------------------------------------------
/src/odf_ooxml_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_ODFOOXML_PARSER_H
2 | #define DOCTOTEXT_ODFOOXML_PARSER_H
3 |
4 | #include "common_xml_document_parser.h"
5 | // Declares ODFOOXMLParser, derived from CommonXMLDocumentParser (declared in common_xml_document_parser.h, not shown here).
6 | class ODFOOXMLParser : public CommonXMLDocumentParser
7 | {
8 | private:
9 | struct ExtendedImplementation;
10 | ExtendedImplementation* extended_impl; // Opaque subclass state; NOTE(review): raw pointer and no copy operations declared.
11 | class CommandHandlersSet;
12 |
13 | public:
14 | ODFOOXMLParser(const std::string &file_name); // Input given as a file name.
15 | ODFOOXMLParser(const char* buffer, size_t size); // Input given as a memory buffer.
16 | ~ODFOOXMLParser();
17 | bool isODFOOXML(); // Presumably a format check on the input — confirm.
18 | std::string plainText(XmlParseMode mode, FormattingStyle& options); // Same signature as ODFXMLParser::plainText; only the parameter name differs (options vs formatting_style).
19 | Metadata metaData();
20 | };
21 |
22 | #endif
23 |
24 |
25 |
--------------------------------------------------------------------------------
/src/rtf_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_RTF_PARSER_H
2 | #define DOCTOTEXT_RTF_PARSER_H
3 |
4 | #include "link.h"
5 | #include
6 | #include
7 |
8 | namespace doctotext
9 | {
10 | class Metadata; // Forward declaration; NOTE(review): metaData() returns it by value, so callers need the full definition.
11 | }
12 | using namespace doctotext; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
13 | // Declares the parser for RTF input; its state sits behind an opaque Implementation pointer defined elsewhere.
14 | class RTFParser
15 | {
16 | private:
17 | struct Implementation;
18 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
19 |
20 | public:
21 | RTFParser(const std::string& file_name); // Input given as a file name.
22 | RTFParser(const char* buffer, size_t size); // Input given as a memory buffer.
23 | ~RTFParser();
24 | void setVerboseLogging(bool verbose);
25 | void setLogStream(std::ostream& log_stream);
26 | bool isRTF(); // Presumably a format check on the input — confirm.
27 | void getLinks(std::vector& links); // NOTE(review): the vector's template argument is missing in this listing; presumably std::vector<Link> (link.h is included) — confirm.
28 | std::string plainText(); // Takes no FormattingStyle, unlike most other parsers in this project.
29 | Metadata metaData();
30 | bool error(); // Presumably true after a failed operation — confirm.
31 | };
32 |
33 | #endif
34 |
--------------------------------------------------------------------------------
/src/doc_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_DOC_PARSER_H
2 | #define DOCTOTEXT_DOC_PARSER_H
3 |
4 | #include "link.h"
5 | #include
6 | #include
7 |
8 | namespace doctotext
9 | {
10 | struct FormattingStyle; // Declared with `struct` to match its definition in formatting_style.h (using `class` here triggers mismatched-tag warnings).
11 | class Metadata;
12 | }
13 |
14 | using namespace doctotext; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
15 | // Declares the parser for DOC input; its state sits behind an opaque Implementation pointer defined elsewhere.
16 | class DOCParser
17 | {
18 | private:
19 | struct Implementation;
20 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
21 |
22 | public:
23 | DOCParser(const std::string& file_name); // Input given as a file name.
24 | DOCParser(const char* buffer, size_t size); // Input given as a memory buffer.
25 | ~DOCParser();
26 | void setVerboseLogging(bool verbose);
27 | void setLogStream(std::ostream& log_stream);
28 | bool isDOC(); // Presumably a format check on the input — confirm.
29 | void getLinks(std::vector& links); // NOTE(review): the vector's template argument is missing in this listing; presumably std::vector<Link> (link.h is included) — confirm.
30 | std::string plainText(const FormattingStyle& formatting);
31 | Metadata metaData();
32 | bool error(); // Presumably true after a failed operation — confirm.
33 | };
34 |
35 | #endif
36 |
--------------------------------------------------------------------------------
/src/pdf_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_PDF_PARSER_H
2 | #define DOCTOTEXT_PDF_PARSER_H
3 |
4 | #include "link.h"
5 | #include
6 | #include
7 |
8 | namespace doctotext
9 | {
10 | struct FormattingStyle;
11 | class Metadata;
12 | }
13 |
14 | using namespace doctotext; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
15 | // Declares the parser for PDF input; its state sits behind an opaque Implementation pointer defined elsewhere.
16 | class PDFParser
17 | {
18 | private:
19 | struct Implementation;
20 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
21 |
22 | public:
23 | PDFParser(const std::string& file_name); // Input given as a file name.
24 | PDFParser(const char* buffer, size_t size); // Input given as a memory buffer.
25 | ~PDFParser();
26 | void setVerboseLogging(bool verbose);
27 | void setLogStream(std::ostream& log_stream);
28 | bool isPDF(); // Presumably a format check on the input — confirm.
29 | void getLinks(std::vector& links); // NOTE(review): the vector's template argument is missing in this listing; presumably std::vector<Link> (link.h is included) — confirm.
30 | std::string plainText(const FormattingStyle& formatting);
31 | bool error(); // Presumably true after a failed operation — confirm.
32 | Metadata metaData();
33 | };
34 |
35 | #endif
36 |
--------------------------------------------------------------------------------
/src/ppt_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_PPT_PARSER_H
2 | #define DOCTOTEXT_PPT_PARSER_H
3 |
4 | #include "link.h"
5 | #include
6 | #include
7 |
8 | namespace doctotext
9 | {
10 | struct FormattingStyle;
11 | class Metadata; // Class-key matches the forward declarations in the other parser headers (avoids mismatched-tag warnings); NOTE(review): confirm against metadata.h.
12 | }
13 |
14 | using namespace doctotext; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
15 | // Declares the parser for PPT input; its state sits behind an opaque Implementation pointer defined elsewhere.
16 | class PPTParser
17 | {
18 | private:
19 | struct Implementation;
20 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
21 |
22 | public:
23 | PPTParser(const std::string& file_name); // Input given as a file name.
24 | PPTParser(const char* buffer, size_t size); // Input given as a memory buffer.
25 | ~PPTParser();
26 | void setVerboseLogging(bool verbose);
27 | void setLogStream(std::ostream& log_stream);
28 | bool isPPT(); // Presumably a format check on the input — confirm.
29 | void getLinks(std::vector& links); // NOTE(review): the vector's template argument is missing in this listing; presumably std::vector<Link> (link.h is included) — confirm.
30 | std::string plainText(const FormattingStyle& formatting);
31 | Metadata metaData();
32 | bool error(); // Presumably true after a failed operation — confirm.
33 | };
34 |
35 | #endif
36 |
--------------------------------------------------------------------------------
/src/xlsb_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_XLSB_PARSER_H
2 | #define DOCTOTEXT_XLSB_PARSER_H
3 |
4 | #include "link.h"
5 | #include
6 | #include
7 |
8 | namespace doctotext
9 | {
10 | struct FormattingStyle;
11 | class Metadata;
12 | }
13 |
14 | using namespace doctotext; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
15 | // Declares the parser for XLSB input; its state sits behind an opaque Implementation pointer defined elsewhere.
16 | class XLSBParser
17 | {
18 | private:
19 | struct Implementation;
20 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
21 |
22 | public:
23 | XLSBParser(const std::string& file_name); // Input given as a file name.
24 | XLSBParser(const char* buffer, size_t size); // Input given as a memory buffer.
25 | ~XLSBParser();
26 | void setVerboseLogging(bool verbose);
27 | void setLogStream(std::ostream& log_stream);
28 | bool isXLSB(); // Presumably a format check on the input — confirm.
29 | void getLinks(std::vector& links); // NOTE(review): the vector's template argument is missing in this listing; presumably std::vector<Link> (link.h is included) — confirm.
30 | std::string plainText(const FormattingStyle& formatting);
31 | bool error(); // Presumably true after a failed operation — confirm.
32 | Metadata metaData();
33 | };
34 |
35 | #endif
36 |
--------------------------------------------------------------------------------
/src/iwork_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_IWORK_PARSER_H
2 | #define DOCTOTEXT_IWORK_PARSER_H
3 |
4 | #include "link.h"
5 | #include
6 | #include
7 |
8 | namespace doctotext
9 | {
10 | struct FormattingStyle;
11 | class Metadata;
12 | }
13 |
14 | using namespace doctotext; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
15 | // Declares the parser for iWork input; its state sits behind an opaque Implementation pointer defined elsewhere.
16 | class IWorkParser
17 | {
18 | private:
19 | struct Implementation;
20 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
21 |
22 | public:
23 | IWorkParser(const std::string& file_name); // Input given as a file name.
24 | IWorkParser(const char* buffer, size_t size); // Input given as a memory buffer.
25 | ~IWorkParser();
26 | void setVerboseLogging(bool verbose);
27 | void setLogStream(std::ostream& log_stream);
28 | bool isIWork(); // Presumably a format check on the input — confirm.
29 | void getLinks(std::vector& links); // NOTE(review): the vector's template argument is missing in this listing; presumably std::vector<Link> (link.h is included) — confirm.
30 | std::string plainText(const FormattingStyle& formatting);
31 | bool error(); // Presumably true after a failed operation — confirm.
32 | Metadata metaData();
33 | };
34 |
35 | #endif
36 |
--------------------------------------------------------------------------------
/src/eml_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_EML_PARSER_H
2 | #define DOCTOTEXT_EML_PARSER_H
3 |
4 | #include "link.h"
5 | #include
6 | #include
7 |
8 | namespace doctotext
9 | {
10 | struct FormattingStyle;
11 | class Metadata;
12 | class Attachment;
13 | }
14 |
15 | using namespace doctotext; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
16 | // Declares the parser for EML input; its state sits behind an opaque Implementation pointer defined elsewhere.
17 | class EMLParser
18 | {
19 | private:
20 | struct Implementation;
21 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
22 |
23 | public:
24 | EMLParser(const std::string& file_name); // Input given as a file name.
25 | EMLParser(const char* buffer, size_t size); // Input given as a memory buffer.
26 | ~EMLParser();
27 | void setVerboseLogging(bool verbose);
28 | void setLogStream(std::ostream& log_stream);
29 | bool isEML(); // Presumably a format check on the input — confirm.
30 | void getLinks(std::vector& links); // NOTE(review): the vector's template argument is missing in this listing; presumably std::vector<Link> (link.h is included) — confirm.
31 | void getAttachments(std::vector& attachments); // NOTE(review): template argument missing as well; presumably std::vector<Attachment> (forward-declared above) — confirm.
32 | std::string plainText(const FormattingStyle& formatting);
33 | Metadata metaData();
34 | bool error(); // Presumably true after a failed operation — confirm.
35 | };
36 |
37 | #endif
38 |
--------------------------------------------------------------------------------
/tests/cpp_api_test.cpp:
--------------------------------------------------------------------------------
1 | #include "../build/metadata.h"
2 | #include "../build/plain_text_extractor.h"
3 |
4 | #include
5 | #include
6 |
7 | using namespace doctotext;
8 |
9 | int main(int argc, char* argv[])
10 | {
11 | // Accepted invocations: "<prog> FILE" or "<prog> --verbose FILE"; anything else exits with status 1.
12 | const bool wants_verbose = (argc == 3) && (std::string(argv[1]) == "--verbose");
13 | if (!wants_verbose && argc != 2)
14 | return 1;
15 | std::string input_path(wants_verbose ? argv[2] : argv[1]);
16 |
17 | PlainTextExtractor extractor;
18 | extractor.setVerboseLogging(wants_verbose);
19 |
20 | // Metadata comes first: print two of its fields, or exit with status 1 if extraction fails.
21 | Metadata meta;
22 | if (!extractor.extractMetadata(input_path, meta))
23 | return 1;
24 | std::cout << "Author: " << meta.author() << std::endl;
25 | std::cout << "Last modified by: " << meta.lastModifiedBy() << std::endl;
26 |
27 | // Then the plain text of the same file, with the same failure status.
28 | std::string text;
29 | if (!extractor.processFile(input_path, text))
30 | return 1;
31 | std::cout << text << std::endl;
32 |
33 | return 0;
34 | }
35 |
--------------------------------------------------------------------------------
/src/xls_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_XLS_PARSER_H
2 | #define DOCTOTEXT_XLS_PARSER_H
3 |
4 | #include "metadata.h"
5 | #include "link.h"
6 | #include
7 | #include
8 |
9 | namespace doctotext
10 | {
11 | struct FormattingStyle;
12 | }
13 | class ThreadSafeOLEStorage;
14 |
15 | using namespace doctotext; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
16 | // Declares the parser for XLS input; its state sits behind an opaque Implementation pointer defined elsewhere.
17 | class XLSParser
18 | {
19 | private:
20 | struct Implementation;
21 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
22 |
23 | public:
24 | XLSParser(const std::string& file_name); // Input given as a file name.
25 | XLSParser(const char* buffer, size_t size); // Input given as a memory buffer.
26 | ~XLSParser();
27 | void setVerboseLogging(bool verbose);
28 | void setLogStream(std::ostream& log_stream);
29 | bool isXLS(); // Presumably a format check on the input — confirm.
30 | void getLinks(std::vector& links); // NOTE(review): the vector's template argument is missing in this listing; presumably std::vector<Link> (link.h is included) — confirm.
31 | std::string plainText(const FormattingStyle& formatting);
32 | std::string plainText(ThreadSafeOLEStorage& storage, const FormattingStyle& formatting); // Overload taking a caller-supplied OLE storage; how it relates to the constructor's input is not visible here.
33 | Metadata metaData();
34 | bool error(); // Presumably true after a failed operation — confirm.
35 | };
36 |
37 | #endif
38 |
--------------------------------------------------------------------------------
/src/html_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_HTML_PARSER_H
2 | #define DOCTOTEXT_HTML_PARSER_H
3 |
4 | #include "link.h"
5 | #include
6 | #include
7 |
8 | namespace doctotext
9 | {
10 | struct FormattingStyle;
11 | class Metadata;
12 | }
13 |
14 | using namespace doctotext; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
15 | // Declares the parser for HTML input; its state sits behind an opaque Implementation pointer defined elsewhere.
16 | class HTMLParser
17 | {
18 | private:
19 | struct Implementation;
20 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
21 |
22 | public:
23 | HTMLParser(const std::string& file_name); // Input given as a file name.
24 | HTMLParser(const char* buffer, size_t size); // Input given as a memory buffer.
25 | ~HTMLParser();
26 | void setVerboseLogging(bool verbose);
27 | void setLogStream(std::ostream& log_stream);
28 | bool isHTML(); // Presumably a format check on the input — confirm.
29 | void getLinks(std::vector& links); // NOTE(review): the vector's template argument is missing in this listing; presumably std::vector<Link> (link.h is included) — confirm.
30 | std::string plainText(const FormattingStyle& formatting);
31 | Metadata metaData();
32 | ///Turns off charset decoding. This is useful when the caller wants to decode the data itself (the EML parser is an example).
33 | void skipCharsetDecoding();
34 | bool error(); // Presumably true after a failed operation — confirm.
35 | };
36 |
37 | #endif
38 |
--------------------------------------------------------------------------------
/3rdparty/pdf_font_metrics.txt:
--------------------------------------------------------------------------------
1 | The binary distribution of doctotext contains information about the 14 core fonts from Adobe.
2 | This data was obtained here:
3 | http://www.ctan.org/tex-archive/fonts/adobe/afm/
4 |
5 | Font metrics are used by PDF parser to compute the boundary of each character
6 | written with a proportional font.
7 |
8 | BEGIN Verbatim copy of the license part
9 |
10 | Adobe Core 35 AFM Files with 229 Glyph Entries - ReadMe
11 |
12 | This file and the 35 PostScript(R) AFM files it accompanies may be
13 | used, copied, and distributed for any purpose and without charge,
14 | with or without modification, provided that all copyright notices
15 | are retained; that the AFM files are not distributed without this
16 | file; that all modifications to this file or any of the AFM files
17 | are prominently noted in the modified file(s); and that this
18 | paragraph is not modified. Adobe Systems has no responsibility or
19 | obligation to support the use of the AFM files.
20 |
21 |
22 | END Verbatim copy of the license part
23 |
--------------------------------------------------------------------------------
/3rdparty/resources/pdf_font_metrics.txt:
--------------------------------------------------------------------------------
1 | The binary distribution of doctotext contains information about the 14 core fonts from Adobe.
2 | This data was obtained here:
3 | http://www.ctan.org/tex-archive/fonts/adobe/afm/
4 |
5 | Font metrics are used by PDF parser to compute the boundary of each character
6 | written with a proportional font.
7 |
8 | BEGIN Verbatim copy of the license part
9 |
10 | Adobe Core 35 AFM Files with 229 Glyph Entries - ReadMe
11 |
12 | This file and the 35 PostScript(R) AFM files it accompanies may be
13 | used, copied, and distributed for any purpose and without charge,
14 | with or without modification, provided that all copyright notices
15 | are retained; that the AFM files are not distributed without this
16 | file; that all modifications to this file or any of the AFM files
17 | are prominently noted in the modified file(s); and that this
18 | paragraph is not modified. Adobe Systems has no responsibility or
19 | obligation to support the use of the AFM files.
20 |
21 |
22 | END Verbatim copy of the license part
23 |
--------------------------------------------------------------------------------
/src/formatting_style.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_FORMATTING_STYLE_H
2 | #define DOCTOTEXT_FORMATTING_STYLE_H
3 |
4 | #include
5 |
6 | namespace doctotext
7 | {
8 | enum TableStyle { TABLE_STYLE_TABLE_LOOK, TABLE_STYLE_ONE_ROW, TABLE_STYLE_ONE_COL, }; // Unscoped enum; the enumerators are visible directly in namespace doctotext.
9 | enum UrlStyle { URL_STYLE_TEXT_ONLY, URL_STYLE_EXTENDED, URL_STYLE_UNDERSCORED, }; // Unscoped enum, as above.
10 | // ListStyle declares its own copy constructor and copy assignment alongside the opaque m_impl pointer.
11 | class ListStyle
12 | {
13 | private:
14 | struct Implementation;
15 | Implementation* m_impl;
16 |
17 | public:
18 | ListStyle();
19 | ~ListStyle();
20 | ListStyle(const ListStyle& style);
21 | ListStyle& operator = (const ListStyle& style);
22 | ///Sets the leading characters for each position in the list. prefix must be encoded in UTF-8. Default value: empty string (no prefix).
23 | void setPrefix(const std::string& prefix);
24 | void setPrefix(const char* prefix);
25 | const char* getPrefix() const; // NOTE(review): returns a raw pointer; presumably valid only while this ListStyle is alive and unchanged — confirm.
26 | };
27 | // Plain aggregate of the three style settings. NOTE(review): table_style and url_style have no initialisers, so a default-initialised FormattingStyle leaves them indeterminate.
28 | struct FormattingStyle
29 | {
30 | TableStyle table_style;
31 | UrlStyle url_style;
32 | ListStyle list_style;
33 | };
34 |
35 | enum XmlParseMode { PARSE_XML, FIX_XML, STRIP_XML }; // Passed as the `mode` argument of plainText() in the ODF/OOXML parser headers.
36 | }
37 |
38 | #endif
39 |
--------------------------------------------------------------------------------
/src/thread_safe_ole_storage.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_THREAD_SAFE_OLE_STORAGE_H
2 | #define DOCTOTEXT_THREAD_SAFE_OLE_STORAGE_H
3 |
4 | #include
5 | #include
6 | #include "wv2/olestorage.h"
7 |
8 | class ThreadSafeOLEStreamReader;
9 | using namespace wvWare; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
10 | // Declares ThreadSafeOLEStorage, derived from AbstractOLEStorage (expected from wv2/olestorage.h, not shown here).
11 | class ThreadSafeOLEStorage : public AbstractOLEStorage
12 | {
13 | private:
14 | struct Implementation;
15 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
16 | public:
17 | ThreadSafeOLEStorage(const std::string& file_name); // Storage given as a file name.
18 | ThreadSafeOLEStorage(const char* buffer, size_t len); // Storage given as a memory buffer; NOTE(review): buffer ownership/lifetime is not visible here.
19 | ~ThreadSafeOLEStorage();
20 | bool isValid() const;
21 | bool open(Mode mode); // `Mode` is not declared in this header; presumably inherited from AbstractOLEStorage — confirm.
22 | void close();
23 | std::string name() const;
24 | std::string getLastError();
25 | bool getStreamsAndStoragesList(std::vector& components); // NOTE(review): the vector's template argument is missing in this listing; the element type cannot be determined from here.
26 | std::string getCurrentDirectory();
27 | bool enterDirectory(const std::string& directory_path);
28 | bool leaveDirectory();
29 | bool readDirectFromBuffer(unsigned char* buffer, int size, int offset);
30 | AbstractOLEStreamReader* createStreamReader(const std::string& stream_path); // NOTE(review): returns a raw pointer; who deletes the reader is not visible here.
31 | private:
32 | void streamDestroyed(OLEStream* stream);
33 | };
34 |
35 | #endif // DOCTOTEXT_THREAD_SAFE_OLE_STORAGE_H
36 |
--------------------------------------------------------------------------------
/src/doctotext_unzip.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_UNZIP_H
2 | #define DOCTOTEXT_UNZIP_H
3 |
4 | #include
5 | // Declares DocToTextUnzip, which reads entries from a zip archive given as a file name or a memory buffer; its state sits behind an opaque Implementation pointer.
6 | class DocToTextUnzip
7 | {
8 | private:
9 | struct Implementation;
10 | Implementation* Impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
11 |
12 | public:
13 | DocToTextUnzip(); // No archive yet; see setArchiveFile() / setBuffer() below.
14 | DocToTextUnzip(const std::string& archive_file_name);
15 | DocToTextUnzip(const char* buffer, size_t size);
16 | void setArchiveFile(const std::string& archive_file_name);
17 | void setBuffer(const char* buffer, size_t size);
18 | ~DocToTextUnzip();
19 | void setLogStream(std::ostream& log_stream);
20 | static void setUnzipCommand(const std::string& command); // Static, so the setting is shared by all instances; NOTE(review): thread-safety is not visible here.
21 | bool open();
22 | void close();
23 | bool exists(const std::string& file_name) const;
24 | bool read(const std::string& file_name, std::string* contents, int num_of_chars = 0) const; // NOTE(review): the default num_of_chars of 0 presumably means "read everything" — confirm.
25 | bool getFileSize(const std::string& file_name, unsigned long& file_size) const;
26 | bool readChunk(const std::string& file_name, std::string* contents, int chunk_size) const;
27 | bool readChunk(const std::string& file_name, char* contents, int chunk_size, int& readed) const; // `readed` is a non-const reference; presumably receives the number of bytes actually read — confirm.
28 | void closeReadingFileForChunks() const;
29 | /**
30 | Loads and caches the zip file directory, which speeds up locating files dramatically. Call it before multiple read() calls.
31 | **/
32 | bool loadDirectory();
33 | };
34 |
35 | #endif
36 |
--------------------------------------------------------------------------------
/src/thread_safe_ole_stream_reader.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_THREAD_SAFE_OLE_STREAM_READER_H
2 | #define DOCTOTEXT_THREAD_SAFE_OLE_STREAM_READER_H
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include "wv2/olestream.h"
9 |
10 | class ThreadSafeOLEStorage;
11 | using namespace wvWare; // NOTE(review): header-scope directive; it also applies to every file that includes this header.
12 | class DataStream;
13 | // Declares ThreadSafeOLEStreamReader, derived from AbstractOLEStreamReader (expected from wv2/olestream.h, not shown here).
14 | class ThreadSafeOLEStreamReader : public AbstractOLEStreamReader
15 | {
16 | friend class ThreadSafeOLEStorage; // Gives the storage class access to the private constructor and the Stream struct below.
17 | private:
18 | struct Implementation;
19 | Implementation* impl; // NOTE(review): raw pointer and no copy operations declared — confirm instances are never copied.
20 | struct Stream
21 | {
22 | uint64_t m_size;
23 | std::vector m_file_positions; // NOTE(review): the vector's template argument is missing in this listing; the element type cannot be determined from here.
24 | uint32_t m_sector_size;
25 | DataStream* m_data_stream; // Raw pointer; ownership is not visible here.
26 | };
27 | ThreadSafeOLEStreamReader(ThreadSafeOLEStorage* storage, Stream& stream); // The only declared constructor, and it is private.
28 | public:
29 | ~ThreadSafeOLEStreamReader();
30 | std::string getLastError();
31 | bool isValid() const;
32 | int tell() const; // Position is reported as int, while size() below returns size_t.
33 | size_t size() const;
34 | bool seek(int offset, int whence = SEEK_SET); // whence defaults to SEEK_SET when omitted.
35 | bool readU8(U8& data); // Each readXX comes in two forms: bool-returning with an output reference, and value-returning.
36 | U8 readU8(); // NOTE(review): the value-returning forms have no visible way to report failure — confirm how errors surface.
37 | bool readS8(S8& data);
38 | S8 readS8();
39 | bool readU16(U16& data);
40 | U16 readU16();
41 | bool readS16(S16& data);
42 | S16 readS16();
43 | bool readU32(U32& data);
44 | U32 readU32();
45 | bool readS32(S32& data);
46 | S32 readS32();
47 | bool read(U8 *buffer, size_t length);
48 | };
49 |
50 | #endif // DOCTOTEXT_THREAD_SAFE_OLE_STREAM_READER_H
51 |
--------------------------------------------------------------------------------
/src/link.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_LINK_H
2 | #define DOCTOTEXT_LINK_H
3 |
4 | #include
5 |
6 | namespace doctotext
7 | {
8 | /**
9 | Describes a link found in a parsed file.
10 | Example:
11 | Suppose that the content of example.html is: "text before link <a href="target">link</a> text after link".
12 | example.html therefore contains one link. We are using the URL_STYLE_TEXT_ONLY style.
13 | The result of parsing this file is: "text before link link text after link".
14 | We should obtain one link with the following values:
15 | getLinkUrl() returns the word "target".
16 | getLinkText() returns the word "link".
17 | The link text is 4 characters long (the word "link"). NOTE(review): the getLinkTextSize() accessor this line used to cite is not declared below.
18 | getLinkTextPosition() returns 17 (because the length of "text before link " is 17).
19 | **/
20 | class Link
21 | {
22 | private:
23 | struct Implementation;
24 | Implementation* impl;
25 |
26 | public:
27 | Link();
28 | Link(const std::string& link_url, const std::string& link_text, size_t link_text_position);
29 | Link(const Link& link);
30 | ~Link();
31 | Link& operator = (const Link& link);
32 | void setLinkText(const std::string& link_text);
33 | void setLinkUrl(const std::string& link_url);
34 | void setLinkTextPosition(size_t link_text_position);
35 | const char* getLinkUrl() const; // NOTE(review): returns a raw pointer; presumably valid only while this Link is alive and unchanged — confirm.
36 | const char* getLinkText() const; // Same lifetime caveat as getLinkUrl().
37 | size_t getLinkTextPosition() const;
38 | };
39 | }
40 |
41 | #endif
42 |
--------------------------------------------------------------------------------
/src/attachment.h:
--------------------------------------------------------------------------------
1 | #ifndef DOCTOTEXT_ATTACHMENT_H
2 | #define DOCTOTEXT_ATTACHMENT_H
3 |
4 | #include