├── .gitignore
├── .github
└── FUNDING.yml
├── .idea
├── .gitignore
├── vcs.xml
├── misc.xml
├── compiler.xml
└── jarRepositories.xml
├── src
├── test
│ ├── resources
│ │ └── com
│ │ │ └── rtfparserkit
│ │ │ ├── parser
│ │ │ ├── standard
│ │ │ │ └── data
│ │ │ │ │ ├── testDefaultEncodingParse.rtf
│ │ │ │ │ ├── testDefaultEncodingParse.xml
│ │ │ │ │ ├── testJapaneseJisEncoding.rtf
│ │ │ │ │ ├── testJapaneseUtf8Encoding.rtf
│ │ │ │ │ ├── testGreekEncoding.rtf
│ │ │ │ │ ├── testJapaneseJisEncodingTwoFonts.rtf
│ │ │ │ │ ├── testEncodingParse.rtf
│ │ │ │ │ ├── testHex.rtf
│ │ │ │ │ ├── testTurkishEncoding.rtf
│ │ │ │ │ ├── testKoreanEncoding.rtf
│ │ │ │ │ ├── testMultiByteHex.rtf
│ │ │ │ │ ├── testJapaneseUtf8Encoding.xml
│ │ │ │ │ ├── testJapaneseJisEncoding.xml
│ │ │ │ │ ├── test950Encoding.rtf
│ │ │ │ │ ├── testStyles.rtf
│ │ │ │ │ ├── testNecCharacters.rtf
│ │ │ │ │ ├── test10007Encoding.rtf
│ │ │ │ │ ├── testSpecialChars.rtf
│ │ │ │ │ ├── testJapaneseJisEncodingTwoFonts.xml
│ │ │ │ │ ├── testGreekEncoding.xml
│ │ │ │ │ ├── testHex.xml
│ │ │ │ │ ├── testNegativeUnicode.rtf
│ │ │ │ │ ├── testEncodingParse.xml
│ │ │ │ │ ├── testKoreanEncoding.xml
│ │ │ │ │ ├── test10001Encoding.rtf
│ │ │ │ │ ├── testUnicode.rtf
│ │ │ │ │ ├── testTurkishEncoding.xml
│ │ │ │ │ ├── testMultiByteHex.xml
│ │ │ │ │ ├── testNecCharacters.xml
│ │ │ │ │ ├── testGitHubIssue6.rtf
│ │ │ │ │ ├── test950Encoding.xml
│ │ │ │ │ ├── testNegativeUnicode.xml
│ │ │ │ │ ├── test10007Encoding.xml
│ │ │ │ │ ├── testSpecialChars.xml
│ │ │ │ │ ├── testStyles.xml
│ │ │ │ │ ├── test437Encoding.rtf
│ │ │ │ │ ├── testUnicode.xml
│ │ │ │ │ ├── test10001Encoding.xml
│ │ │ │ │ ├── test874Encoding.rtf
│ │ │ │ │ ├── test437Encoding.xml
│ │ │ │ │ ├── testGitHubIssue6.xml
│ │ │ │ │ └── testUpr.rtf
│ │ │ └── raw
│ │ │ │ └── data
│ │ │ │ ├── testRawParse.rtf
│ │ │ │ ├── testSpecialChars.rtf
│ │ │ │ ├── testRawParse.xml
│ │ │ │ └── testSpecialChars.xml
│ │ │ └── converter
│ │ │ └── text
│ │ │ └── data
│ │ │ └── testTextConversion.txt
│ └── java
│ │ └── com
│ │ └── rtfparserkit
│ │ ├── parser
│ │ ├── raw
│ │ │ ├── RawRtfParserTest.java
│ │ │ └── ByteBufferTest.java
│ │ └── standard
│ │ │ └── StandardRtfParserTest.java
│ │ ├── converter
│ │ └── text
│ │ │ ├── StringTextConverterTest.java
│ │ │ └── StreamTextConverterTest.java
│ │ └── utils
│ │ └── TestUtilities.java
└── main
│ └── java
│ └── com
│ └── rtfparserkit
│ ├── utils
│ ├── RtfDump.java
│ ├── ImageDump.java
│ ├── ImageListener.java
│ ├── HexUtils.java
│ └── RtfDumpListener.java
│ ├── parser
│ ├── standard
│ │ ├── ParserEventType.java
│ │ ├── IParserEvent.java
│ │ ├── GroupEndEvent.java
│ │ ├── GroupStartEvent.java
│ │ ├── DocumentEndEvent.java
│ │ ├── DocumentStartEvent.java
│ │ ├── ParserState.java
│ │ ├── BinaryBytesEvent.java
│ │ ├── StringEvent.java
│ │ ├── IParserEventHandler.java
│ │ ├── CommandEvent.java
│ │ ├── FontCharset.java
│ │ ├── DefaultEventHandler.java
│ │ ├── UprHandler.java
│ │ ├── StandardRtfParser.java
│ │ └── Encoding.java
│ ├── IRtfParser.java
│ ├── IRtfSource.java
│ ├── RtfListenerAdaptor.java
│ ├── RtfStreamSource.java
│ ├── raw
│ │ ├── ByteBuffer.java
│ │ └── RawRtfParser.java
│ ├── RtfStringSource.java
│ └── IRtfListener.java
│ ├── rtf
│ └── CommandType.java
│ └── converter
│ └── text
│ ├── StringTextConverter.java
│ ├── StreamTextConverter.java
│ └── AbstractTextConverter.java
├── .settings
├── org.eclipse.m2e.core.prefs
└── org.eclipse.jdt.core.prefs
├── .gitattributes
├── .project
├── .classpath
├── README.md
├── pom.xml
└── licence.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: joniles
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testDefaultEncodingParse.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1
2 | Test1\par
3 | }
4 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Explicitly declare test files to have only LF as line ending (Windows!)
2 | # See https://help.github.com/articles/dealing-with-line-endings
3 |
4 | *.txt text eol=lf
5 | *.xml text eol=lf
6 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testDefaultEncodingParse.xml:
--------------------------------------------------------------------------------
1 | Test1
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testJapaneseJisEncoding.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0\nouicompat\deflang1033{\fonttbl{\f0\fnil\fcharset128 Arial Unicode MS;}}
2 | \f0\fs24\'92\'86\'9a\'a0
3 | }
4 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testJapaneseUtf8Encoding.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0\nouicompat\deflang1033{\fonttbl{\f0\fnil\fcharset128\cpg65001 Arial Unicode MS;}}
2 | \'e4\'b8\'ad\'e5\'9c\'8b
3 | }
4 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testGreekEncoding.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0\deflang2057{\fonttbl{\f0\fnil\fcharset0 Tahoma;}{\f1\fnil\fcharset161 Tahoma;}}
2 | \viewkind4\uc1\pard\f0\fs20 Unicode \'80\f1\'d9\f0\par
3 | }
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/raw/data/testRawParse.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0\deflang2057{\fonttbl{\f0\fnil\fcharset0 Calibri;}}
2 | {\*\generator Msftedit 5.41.21.2510;}\viewkind4\uc1\pard\sa200\sl276\slmult1\lang9\f0\fs22 Test1\par
3 | }
4 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testJapaneseJisEncodingTwoFonts.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0\nouicompat\deflang1033{\fonttbl{\f0\fnil\fcharset0 Times New Roman;}{\f1\fnil\fcharset128 Arial Unicode MS;}}
2 | \f1\fs24\'92\'86\'9a\'a0
3 | }
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testEncodingParse.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0\deflang2057{\fonttbl{\f0\fnil\fcharset0 Calibri;}}
2 | {\*\generator Msftedit 5.41.21.2510;}\viewkind4\uc1\pard\sa200\sl276\slmult1\lang9\f0\fs22 Test1\par
3 | }
4 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testHex.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0\deflang2057{\fonttbl{\f0\fnil\fcharset0 Calibri;}}
2 | {\*\generator Msftedit 5.41.21.2510;}\viewkind4\uc1\pard\sa200\sl276\slmult1\lang9\f0\fs22\'41\'42\'43\par
3 | }
4 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testTurkishEncoding.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1254\deff0\nouicompat\deflang1055{\fonttbl{\f0\fnil\fcharset162 Segoe UI;}{\f1\fnil\fcharset0 Segoe UI;}}
2 | {\*\generator Riched20 15.0.4567}{\*\mmathPr\mwrapIndent1440 }\viewkind4\uc1
3 | \pard\f0\fs20 Turkish Encoding.\f1\par
4 | }
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
3 | org.eclipse.jdt.core.compiler.compliance=1.6
4 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
5 | org.eclipse.jdt.core.compiler.release=disabled
6 | org.eclipse.jdt.core.compiler.source=1.6
7 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testKoreanEncoding.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg949\deff0\nouicompat\deflang1033\deflangfe1042{\fonttbl{\f0\fswiss\fprq2\fcharset129 \'b8\'bc\'c0\'ba \'b0\'ed\'b5\'f1;}}
2 | {\*\generator Riched20 14.0.4750.1000;}{\*\mmathPr\mwrapIndent1440}\viewkind4\uc1
3 | \pard\f0\fs20\lang1042 MS Project Addin ProjectPlus\par
4 | }
5 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testMultiByteHex.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg932\deff0\deflang1033\deflangfe1041{\fonttbl{\f0\froman\fprq1\fcharset128 \'82\'6c\'82\'72 \'82\'6f\'83\'53\'83\'56\'83\'62\'83\'4e;}{\f1\fnil\fprq2\fcharset128 Tahoma;}}
2 | {\*\generator Riched20 5.50.99.2014;}\viewkind4\uc1\pard\f0\fs18\lang1041\'82\'a8\'8c\'b3\'8b\'43\'82\'c5\'82\'b7\'82\'a9\'81\'48\f1\par
3 | }
4 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testJapaneseUtf8Encoding.xml:
--------------------------------------------------------------------------------
1 | Arial Unicode MS;中國
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testJapaneseJisEncoding.xml:
--------------------------------------------------------------------------------
1 | Arial Unicode MS;中國
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/test950Encoding.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg950\cocoartf1187\cocoasubrtf340
2 | \cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 LucidaGrande;}
3 | {\colortbl;\red255\green255\blue255;}
4 | {\info
5 | {\title Title}
6 | {\author Test Author}}\paperw11900\paperh16840\margl1440\margr1440\vieww10980\viewh13860\viewkind0
7 | \deftab720
8 | \pard\pardeftab720\ri0\sl560\sa120
9 |
10 | \f0\fs36 \cf0 Test Text\
11 | \pard\pardeftab720\ri0\sl360\sb120
12 |
13 | \fs24 \cf0 Copyright \'a9 2004-2013 Test Project}
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testStyles.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0\deflang2057{\fonttbl{\f0\fnil\fcharset0 Calibri;}{\f1\fnil\fcharset0 Arial;}}
2 | {\*\generator Msftedit 5.41.21.2510;}\viewkind4\uc1\pard\sa200\sl276\slmult1\lang9\f0\fs22 Test 1 - normal.\par
3 | \i Test 2 - italic.\par
4 | \b\i0 Test 3 - bold.\par
5 | \b0 Test 4 -normal.\par
6 | Test 5 - Calibri 11.\par
7 | \f1 Test 6 - Arial 11.\par
8 | \f0 Test 7 - Calibri 11.\par
9 | \f1\fs24 Test 6 - Arial 12.\par
10 | \f0\fs22 Test 7 - Calibri 11.\par
11 | \par
12 | }
13 |
--------------------------------------------------------------------------------
/.idea/compiler.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testNecCharacters.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg932\deff0\deflang1033\deflangfe1041{\fonttbl{\f0\fnil\fcharset0 MS Sans Serif;}
2 | {\f1\froman\fprq1\fcharset128 MS UI Gothic;}}
3 | {\colortbl;\red255\green0\blue0;\red0\green0\blue255;}
4 | \viewkind4\uc1\pard\cf1\lang1041\f0\fs17 BLC U=>L Splice \f1\fs18\'82\'c5U/W No.2 Dancer
5 | \'82\'a9\'82\'e7\'83\'56\'83\'8f\'94\'ad\'90\'b6\'81\'42Set\'8e\'9e\'82\'c9\'95\'5c\'91\'7710\'87\'6f\'82\'d9\'82\'c7\'83\'80\'81\'5b\'83\'6a\'83\'93\'83\'4f\'81\'40
6 | pallet\'92\'ea\'82\'cc
7 | Roll\cf2\f0\fs17 \par }
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/test10007Encoding.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\mac\ansicpg10007\cocoartf102
2 | {\fonttbl\f0\fnil\fcharset77 LucidaGrande;\f1\fnil\fcharset77 Georgia;\f2\fnil\fcharset77 Verdana;
3 | }
4 | {\colortbl;\red255\green255\blue255;}
5 | \margl1440\margr1440\vieww16780\viewh13600\viewkind0
6 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\ql\qnatural
7 |
8 | \f0\fs36 \cf0 \uc0\u1050 \u1086 \u1084 \u1084 \u1072 \u1085 \u1076 \u1072 \u1088 \u1072 \u1079 \u1088 \u1072 \u1073 \u1086 \u1090 \u1095 \u1080 \u1082 \u1086 \u1074
9 | \f1 \
10 | }
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/raw/data/testSpecialChars.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0\deflang2057{\fonttbl{\f0\fnil\fcharset0 Calibri;}}
2 | {\*\generator Msftedit 5.41.21.2510;}\viewkind4\uc1\pard\sa200\sl276\slmult1\lang9\f0\fs22 Tab A\tab B\par
3 | Tab A B\par
4 | CR A\
B\par
5 | LF A\
6 | B\par
7 | This is a line break.\line
8 | Here is the new line.\par
9 | Emdash: \emdash\par
10 | Endash: \endash\par
11 | Emspace: \emspace\par
12 | Enspace: \enspace\par
13 | Qmspace: \qmspace\par
14 | Bullet: \bullet\par
15 | Lquote: \lquote\par
16 | Rquote: \rquote\par
17 | Ldblquote: \ldblquote\par
18 | Rdblquote: \rdblquote\par
19 | }
20 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | rtfparserkit
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 | org.eclipse.m2e.core.maven2Builder
15 |
16 |
17 |
18 |
19 |
20 | org.eclipse.jdt.core.javanature
21 | org.eclipse.m2e.core.maven2Nature
22 |
23 |
24 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testSpecialChars.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0\deflang2057{\fonttbl{\f0\fnil\fcharset0 Calibri;}}
2 | {\*\generator Msftedit 5.41.21.2510;}\viewkind4\uc1\pard\sa200\sl276\slmult1\lang9\f0\fs22 Tab A\tab B\par
3 | Tab A B\par
4 | CR A\
B\par
5 | LF A\
6 | B\par
7 | This is a line break.\line
8 | Here is the new line.\par
9 | Emdash: \emdash\par
10 | Endash: \endash\par
11 | Emspace: \emspace\par
12 | Enspace: \enspace\par
13 | Qmspace: \qmspace\par
14 | Bullet: \bullet\par
15 | Lquote: \lquote\par
16 | Rquote: \rquote\par
17 | Ldblquote: \ldblquote\par
18 | Rdblquote: \rdblquote\par
19 | }
20 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testJapaneseJisEncodingTwoFonts.xml:
--------------------------------------------------------------------------------
1 | Times New Roman;Arial Unicode MS;中國
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testGreekEncoding.xml:
--------------------------------------------------------------------------------
1 | Tahoma;Tahoma;Unicode €Ω
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testHex.xml:
--------------------------------------------------------------------------------
1 | Calibri;Msftedit 5.41.21.2510;ABC
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testNegativeUnicode.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0\deflang2057{\fonttbl{\f0\fnil\fcharset0 Calibri;}}
2 | {\colortbl ;\red0\green0\blue255;}
3 | {\*\generator Msftedit 5.41.21.2510;}\viewkind4\uc1\pard\sa200\sl276\slmult1\lang9\f0\fs22 From: {\field{\*\fldinst{HYPERLINK "http://unicode-table.com/en/#private-use-area"}}{\fldrslt{\ul\cf1 http://unicode-table.com/en/#private-use-area}}}\f0\fs22\par
4 | F700\u-2304?\u-2303?\u-2302?\u-2301?\u-2300?\u-2299?\u-2298?\u-2297?\u-2296?\u-2295?\u-2294?\u-2293?\u-2292?\u-2291?\u-2290?\u-2289?\par
5 | F710\u-2288?\u-2287?\u-2286?\u-2285?\u-2284?\u-2283?\u-2282?\u-2281?\u-2280?\u-2279?\u-2278?\u-2277?\u-2276?\u-2275?\u-2274?\u-2273?\par
6 | F720\u-2272?\u-2271?\u-2270?\u-2269?\u-2268?\u-2267?\u-2266?\par
7 | }
8 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testEncodingParse.xml:
--------------------------------------------------------------------------------
1 | Calibri;Msftedit 5.41.21.2510;Test1
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/raw/data/testRawParse.xml:
--------------------------------------------------------------------------------
1 | Calibri;Msftedit 5.41.21.2510;Test1
--------------------------------------------------------------------------------
/.idea/jarRepositories.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testKoreanEncoding.xml:
--------------------------------------------------------------------------------
1 | 맑은 고딕;Riched20 14.0.4750.1000;MS Project Addin ProjectPlus
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/test10001Encoding.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\mac\ansicpg10001\cocoartf824\cocoasubrtf230
2 | {\fonttbl\f0\fnil\fcharset78 HiraKakuPro-W6;\f1\fswiss\fcharset77 Helvetica-Bold;\f2\fswiss\fcharset77 Helvetica;
3 | \f3\fnil\fcharset78 HiraKakuPro-W3;}
4 | {\colortbl;\red255\green255\blue255;}
5 | {\*\listtable{\list\listtemplateid1\listhybrid{\listlevel\levelnfc23\levelnfcn23\leveljc2\leveljcn2\levelfollow0\levelstartat1\levelspace360\levelindent0{\*\levelmarker \{disc\}}{\leveltext\leveltemplateid0\'02\'05.;}{\levelnumbers\'01;}}{\listname ;}\listid1}}
6 | {\*\listoverridetable{\listoverride\listid1\listoverridecount0\ls1}}
7 | \margl1440\margr1440\vieww12240\viewh8980\viewkind0
8 | \pard\tx220\tx720\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\li720\fi-720\ql\qnatural\pardirnatural
9 | \ls1\ilvl0
10 | \f0\b\fs50 \cf0 \'82\'a8\'93\'c7\'82\'dd\'82\'ad\'82\'be\'82\'b3\'82\'a2
11 | }
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/utils/RtfDump.java:
--------------------------------------------------------------------------------
1 |
2 | package com.rtfparserkit.utils;
3 |
4 | import java.io.FileInputStream;
5 | import java.io.FileOutputStream;
6 | import java.io.InputStream;
7 | import java.io.OutputStream;
8 |
9 | import com.rtfparserkit.parser.IRtfParser;
10 | import com.rtfparserkit.parser.RtfStreamSource;
11 | import com.rtfparserkit.parser.standard.StandardRtfParser;
12 |
13 | public class RtfDump // Command-line entry point: parses the file named by argv[0] with StandardRtfParser, using an RtfDumpListener built on an output stream for the file named by argv[1].
14 | {
15 | public static void main(String[] argv) // argv[0] = input path, argv[1] = output path; the argument count is not checked before indexing
16 | {
17 | try
18 | {
19 | InputStream is = new FileInputStream(argv[0]); // NOTE(review): `is` is not closed anywhere in this block - confirm whether the source/parser closes it
20 | OutputStream os = new FileOutputStream(argv[1]);
21 | IRtfParser parser = new StandardRtfParser();
22 | parser.parse(new RtfStreamSource(is), new RtfDumpListener(os));
23 | os.close(); // only reached when parse() returns normally; skipped if parse() throws
24 | }
25 |
26 | catch (Exception ex) // catches every exception raised above (missing arguments included) and only prints the stack trace
27 | {
28 | ex.printStackTrace();
29 | }
30 | }
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testUnicode.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\fbidis\ansi\ansicpg1252\deff0\deflang2057{\fonttbl{\f0\fnil\fcharset1 Sylfaen;}{\f1\fnil\fcharset0 Sylfaen;}{\f2\fnil\fcharset204 Sylfaen;}{\f3\fnil\fcharset1 Shonar Bangla;}{\f4\fnil\fcharset1 Microsoft Himalaya;}{\f5\fnil\fcharset1 DaunPenh;}{\f6\fswiss\fcharset1 Euphemia;}{\f7\fnil\fcharset134 SimSun;}{\f8\fnil\fcharset238 Calibri;}}
2 | {\*\generator Msftedit 5.41.21.2510;}\viewkind4\uc1\pard\ltrpar\sa200\sl276\slmult1\lang9\f0\fs24\u1329?\u1408?\u1377?\u1396?\par
3 | \f1 Johann Strau\'df\par
4 | \f2 Belgi\f1\'eb\tab Ren\'e9 Magritte\par
5 | \f3\u2476?\u2494?\u2434?\u2482?\u2494?\par
6 | \f4\u3851?\u3937?\u3956?\u3939?\u3853?\par
7 | \f5\u6036?\u6098?\u6042?\u6033?\u6081?\u6047?\u8203?\u8203?\u8203?\u6016?\u6040?\u6098?\u6038?\u6075?\u6023?\u6070?\par
8 | \f6\u5316?\u5319?\u5463?\u5307?\u5285?\u5125?\u5222?\par
9 | \f7\'d6\'d0\'b9\'fa\par
10 | \f8\'c8esk\'e1 republika\par
11 | }
12 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testTurkishEncoding.xml:
--------------------------------------------------------------------------------
1 | Segoe UI;Segoe UI;Riched20 15.0.4567Turkish Encoding.
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testMultiByteHex.xml:
--------------------------------------------------------------------------------
1 | MS Pゴシック;Tahoma;Riched20 5.50.99.2014;お元気ですか?
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/ParserEventType.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | /**
20 | * Types of event which may be raised by the parser. This is the type returned by IParserEvent.getType().
21 | */
22 | enum ParserEventType // package-private: declared without an access modifier
23 | {
24 | BINARY_BYTES_EVENT, COMMAND_EVENT, DOCUMENT_END_EVENT, DOCUMENT_START_EVENT, GROUP_END_EVENT, GROUP_START_EVENT, STRING_EVENT;
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/utils/ImageDump.java:
--------------------------------------------------------------------------------
1 |
2 | package com.rtfparserkit.utils;
3 |
4 | import com.rtfparserkit.parser.IRtfParser;
5 | import com.rtfparserkit.parser.RtfStreamSource;
6 | import com.rtfparserkit.parser.standard.StandardRtfParser;
7 |
8 | import java.io.FileInputStream;
9 | import java.io.InputStream;
10 | import java.util.Map;
11 |
12 | public class ImageDump // Command-line entry point: parses the file named by argv[0] with StandardRtfParser, using an anonymous ImageListener that prints the data it is handed.
13 | {
14 | public static void main(String[] argv) // argv[0] = input path; the argument count is not checked before indexing
15 | {
16 | try
17 | {
18 | InputStream is = new FileInputStream(argv[0]); // NOTE(review): `is` is not closed anywhere in this block - confirm whether the source/parser closes it
19 | IRtfParser parser = new StandardRtfParser();
20 | ImageListener listener = new ImageListener() {
21 | @Override
22 | public void handleImageData(Map data) { // raw Map: the key and value types are not visible from this block
23 | // Handle image data here; this example just prints the map to standard output
24 | System.out.println(data);
25 | }
26 | };
27 | parser.parse(new RtfStreamSource(is), listener);
28 | }
29 |
30 | catch (Exception ex) // catches every exception raised above (missing argument included) and only prints the stack trace
31 | {
32 | ex.printStackTrace();
33 | }
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/IParserEvent.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import com.rtfparserkit.parser.IRtfListener;
20 |
21 | /**
22 | * Represents an event generated by the parser which will at some point
23 | * in the future be passed to the listener.
24 | */
25 | interface IParserEvent // package-private: declared without an access modifier
26 | {
27 | /**
28 | * Retrieve the type of this event as a ParserEventType constant.
29 | */
30 | public ParserEventType getType();
31 |
32 | /**
33 | * Pass this event to the supplied listener.
34 | */
35 | public void fire(IRtfListener listener);
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/IRtfParser.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser;
18 |
19 | import java.io.IOException;
20 |
21 | /**
22 | * Standard interface implemented by an RTF parser.
23 | */
24 | public interface IRtfParser
25 | {
26 | /**
27 | * Parse RTF data. Both the source and the listener are supplied
28 | * by the caller as arguments. When this method is called, the parser reads
29 | * RTF data from the source and calls the listener with details
30 | * of the content. Declared to throw IOException.
31 | */
32 | public void parse(IRtfSource source, IRtfListener listener) throws IOException;
33 | }
34 |
--------------------------------------------------------------------------------
/src/test/java/com/rtfparserkit/parser/raw/RawRtfParserTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.raw;
18 |
19 | import org.junit.Test;
20 |
21 | import com.rtfparserkit.utils.TestUtilities;
22 |
23 | public class RawRtfParserTest // JUnit tests that run RawRtfParser through TestUtilities.assertRtfParserDumpMatches
24 | {
25 | @Test
26 | public void testRawParse() throws Exception
27 | {
28 | TestUtilities.assertRtfParserDumpMatches(this, new RawRtfParser(), "testRawParse"); // presumably resolves the testRawParse.rtf/.xml resources and compares the parser dump with the expected XML - confirm in TestUtilities
29 | }
30 |
31 | @Test
32 | public void testSpecialChars() throws Exception
33 | {
34 | TestUtilities.assertRtfParserDumpMatches(this, new RawRtfParser(), "testSpecialChars"); // same helper, using the "testSpecialChars" name
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/IRtfSource.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser;
18 |
19 | import java.io.IOException;
20 |
21 | /**
22 | * Classes implementing this interface can be used as the source of RTF data for the parser to consume.
23 | */
24 | public interface IRtfSource
25 | {
26 | /**
27 | * Read a single byte. NOTE(review): the end-of-data return value is not specified here; presumably -1 as with InputStream.read() - confirm against implementations.
28 | */
29 | int read() throws IOException;
30 |
31 | /**
32 | * Push back a single byte to allow it to be read again by the parser.
33 | */
34 | void unread(int c) throws IOException;
35 |
36 | /**
37 | * Read enough bytes to fill the array. NOTE(review): the meaning of the int return value is not stated here; presumably the number of bytes read - confirm against implementations.
38 | */
39 | int read(byte[] b) throws IOException;
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/rtf/CommandType.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.rtf;
18 |
/**
 * The kinds of command recognised by the parser.
 * See Appendix B of the RTF specification.
 */
public enum CommandType
{
   /** A control word which represents a special character. */
   Symbol,

   /** A control word which ignores any parameter. */
   Flag,

   /** A control word which distinguishes between the ON and OFF states for the given property. */
   Toggle,

   /** A control word which requires a parameter. */
   Value,

   /** A control word which starts a group or destination. It ignores any parameter. */
   Destination,

   /** Switches the character encoding used from this point in the document. */
   Encoding
}
31 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/GroupEndEvent.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import com.rtfparserkit.parser.IRtfListener;
20 |
21 | /**
22 | * Event represents the end of a group.
23 | */
24 | class GroupEndEvent implements IParserEvent
25 | {
26 | /**
27 | * Pass the event to the listener.
28 | */
29 | @Override
30 | public void fire(IRtfListener listener)
31 | {
32 | listener.processGroupEnd();
33 | }
34 |
35 | /**
36 | * Retrieve the event type.
37 | */
38 | @Override
39 | public ParserEventType getType()
40 | {
41 | return ParserEventType.GROUP_END_EVENT;
42 | }
43 |
44 | @Override
45 | public String toString()
46 | {
47 | return "[GroupEndEvent]";
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testNecCharacters.xml:
--------------------------------------------------------------------------------
1 | MS Sans Serif;MS UI Gothic; ;;;BLC U=>L Splice でU/W No.2 Dancerからシワ発生。Set時に表層10㎜ほどムーニング pallet底のRoll
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testGitHubIssue6.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\deff0{\fonttbl{\f0\froman\fcharset0 Times New Roman;}{\f1\froman\fcharset0 Arial;}{\f2\froman\fcharset0 Courier;}}{\colortbl\red0\green0\blue0;\red255\green255\blue255;}{\stylesheet {\style\s0 \ql\fi0\li0\ri0\f1\fs24\cf0 Normal;}{\style\s3 \ql\fi0\li0\ri0\f1\fs26\b\cf0 heading 3;}{\style\s2 \ql\fi0\li0\ri0\f1\fs28\b\i\cf0 heading 2;}{\style\s1 \ql\fi0\li0\ri0\f1\fs32\b\cf0 heading 1;}}{\*\listtable}{\*\listoverridetable}{\*\generator iText 2.1.7 by 1T3XT}{\info}\paperw12242\paperh15842\margl1425\margr360\margt950\margb1425{\header \pard\plain\s0\qr\fi0\li0\ri0\sl320\plain\f0{\field{\*\fldinst PAGE}{\fldrslt }}\f2\fs24 . \line \par}\pgwsxn12242\pghsxn15842\marglsxn1425\margrsxn360\margtsxn950\margbsxn1425\pard\plain\s0\ql\fi-734\li734\ri0\sb480\sa240\sl240\plain\tx720\tqr\tx9580\tx9720{\f2\fs24\cf0\chcbpat1 \tab }{\f2\fs24\cf0\chcbpat1 INNEN. K\u220?CHE - TAG}\par\pard\plain\s0\qj\fi0\li734\ri864\sb240\sa240\sl240\plain\tx1920\tx3840\tx5760\tx7680\tx9600{\f2\fs24\cf0\chcbpat1 Ein Absatz mit Line-Separator:\line Der geht hier auf einer neuen Zeile weiter.}\par\pard\plain\s0\ql\fi-734\li734\ri0\sb480\sa240\sl240\plain\tx720\tqr\tx9580\tx9720{\f2\fs24\cf0\chcbpat1 \tab }{\f2\fs24\cf0\chcbpat1 INNEN. K\u220?CHE - TAG}\par\pard\plain\s0\qj\fi0\li734\ri864\sb240\sl240\plain\tx1920\tx3840\tx5760\tx7680\tx9600{\f2\fs24\cf0\chcbpat1 Hier ist die zweite Szene.}\par}
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/GroupStartEvent.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import com.rtfparserkit.parser.IRtfListener;
20 |
21 | /**
22 | * Event represents the start of a group.
23 | */
24 | class GroupStartEvent implements IParserEvent
25 | {
26 | /**
27 | * Pass the event to the listener.
28 | */
29 | @Override
30 | public void fire(IRtfListener listener)
31 | {
32 | listener.processGroupStart();
33 | }
34 |
35 | /**
36 | * Retrieve the event type.
37 | */
38 | @Override
39 | public ParserEventType getType()
40 | {
41 | return ParserEventType.GROUP_START_EVENT;
42 | }
43 |
44 | @Override
45 | public String toString()
46 | {
47 | return "[GroupStartEvent]";
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/DocumentEndEvent.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import com.rtfparserkit.parser.IRtfListener;
20 |
21 | /**
22 | * Event represents the end of a document.
23 | */
24 | class DocumentEndEvent implements IParserEvent
25 | {
26 | /**
27 | * Pass the event to the listener.
28 | */
29 | @Override
30 | public void fire(IRtfListener listener)
31 | {
32 | listener.processDocumentEnd();
33 | }
34 |
35 | /**
36 | * Retrieve the event type.
37 | */
38 | @Override
39 | public ParserEventType getType()
40 | {
41 | return ParserEventType.DOCUMENT_END_EVENT;
42 | }
43 |
44 | @Override
45 | public String toString()
46 | {
47 | return "[DocumentEndEvent]";
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/DocumentStartEvent.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import com.rtfparserkit.parser.IRtfListener;
20 |
21 | /**
22 | * Event represents the start of a document.
23 | */
24 | class DocumentStartEvent implements IParserEvent
25 | {
26 | /**
27 | * Pass the event to the listener.
28 | */
29 | @Override
30 | public void fire(IRtfListener listener)
31 | {
32 | listener.processDocumentStart();
33 | }
34 |
35 | /**
36 | * Retrieve the event type.
37 | */
38 | @Override
39 | public ParserEventType getType()
40 | {
41 | return ParserEventType.DOCUMENT_START_EVENT;
42 | }
43 |
44 | @Override
45 | public String toString()
46 | {
47 | return "[DocumentStartEvent]";
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/test950Encoding.xml:
--------------------------------------------------------------------------------
1 | LucidaGrande;;;TitleTest AuthorTest TextCopyright © 2004-2013 Test Project
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testNegativeUnicode.xml:
--------------------------------------------------------------------------------
1 | Calibri;;;Msftedit 5.41.21.2510;From: HYPERLINK "http://unicode-table.com/en/#private-use-area"http://unicode-table.com/en/#private-use-areaF700F710F720
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/test10007Encoding.xml:
--------------------------------------------------------------------------------
1 | LucidaGrande;Georgia;Verdana;;;Комманда разработчиков
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/converter/text/StringTextConverter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.converter.text;
18 |
19 | import java.io.IOException;
20 |
21 | import com.rtfparserkit.parser.IRtfSource;
22 |
23 | /**
24 | * This class implements a trivial RTF to text converter.
25 | * The extracted text is cached in a buffer and is available
26 | * to the caller using the getText() method.
27 | */
28 | public class StringTextConverter extends AbstractTextConverter
29 | {
30 | @Override
31 | public void convert(IRtfSource source) throws IOException
32 | {
33 | buffer.setLength(0);
34 | super.convert(source);
35 | }
36 |
37 | @Override
38 | public void processExtractedText(String text)
39 | {
40 | buffer.append(text);
41 | }
42 |
43 | public String getText()
44 | {
45 | return buffer.toString();
46 | }
47 |
48 | private StringBuilder buffer = new StringBuilder();
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/utils/ImageListener.java:
--------------------------------------------------------------------------------
1 | package com.rtfparserkit.utils;
2 |
3 | import com.rtfparserkit.parser.RtfListenerAdaptor;
4 | import com.rtfparserkit.rtf.Command;
5 |
6 | import java.util.HashMap;
7 | import java.util.Map;
8 |
9 | public abstract class ImageListener extends RtfListenerAdaptor
10 | {
11 | public abstract void handleImageData(Map data);
12 |
13 | @Override public void processGroupStart()
14 | {
15 | ++groupDepth;
16 | }
17 |
18 | @Override public void processGroupEnd()
19 | {
20 | --groupDepth;
21 | if (pictData != null && groupDepth < pictGroupDepth)
22 | {
23 | handleImageData(pictData);
24 | pictData = null;
25 | }
26 | }
27 |
28 | @Override public void processCommand(Command command, int parameter, boolean hasParameter, boolean optional)
29 | {
30 | if (pictData != null)
31 | {
32 | Integer value = hasParameter ? Integer.valueOf(parameter) : null;
33 | pictData.put(command.getCommandName(), value);
34 | }
35 | else
36 | {
37 | if (command == Command.pict)
38 | {
39 | pictGroupDepth = groupDepth;
40 | pictData = new HashMap();
41 | }
42 | }
43 | }
44 |
45 | @Override public void processString(String string)
46 | {
47 | if (pictData != null)
48 | {
49 | pictData.put("data", string);
50 | }
51 | }
52 |
53 | private int groupDepth;
54 | private int pictGroupDepth;
55 | private Map pictData;
56 | }
57 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/ParserState.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | /**
20 | * A simple "struct" (hence the public members) representing the current state of the parser.
21 | */
22 | class ParserState
23 | {
24 | public ParserState()
25 | {
26 |
27 | }
28 |
29 | public ParserState(ParserState state)
30 | {
31 | currentFontExplicitlySet = state.currentFontExplicitlySet;
32 | currentFont = state.currentFont;
33 | currentEncoding = state.currentEncoding;
34 | currentFontEncoding = state.currentFontEncoding;
35 | unicodeAlternateSkipCount = state.unicodeAlternateSkipCount;
36 | }
37 |
38 | public boolean currentFontExplicitlySet = false;
39 | public int currentFont;
40 | public String currentEncoding = Encoding.ANSI_ENCODING;
41 | public String currentFontEncoding;
42 | public int unicodeAlternateSkipCount = 1;
43 | }
44 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testSpecialChars.xml:
--------------------------------------------------------------------------------
1 | Calibri;Msftedit 5.41.21.2510;Tab ABTab ABCR ABLF ABThis is a line break.Here is the new line.Emdash: —Endash: –Emspace: Enspace: Qmspace: Bullet: •Lquote: ‘Rquote: ’Ldblquote: “Rdblquote: ”
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/BinaryBytesEvent.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import com.rtfparserkit.parser.IRtfListener;
20 |
21 | /**
22 | * Represents an event to be sent to the listener.
23 | */
24 | class BinaryBytesEvent implements IParserEvent
25 | {
26 | /**
27 | * Constructor.
28 | */
29 | public BinaryBytesEvent(byte[] data)
30 | {
31 | this.data = data;
32 | }
33 |
34 | /**
35 | * Retrieve the event type.
36 | */
37 | @Override
38 | public ParserEventType getType()
39 | {
40 | return ParserEventType.BINARY_BYTES_EVENT;
41 | }
42 |
43 | /**
44 | * Pass the event to the listener.
45 | */
46 | @Override
47 | public void fire(IRtfListener listener)
48 | {
49 | listener.processBinaryBytes(data);
50 | }
51 |
52 | @Override
53 | public String toString()
54 | {
55 | return "[BinaryBytesEvent " + data.length + " bytes]";
56 | }
57 |
58 | private final byte[] data;
59 | }
60 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testStyles.xml:
--------------------------------------------------------------------------------
1 | Calibri;Arial;Msftedit 5.41.21.2510;Test 1 - normal.Test 2 - italic.Test 3 - bold.Test 4 -normal.Test 5 - Calibri 11.Test 6 - Arial 11.Test 7 - Calibri 11.Test 6 - Arial 12.Test 7 - Calibri 11.
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/converter/text/StreamTextConverter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.converter.text;
18 |
19 | import java.io.IOException;
20 | import java.io.OutputStream;
21 | import java.nio.charset.Charset;
22 |
23 | import com.rtfparserkit.parser.IRtfSource;
24 |
25 | /**
26 | * This class implements a trivial RTF to text converter.
27 | * The extracted text is written to the OutputStream as it is extracted.
28 | */
29 | public class StreamTextConverter extends AbstractTextConverter
30 | {
31 | public void convert(IRtfSource source, OutputStream os, String outputCharsetName) throws IOException
32 | {
33 | this.os = os;
34 | this.charset = Charset.forName(outputCharsetName);
35 | convert(source);
36 | }
37 |
38 | @Override
39 | public void processExtractedText(String text)
40 | {
41 | try
42 | {
43 | os.write(text.getBytes(charset));
44 | }
45 |
46 | catch (IOException ex)
47 | {
48 | throw new RuntimeException(ex);
49 | }
50 | }
51 |
52 | private Charset charset;
53 | private OutputStream os;
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/StringEvent.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import com.rtfparserkit.parser.IRtfListener;
20 |
21 | /**
22 | * Represents an event to be sent to the listener.
23 | */
24 | class StringEvent implements IParserEvent
25 | {
26 | /**
27 | * Constructor.
28 | */
29 | public StringEvent(String data)
30 | {
31 | this.data = data;
32 | }
33 |
34 | /**
35 | * Retrieve the event type.
36 | */
37 | @Override
38 | public ParserEventType getType()
39 | {
40 | return ParserEventType.STRING_EVENT;
41 | }
42 |
43 | /**
44 | * Pass the event to the listener.
45 | */
46 | @Override
47 | public void fire(IRtfListener listener)
48 | {
49 | listener.processString(data);
50 | }
51 |
52 | /**
53 | * Retrieve the string data.
54 | */
55 | public String getString()
56 | {
57 | return data;
58 | }
59 |
60 | @Override
61 | public String toString()
62 | {
63 | return "[StringEvent data=" + data + "]";
64 | }
65 |
66 | private final String data;
67 | }
68 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/test437Encoding.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1 \mac \ansicpg437 \cocoartf102 {\fonttbl {\f0 \fnil \fcharset77 Times New Roman{\*\falt Times}
2 | ;}
3 | {\f1 \fnil \fcharset77 Helvetica-Bold{\*\falt Helvetica}
4 | ;}
5 | {\f2 \fnil \fcharset77 Helvetica;}
6 | {\f3 \fnil \fcharset77 LucidaGrande{\*\falt Lucida Grande}
7 | ;}
8 | }
9 | {\colortbl ;\red0 \green0 \blue0 ;}
10 | {\stylesheet {\*\cs335 {\*\nsmpltxt The quick brown fox jumped over the lazy dogs.}
11 | \super footnote reference;}
12 | {\*\cs336 {\*\nsmpltxt The quick brown fox jumped over the lazy dogs.}
13 | \super endnote reference;}
14 | {\s337 \nisusnoteplacement0 \nisusreferencestyle335 {\*\nsmpltxt Some text goes here so you can see what your style will look like.}
15 | \f3 footnote text;}
16 | {\s338 \nisusnoteplacement1 \nisusreferencestyle336 {\*\nsmpltxt Sample text for Foot/End Notes Style}
17 | \f3 endnote text;}
18 | }
19 | \deftab720 \defformat \viewkind1 \viewzk1 {\*\nisuswindow \x70 \y194 \w741 \h638 }
20 | \nshwinv0 \nshwpg1 \hyphauto0 \ftnnar \endnotes \aendnotes \aftnnar \fet2 \ftnbj \paperw12240 \paperh15840 \margl1440 \margr1440 \margt1440 \margb1440 \gutter0 \pgnstart1 \nocolbal \sectd \sbknone \cols1 \ltrsect \colbalsxn0 \marglsxn1440 \margrsxn1440 \margtsxn1440 \margbsxn1440 \guttersxn0 \headery720 \footery720 \pgnstarts1 \pgnrestart \pgndec \sxnstarts1 \sxnrestart \sxndec {\header \pard \ql \sb0 \sa0 \sl240 \slmult1 \ilvl0 \li0 \lin0 \fi0 \ri0 \rin0 \par }
21 | {\footer \pard \ql \sb0 \sa0 \sl240 \slmult1 \ilvl0 \li0 \lin0 \fi0 \ri0 \rin0 \par }
22 | {\pard \ql \sb0 \sa0 \sl240 \slmult1 \ilvl0 \li0 \lin0 \fi0 \ri0 \rin0 {\f1 \fs24 \b \cf1 Test Document\par
23 | \f2 \b0 \par
24 | \b Test Title: \tab (Test Subtitle)\par
25 | \b0 \par
26 | \b Test Heading:\b0 \par
27 | Test Text. \par}
28 | }
29 | }
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/RtfListenerAdaptor.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser;
18 |
19 | import com.rtfparserkit.rtf.Command;
20 |
21 | /**
22 | * This adaptor class is provided as a convenience for users of the IRtfListener
23 | * interface. Subclass this class to provide an implementation of IRtfListener
24 | * and override just the methods you need.
25 | */
26 | public class RtfListenerAdaptor implements IRtfListener
27 | {
28 | @Override
29 | public void processDocumentStart()
30 | {
31 |
32 | }
33 |
34 | @Override
35 | public void processDocumentEnd()
36 | {
37 |
38 | }
39 |
40 | @Override
41 | public void processGroupStart()
42 | {
43 |
44 | }
45 |
46 | @Override
47 | public void processGroupEnd()
48 | {
49 |
50 | }
51 |
52 | @Override
53 | public void processCharacterBytes(byte[] data)
54 | {
55 |
56 | }
57 |
58 | @Override
59 | public void processBinaryBytes(byte[] data)
60 | {
61 |
62 | }
63 |
64 | @Override
65 | public void processString(String string)
66 | {
67 |
68 | }
69 |
70 | @Override
71 | public void processCommand(Command command, int parameter, boolean hasParameter, boolean optional)
72 | {
73 |
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/raw/data/testSpecialChars.xml:
--------------------------------------------------------------------------------
1 | Calibri;Msftedit 5.41.21.2510;Tab ABTab ABCR ABLF ABThis is a line break.Here is the new line.Emdash: Endash: Emspace: Enspace: Qmspace: Bullet: Lquote: Rquote: Ldblquote: Rdblquote:
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/RtfStreamSource.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser;
18 |
19 | import java.io.BufferedInputStream;
20 | import java.io.IOException;
21 | import java.io.InputStream;
22 |
23 | /**
24 | * Implementation of IRtfSource which will read the RTF file from a stream.
25 | */
26 | public class RtfStreamSource implements IRtfSource
27 | {
28 | private final InputStream stream;
29 | private int pushBackChar = -1;
30 |
31 | public RtfStreamSource(InputStream stream)
32 | {
33 | if (stream instanceof BufferedInputStream)
34 | {
35 | this.stream = stream;
36 | }
37 | else
38 | {
39 | this.stream = new BufferedInputStream(stream);
40 | }
41 | }
42 |
43 | @Override
44 | public int read() throws IOException
45 | {
46 | int result;
47 |
48 | if (pushBackChar != -1)
49 | {
50 | result = pushBackChar;
51 | pushBackChar = -1;
52 | }
53 | else
54 | {
55 | result = stream.read();
56 | }
57 |
58 | return result;
59 | }
60 |
61 | @Override
62 | public void unread(int c) throws IOException
63 | {
64 | if (pushBackChar != -1)
65 | {
66 | throw new IOException("Unread not possible");
67 | }
68 |
69 | pushBackChar = c;
70 | }
71 |
72 | @Override
73 | public int read(byte[] b) throws IOException
74 | {
75 | return stream.read(b);
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/src/test/java/com/rtfparserkit/parser/raw/ByteBufferTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.raw;
18 |
19 | import static org.junit.Assert.assertArrayEquals;
20 | import static org.junit.Assert.assertEquals;
21 | import static org.junit.Assert.assertFalse;
22 | import static org.junit.Assert.assertTrue;
23 |
24 | import java.util.Random;
25 |
26 | import org.junit.Test;
27 |
28 | public class ByteBufferTest
29 | {
30 | @Test
31 | public void testBasicOperations()
32 | {
33 | ByteBuffer buffer = new ByteBuffer();
34 | byte[] array = buffer.toArray();
35 | assertEquals(0, array.length);
36 | assertTrue(buffer.isEmpty());
37 |
38 | buffer.add(1);
39 | array = buffer.toArray();
40 | assertEquals(1, array.length);
41 | assertEquals(1, array[0]);
42 | assertFalse(buffer.isEmpty());
43 |
44 | buffer.clear();
45 | array = buffer.toArray();
46 | assertEquals(0, array.length);
47 | }
48 |
49 | @Test
50 | public void testBufferSizeIncrease()
51 | {
52 | Random r = new Random();
53 |
54 | byte[] array = new byte[1025];
55 | ByteBuffer buffer = new ByteBuffer();
56 |
57 | for (int loop = 0; loop < array.length; loop++)
58 | {
59 | byte value = (byte) r.nextInt();
60 | array[loop] = value;
61 | buffer.add(value);
62 | }
63 |
64 | byte[] actualArray = buffer.toArray();
65 | assertArrayEquals(array, actualArray);
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/IParserEventHandler.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | /**
20 | * Represents a handler which will consume events raised by the parser and handle them
21 | * appropriately. By default this will typically mean passing them to the listener,
22 | * but there may be cases where we may wish to implement something like a state machine
23 | * to consume a set of related events, then take some action based on the complete set
24 | * of events read, rather than reacting to events one at a time.
25 | *
26 | * This interface allows this functionality to be switched in and out as required.
27 | */
28 | interface IParserEventHandler
29 | {
30 | /**
31 | * The parser informs the handler of an event.
32 | */
33 | public void handleEvent(IParserEvent event);
34 |
35 | /**
36 | * Retrieve the last event seen by the handler.
37 | */
38 | public IParserEvent getLastEvent();
39 |
40 | /**
41 | * Assumes the handler is buffering events, and removes the last event from this buffer.
42 | */
43 | public void removeLastEvent();
44 |
45 | /**
46 | * Returns false if this handler is OK to receive further events, or true
47 | * if this handler is complete, and the previous handler should be used again.
48 | * This assumes that the parser is keeping a stack of handlers and popping the
49 | * last handler from the stack when the current handler has consumed all the events
50 | * it can.
51 | */
52 | public boolean isComplete();
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/CommandEvent.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import com.rtfparserkit.parser.IRtfListener;
20 | import com.rtfparserkit.rtf.Command;
21 |
22 | /**
23 | * Represents an event to be sent to the listener.
24 | */
25 | class CommandEvent implements IParserEvent
26 | {
27 | /**
28 | * Constructor.
29 | */
30 | public CommandEvent(Command command, int parameter, boolean hasParameter, boolean optional)
31 | {
32 | this.command = command;
33 | this.parameter = parameter;
34 | this.hasParameter = hasParameter;
35 | this.optional = optional;
36 | }
37 |
38 | /**
39 | * Retrieve the event type.
40 | */
41 | @Override
42 | public ParserEventType getType()
43 | {
44 | return ParserEventType.COMMAND_EVENT;
45 | }
46 |
47 | /**
48 | * Pass the event to the listener.
49 | */
50 | @Override
51 | public void fire(IRtfListener listener)
52 | {
53 | listener.processCommand(command, parameter, hasParameter, optional);
54 | }
55 |
56 | /**
57 | * Retrieve the command represented by this event.
58 | */
59 | public Command getCommand()
60 | {
61 | return command;
62 | }
63 |
64 | @Override
65 | public String toString()
66 | {
67 | return "[CommandEvent command=" + command + (hasParameter ? " parameter=" + parameter : "") + (optional ? " optional" : "") + "]";
68 | }
69 |
70 | private final Command command;
71 | private final int parameter;
72 | private final boolean hasParameter;
73 | private final boolean optional;
74 | }
75 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/raw/ByteBuffer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.raw;
18 |
/**
 * Implements a simple byte array based buffer. Used to collect
 * data one byte at a time into a buffer, then pass the
 * collected data to the caller as an array.
 */
class ByteBuffer
{
   /**
    * Add a byte to the buffer, growing the backing array by half of its
    * current length when it is full.
    *
    * @param b value to add; only the low eight bits are stored
    */
   public void add(int b)
   {
      if (bufferSize == buffer.length)
      {
         buffer = java.util.Arrays.copyOf(buffer, buffer.length + (buffer.length >> 1));
      }

      buffer[bufferSize++] = (byte) b;
   }

   /**
    * Clear the buffer. The backing array is kept so it can be reused.
    */
   public void clear()
   {
      bufferSize = 0;
   }

   /**
    * Return the buffer contents as an array.
    *
    * @return a new array holding exactly the bytes added since the buffer
    *         was created or last cleared
    */
   public byte[] toArray()
   {
      return java.util.Arrays.copyOf(buffer, bufferSize);
   }

   /**
    * Determines if the buffer is empty.
    *
    * @return true if no bytes are currently held
    */
   public boolean isEmpty()
   {
      return bufferSize == 0;
   }

   /**
    * Debug representation of the buffer.
    *
    * The bytes are decoded as ISO-8859-1 rather than with the platform
    * default charset: every byte maps to exactly one character, so the
    * output is the same on every machine and no byte is lost or replaced.
    */
   @Override
   public String toString()
   {
      return "[ByteBuffer bufferSize=" + bufferSize + " buffer=" + new String(buffer, 0, bufferSize, DEBUG_CHARSET) + "]";
   }

   private static final int INITIAL_BUFFER_CAPACITY = 10240;
   private static final java.nio.charset.Charset DEBUG_CHARSET = java.nio.charset.Charset.forName("ISO-8859-1");
   private int bufferSize;
   private byte[] buffer = new byte[INITIAL_BUFFER_CAPACITY];
}
77 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/RtfStringSource.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser;
18 |
19 | import java.io.IOException;
20 |
21 | /**
22 | * This is a hack... as a convenience it allows RTF data held in a string
23 | * to be parsed. The reason it's a hack is that RTF commands are used to
24 | * manage character encoding, and it's unlikely that the caller handled
25 | * this when they read the RTF in to the string from its original source.
26 | * It is also possible for an RTF file to contain raw binary data, which this
27 | * class will definitely not deal with.
28 | *
29 | * Having said all of that, in the vast majority of cases this approach
30 | * will work fine. It's also more efficient than creating an InputStream
31 | * from a String - which usually involves duplicating byte arrays.
32 | */
33 | public class RtfStringSource implements IRtfSource
34 | {
35 | private final String data;
36 | private int index;
37 |
38 | public RtfStringSource(String data)
39 | {
40 | this.data = data;
41 | }
42 |
43 | @Override
44 | public int read() throws IOException
45 | {
46 | int result;
47 |
48 | if (index == data.length())
49 | {
50 | result = -1;
51 | }
52 | else
53 | {
54 | result = data.charAt(index++);
55 | }
56 |
57 | return result;
58 | }
59 |
60 | @Override
61 | public void unread(int c) throws IOException
62 | {
63 | if (index == 0)
64 | {
65 | throw new IOException("Unread not possible");
66 | }
67 | --index;
68 | }
69 |
70 | @Override
71 | public int read(byte[] b) throws IOException
72 | {
73 | throw new UnsupportedOperationException();
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/test/java/com/rtfparserkit/converter/text/StringTextConverterTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.converter.text;
18 |
19 | import static org.junit.Assert.assertEquals;
20 |
21 | import java.io.IOException;
22 | import java.io.InputStream;
23 |
24 | import org.junit.Test;
25 |
26 | import com.rtfparserkit.parser.RtfStreamSource;
27 | import com.rtfparserkit.utils.TestUtilities;
28 |
29 | public class StringTextConverterTest
30 | {
31 | @Test
32 | public void testTextConversion() throws IOException
33 | {
34 | StringTextConverter tc = new StringTextConverter();
35 |
36 | InputStream is = null;
37 | try
38 | {
39 | is = StringTextConverterTest.class.getResourceAsStream("data/testTextConversion.rtf");
40 | tc.convert(new RtfStreamSource(is));
41 | }
42 |
43 | finally
44 | {
45 | if (is != null)
46 | {
47 | try
48 | {
49 | is.close();
50 | }
51 |
52 | catch (Exception ex)
53 | {
54 | // Ignore
55 | }
56 | }
57 | }
58 |
59 | InputStream expectedStream = StringTextConverterTest.class.getResourceAsStream("data/testTextConversion.txt");
60 | try
61 | {
62 | expectedStream = StringTextConverterTest.class.getResourceAsStream("data/testTextConversion.txt");
63 | String expectedText = TestUtilities.readStreamToString(expectedStream);
64 | String actualText = tc.getText();
65 | assertEquals(expectedText, actualText);
66 | }
67 |
68 | finally
69 | {
70 | if (expectedStream != null)
71 | {
72 | try
73 | {
74 | expectedStream.close();
75 | }
76 |
77 | catch (Exception ex)
78 | {
79 | // Ignore
80 | }
81 | }
82 | }
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testUnicode.xml:
--------------------------------------------------------------------------------
1 | Sylfaen;Sylfaen;Sylfaen;Shonar Bangla;Microsoft Himalaya;DaunPenh;Euphemia;SimSun;Calibri;Msftedit 5.41.21.2510;ԱրամJohann StraußBelgiëRené Magritteবাংলা་ཡུལ།ប្រទេសកម្ពុជាᓄᓇᕗᒻᒥᐅᑦ中国Česká republika
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/IRtfListener.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser;
18 |
19 | import com.rtfparserkit.rtf.Command;
20 |
21 | /**
22 | * Listener interface implemented by classes to receive event from an RTF parser.
23 | */
24 | public interface IRtfListener
25 | {
26 | /**
27 | * Event raised when the parser starts to read the document.
28 | */
29 | public void processDocumentStart();
30 |
31 | /**
32 | * Event raised when the parser reaches the end of the document.
33 | */
34 | public void processDocumentEnd();
35 |
36 | /**
37 | * Event raised when the stat of a group is encountered.
38 | */
39 | public void processGroupStart();
40 |
41 | /**
42 | * Event raised when the end of a group is encountered.
43 | */
44 | public void processGroupEnd();
45 |
46 | /**
47 | * Event raised by the raw RTF parser to pass bytes representing characters.
48 | * Note that these bytes will be "as read" from the RTF file, and have not been
49 | * processed to account for the current encoding.
50 | */
51 | public void processCharacterBytes(byte[] data);
52 |
53 | /**
54 | * Event raised by an RTF parser to pass binary bytes.
55 | */
56 | public void processBinaryBytes(byte[] data);
57 |
58 | /**
59 | * Event raised by the RTF parser to pass string data. Note that the string
60 | * has been created with the appropriate encoding and no further processing will
61 | * be required.
62 | */
63 | public void processString(String string);
64 |
65 | /**
66 | * Event raised by the RTF parser detailing a command read from the file.
67 | * The parameter argument details the optional integer parameter associated with the
68 | * command. If the hasParameter flag is false, the parameter argument can be ignored.
69 | * If the optional flag is set, this is a command which RTF readers can choose not
70 | * to implement.
71 | */
72 | public void processCommand(Command command, int parameter, boolean hasParameter, boolean optional);
73 | }
74 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/converter/text/data/testTextConversion.txt:
--------------------------------------------------------------------------------
1 | First Heading
2 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis ornare dictum congue. Vivamus vel sapien sed enim pharetra molestie et eu odio. Sed id dui eu massa pulvinar ullamcorper. Nam ornare diam at massa porttitor, a tincidunt eros molestie. Duis lacinia nulla tempor feugiat convallis. Curabitur hendrerit sagittis aliquet. Vivamus porta molestie felis, lobortis sodales lacus accumsan sit amet. Aliquam ultricies porta congue. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos.
3 | Duis eleifend diam adipiscing purus dictum lobortis at sit amet arcu. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Mauris ac sollicitudin eros, in sodales diam. Suspendisse vitae sem quis nunc hendrerit vulputate vel vulputate sapien. Nulla pretium semper ipsum id sollicitudin. Vivamus eu congue leo. Nulla blandit fringilla luctus. Nulla ac erat laoreet, consequat felis quis, suscipit purus. Praesent varius pellentesque tortor, ut condimentum magna vulputate at. Cras posuere tortor ac turpis lobortis posuere. Suspendisse quam odio, iaculis eu turpis eu, mattis placerat neque. Nam aliquam accumsan neque ut commodo. Nulla porta nulla elit, eu semper tellus bibendum sit amet. Sed dolor odio, placerat eleifend tristique in, tempus in elit. Nam sed eros elit. Nam rhoncus diam et diam bibendum, at vestibulum lorem tempor.
4 | Second Heading
5 | Suspendisse venenatis rutrum tincidunt. Integer vulputate erat id luctus facilisis. Phasellus nibh mi, ornare vitae felis vitae, euismod interdum nisl. Etiam tempus aliquet tellus. Aliquam nulla mi, vestibulum nec odio vitae, adipiscing vehicula tellus. Fusce nisl risus, pharetra gravida est sit amet, auctor aliquet urna. Ut mattis odio ut risus tempus posuere. Donec facilisis, ante a fermentum euismod, sapien diam consectetur tellus, quis interdum libero neque vitae urna. Aenean id lectus auctor, varius metus quis, sollicitudin neque. Fusce nec tortor ut ligula ornare rhoncus in ut libero. Praesent ultrices nunc eget placerat consectetur. Aliquam erat volutpat. Mauris dapibus blandit elit a elementum.
6 | Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Fusce quis enim justo. Suspendisse tempor dictum cursus. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Cras hendrerit mi non ante iaculis, vitae consequat velit dictum. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Curabitur massa erat, congue at sapien at, posuere porttitor felis.
7 |
8 | Third Heading
9 | Here is a table:
10 | Heading 1 Heading 2 Heading 3
11 | Aaa Bbb Ccc
12 | Ddd Eee Fff
13 | Ggg Hhh Iii
14 | Jjj Kkk Lll
15 |
16 |
17 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/test10001Encoding.xml:
--------------------------------------------------------------------------------
1 | HiraKakuPro-W6;Helvetica-Bold;Helvetica;HiraKakuPro-W3;;;{disc}.;;;お読みください
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/utils/HexUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.utils;
18 |
19 | import java.util.Arrays;
20 |
/**
 * Utilities for working with hex numbers.
 */
public class HexUtils
{
   /**
    * Parse a hex digit.
    *
    * @param ch character code of the digit: '0'-'9', 'A'-'F' or 'a'-'f'
    * @return the digit's value, in the range 0 to 15
    * @throws IllegalArgumentException if ch is not a hex digit
    */
   public static int parseHexDigit(int ch)
   {
      // Range-check explicitly instead of relying on an
      // IndexOutOfBoundsException to detect values outside the lookup table.
      int b = (ch >= 0 && ch < HEX_MAP.length) ? HEX_MAP[ch] : -1;

      if (b == -1)
      {
         throw new IllegalArgumentException("Invalid hex digit " + ch);
      }

      return b;
   }

   /**
    * Convert a string of hex digits into an array of bytes.
    *
    * @param hex string holding two hex digits per byte
    * @return the decoded bytes; empty for an empty string
    * @throws IllegalArgumentException if the string has an odd length or
    *         contains a character which is not a hex digit
    */
   public static final byte[] parseHexString(String hex)
   {
      if (hex.length() % 2 != 0)
      {
         throw new IllegalArgumentException("Invalid hex string");
      }

      byte[] bytes = new byte[hex.length() / 2];
      for (int byteIndex = 0; byteIndex < bytes.length; byteIndex++)
      {
         int stringIndex = byteIndex * 2;
         int high = parseHexDigit(hex.charAt(stringIndex));
         int low = parseHexDigit(hex.charAt(stringIndex + 1));
         bytes[byteIndex] = (byte) ((high << 4) + low);
      }

      return bytes;
   }

   /**
    * Lookup table indexed by character code: the digit's value for valid
    * hex digits, -1 for everything else. It is sized to end at 'f', the
    * highest character code which is a hex digit.
    */
   private static final int[] HEX_MAP = new int['g'];
   static
   {
      Arrays.fill(HEX_MAP, -1);

      for (int digit = 0; digit < 10; digit++)
      {
         HEX_MAP['0' + digit] = digit;
      }

      for (int offset = 0; offset < 6; offset++)
      {
         HEX_MAP['A' + offset] = 10 + offset;
         HEX_MAP['a' + offset] = 10 + offset;
      }
   }
}
104 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/FontCharset.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
/**
 * Lookup from the font character set identifiers which may be encountered
 * in an RTF file to encoding names.
 */
class FontCharset
{
   /**
    * Convert a font character set to an encoding name.
    *
    * @param parameter font character set identifier
    * @return the encoding name, or null if the identifier lies outside the
    *         table or has no encoding associated with it
    */
   public static String getCharset(int parameter)
   {
      boolean inTable = parameter >= 0 && parameter < MAPPING.length;
      return inTable ? MAPPING[parameter] : null;
   }

   private static final String[] MAPPING = createMapping();

   /**
    * Build the table indexed by font character set identifier.
    * Identifiers without a usable encoding are left as null.
    */
   private static String[] createMapping()
   {
      String[] table = new String[256];

      table[0] = "1252"; // ANSI
      table[1] = null; // Default
      table[2] = "1252"; // Symbol - according to the specs this is codepage 42 "Symbol". What's the Java equivalent? 1252 seems to work...

      table[77] = "10000"; // Mac Roman
      table[78] = "10001"; // Mac Shift Jis
      table[79] = "10003"; // Mac Hangul
      table[80] = "10008"; // Mac GB2312
      table[81] = "10002"; // Mac Big5
      table[82] = null; // Mac Johab (old)
      table[83] = "10005"; // Mac Hebrew
      table[84] = "10004"; // Mac Arabic
      table[85] = "10006"; // Mac Greek
      table[86] = "10081"; // Mac Turkish
      table[87] = "10021"; // Mac Thai
      table[88] = "10029"; // Mac East Europe
      table[89] = "10007"; // Mac Russian

      table[128] = "932"; // Shift JIS
      table[129] = "949"; // Hangul
      table[130] = "1361"; // Johab
      table[134] = "936"; // GB2312
      table[136] = "950"; // Big5
      table[161] = "1253"; // Greek
      table[162] = "1254"; // Turkish
      table[163] = "1258"; // Vietnamese
      table[177] = "1255"; // Hebrew
      table[178] = "1256"; // Arabic
      table[179] = null; // Arabic Traditional (old)
      table[180] = null; // Arabic user (old)
      table[181] = null; // Hebrew user (old)
      table[186] = "1257"; // Baltic
      table[204] = "1251"; // Russian
      table[222] = "874"; // Thai
      table[238] = "1250"; // Eastern European
      table[254] = "437"; // PC 437
      table[255] = "850"; // OEM

      return table;
   }
}
77 |
--------------------------------------------------------------------------------
/src/test/java/com/rtfparserkit/converter/text/StreamTextConverterTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.converter.text;
18 |
19 | import static org.junit.Assert.assertEquals;
20 |
21 | import java.io.File;
22 | import java.io.FileInputStream;
23 | import java.io.FileOutputStream;
24 | import java.io.IOException;
25 | import java.io.InputStream;
26 | import java.io.OutputStream;
27 |
28 | import org.junit.Test;
29 |
30 | import com.rtfparserkit.parser.RtfStreamSource;
31 | import com.rtfparserkit.utils.TestUtilities;
32 |
33 | public class StreamTextConverterTest
34 | {
35 | @Test
36 | public void testTextConversion() throws IOException
37 | {
38 | StreamTextConverter tc = new StreamTextConverter();
39 | File tempFile = File.createTempFile("testTextConversion", ".rtf");
40 | tempFile.deleteOnExit();
41 |
42 | InputStream is = null;
43 | OutputStream os = null;
44 |
45 | try
46 | {
47 | is = StreamTextConverterTest.class.getResourceAsStream("data/testTextConversion.rtf");
48 | os = new FileOutputStream(tempFile);
49 | tc.convert(new RtfStreamSource(is), os, "UTF-8");
50 | }
51 |
52 | finally
53 | {
54 | if (is != null)
55 | {
56 | try
57 | {
58 | is.close();
59 | }
60 |
61 | catch (Exception ex)
62 | {
63 | // Ignored
64 | }
65 | }
66 |
67 | if (os != null)
68 | {
69 | try
70 | {
71 | os.close();
72 | }
73 |
74 | catch (Exception ex)
75 | {
76 | // Ignored
77 | }
78 | }
79 | }
80 |
81 | InputStream actualStream = null;
82 | InputStream expectedStream = null;
83 |
84 | try
85 | {
86 | actualStream = new FileInputStream(tempFile);
87 | expectedStream = StreamTextConverterTest.class.getResourceAsStream("data/testTextConversion.txt");
88 | String expectedText = TestUtilities.readStreamToString(expectedStream);
89 | String actualText = TestUtilities.readStreamToString(actualStream);
90 | assertEquals(expectedText, actualText);
91 | }
92 |
93 | finally
94 | {
95 | try
96 | {
97 | actualStream.close();
98 | }
99 |
100 | catch (Exception ex)
101 | {
102 | // Ignored
103 | }
104 |
105 | try
106 | {
107 | expectedStream.close();
108 | }
109 |
110 | catch (Exception ex)
111 | {
112 | // Ignored
113 | }
114 | }
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/converter/text/AbstractTextConverter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.converter.text;
18 |
19 | import java.io.IOException;
20 | import java.util.ArrayDeque;
21 | import java.util.Deque;
22 |
23 | import com.rtfparserkit.parser.IRtfParser;
24 | import com.rtfparserkit.parser.IRtfSource;
25 | import com.rtfparserkit.parser.RtfListenerAdaptor;
26 | import com.rtfparserkit.parser.standard.StandardRtfParser;
27 | import com.rtfparserkit.rtf.Command;
28 | import com.rtfparserkit.rtf.CommandType;
29 |
30 | /**
31 | * This class implements the core of a trivial RTF to text converter.
32 | * Subclasses implement the processExtractedText method to determine
33 | * how the extracted text is handled.
34 | */
35 | public abstract class AbstractTextConverter extends RtfListenerAdaptor
36 | {
37 | public void convert(IRtfSource source) throws IOException
38 | {
39 | IRtfParser parser = new StandardRtfParser();
40 | currentDestination = Command.rtf;
41 | parser.parse(source, this);
42 | }
43 |
44 | /**
45 | * This method is called to process the test we've extracted from the RTF file.
46 | */
47 | public abstract void processExtractedText(String text);
48 |
49 | @Override
50 | public void processGroupStart()
51 | {
52 | destinationStack.push(currentDestination);
53 | }
54 |
55 | @Override
56 | public void processGroupEnd()
57 | {
58 | currentDestination = destinationStack.pop();
59 | }
60 |
61 | @Override
62 | public void processString(String string)
63 | {
64 | switch (currentDestination)
65 | {
66 | case rtf:
67 | case pntext:
68 | case fldrslt:
69 | {
70 | processExtractedText(string);
71 | break;
72 | }
73 |
74 | default:
75 | {
76 | // Do nothing
77 | break;
78 | }
79 | }
80 |
81 | }
82 |
83 | @Override
84 | public void processCommand(Command command, int parameter, boolean hasParameter, boolean optional)
85 | {
86 | if (command.getCommandType() == CommandType.Destination)
87 | {
88 | currentDestination = command;
89 | }
90 |
91 | switch (command)
92 | {
93 | case par:
94 | case line:
95 | case row:
96 | {
97 | processExtractedText("\n");
98 | break;
99 | }
100 |
101 | case tab:
102 | case cell:
103 | {
104 | processExtractedText("\t");
105 | break;
106 | }
107 |
108 | default:
109 | {
110 | // Do nothing
111 | break;
112 | }
113 | }
114 | }
115 |
116 | private Command currentDestination = Command.rtf;
117 | private final Deque destinationStack = new ArrayDeque();
118 | }
119 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | RTF Parser Kit
2 | ==============
3 |
4 | I have often been frustrated by the lack of comprehensive support for working with RTF in Java, and the need to use RTF parsers which are incomplete and form part of larger projects whose libraries I don't want to import just to use the RTF parser. The RTF Parser Kit project is an attempt to address these points.
5 |
6 | The idea is to provide a "kit" of components which can either be used "as-is", for example to extract plain text or HTML from an RTF file, or can be used as a component in a larger application which requires the capability to parse RTF documents.
7 |
8 | What's currently included?
9 | --------------------------
10 | * Raw RTF Parser - parses RTF, sends events representing content to a listener. Performs minimal processing - you get the RTF commands and data exactly as they appear in the file.
11 | * Standard RTF Parser - parses RTF, sends events representing content to a listener. Handles character encoding, Unicode and so on, so you don't have to. This is probably the parser you want to use.
12 | * Text Converter - demonstrates very simple text extraction from an RTF file
13 | * RTF Dump - another demonstration, this time writing the RTF file contents as XML
14 |
15 | Getting Started
16 | ===============
17 |
18 | To install the library, you can either download the latest JAR directly from the GitHub releases page,
19 | or you can add RTF Parser Kit as a dependency using Maven:
20 |
21 | ```xml
22 | <dependency>
23 |   <groupId>com.github.joniles</groupId>
24 |   <artifactId>rtfparserkit</artifactId>
25 |   <version>1.16.0</version>
26 | </dependency>
27 | ```
28 |
29 | Once you have the library, you have a choice of two parsers to work with, the standard parser and the raw parser. The raw parser carries out minimal processing on the RTF, the standard parser handles character encodings, and translates commands which represent special characters into their Unicode equivalents. Most people will want to use the standard parser.
30 |
31 | The parser is invoked like this:
32 | ```java
33 | InputStream is = new FileInputStream("/path/to/my/file.rtf");
34 | IRtfSource source = new RtfStreamSource(is);
35 | IRtfParser parser = new StandardRtfParser();
36 | MyRtfListener listener = new MyRtfListener();
37 | parser.parse(source, listener);
38 | ```
39 | You provide input to the parser via a class that implements the `IRtfSource` interface. Two implementations are provided for you, `RtfStreamSource`, for reading RTF from a stream, and `RtfStringSource` for reading RTF from a string.
40 |
41 | The other thing you need to provide the parser with is a listener class. The listener class implements the `IRtfListener` listener interface. The interface consists of a set of methods which are called by the parser to inform you of when it encounters different parts of the document structure. The set of methods, along with some comments describing their purpose, can be seen [here](https://github.com/joniles/rtfparserkit/blob/master/RTF%20Parser%20Kit/src/com/rtfparserkit/parser/IRtfListener.java).
42 |
43 | You don't need to implement all of the `IRtfListener` interface yourself, if you wish you can subclass `RtfListenerAdaptor` which provides empty methods for all of the `IRtfListener` methods. You can then just override the methods you are interested in.
44 |
45 | An example text extractor is provided, you can invoke it like this:
46 | ```java
47 | new StreamTextConverter().convert(new RtfStreamSource(inputStream), outputStream, "UTF-8");
48 | ```
49 | This code reads an RTF file from the `inputStream` and writes the resulting text to the `outputStream` in the encoding specified by the last argument.
50 |
51 | A second example text extractor is also provided, this one extracts text from the RTF file into a string:
52 | ```java
53 | StringTextConverter converter = new StringTextConverter();
54 | converter.convert(new RtfStreamSource(inputStream));
55 | String extractedText = converter.getText();
56 | ```
57 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/test874Encoding.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\adeflang1054\ansi\ansicpg874\uc1\adeff22\deff0\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang1033\deflangfe1033{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
2 | {\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}{\f22\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Angsana New;}{\f36\fnil\fcharset222\fprq0{\*\panose 00000000000000000000}SymbolMT;}
3 | {\f149\froman\fcharset238\fprq2 Times New Roman CE;}{\f150\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f152\froman\fcharset161\fprq2 Times New Roman Greek;}{\f153\froman\fcharset162\fprq2 Times New Roman Tur;}
4 | {\f154\froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f155\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f156\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f157\froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
5 | {\f159\fswiss\fcharset238\fprq2 Arial CE;}{\f160\fswiss\fcharset204\fprq2 Arial Cyr;}{\f162\fswiss\fcharset161\fprq2 Arial Greek;}{\f163\fswiss\fcharset162\fprq2 Arial Tur;}{\f164\fswiss\fcharset177\fprq2 Arial (Hebrew);}
6 | {\f165\fswiss\fcharset178\fprq2 Arial (Arabic);}{\f166\fswiss\fcharset186\fprq2 Arial Baltic;}{\f167\fswiss\fcharset163\fprq2 Arial (Vietnamese);}{\f378\froman\fcharset222\fprq2 Angsana New (Thai);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;
7 | \red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;
8 | \red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fcs1 \af22\afs24\alang1054 \fcs0 \fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033
9 | \snext0 Normal;}{\*\cs10 \additive \ssemihidden Default Paragraph Font;}{\*
10 | \ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv
11 | \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fcs1 \af0\afs20 \fcs0 \fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024 \snext11 \ssemihidden Normal Table;}}{\*\latentstyles\lsdstimax156\lsdlockeddef0}
12 | {\*\rsidtbl \rsid3489504\rsid8663529\rsid13765641}{\*\generator Microsoft Word 11.0.5604;}{\info{\title 3}{\author Carestream Health Inc.}{\operator SAMAK}{\creatim\yr2009\mo9\dy9\hr14\min28}{\revtim\yr2009\mo9\dy9\hr14\min28}{\version2}{\edmins1}
13 | {\nofpages2}{\nofwords643}{\nofchars3669}{\*\company }{\nofcharsws4304}{\vern24689}}\widowctrl\ftnbj\aenddoc\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\hyphcaps0\horzdoc\dghspace120\dgvspace120\dghorigin1701\dgvorigin1984\dghshow0\dgvshow3
14 | \jcompress\viewkind4\viewscale100\nolnhtadjtbl\ApplyBrkRules\rsidroot8663529 \fet0\sectd \linex0\sectdefaultcl\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl3
15 | \pndec\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}
16 | {\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}\pard\plain
17 | \ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 \fcs1 \af22\afs24\alang1054 \fcs0 \fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\fcs1 \ab\af1\afs28 \fcs0 \b\f1\fs28\cf1\insrsid3489504 1. Test Heading
18 | \par }{\fcs1 \af1\afs20 \fcs0 \f1\fs20\cf1\insrsid3489504 \bullet }{\fcs0 \afs20 \fcs1 \f36\fs20\cf1\lang1054\insrsid3489504 }{\fcs1 \af1\afs20 \fcs0 \f1\fs20\cf1\insrsid3489504 Some test text
19 | \par \bullet }{\fcs0 \afs20 \fcs1 \f36\fs20\cf1\lang1054\insrsid3489504 }{\fcs1 \af1\afs20 \fcs0 \f1\fs20\cf1\insrsid3489504 Some more test text
20 | }}
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/DefaultEventHandler.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import java.util.ArrayDeque;
20 | import java.util.Deque;
21 |
22 | import com.rtfparserkit.parser.IRtfListener;
23 |
24 | /**
25 | * Default parser event handler. Passes events to the listener. In this implementation
26 | * the events are queued to allow later events to modify earlier events before they are
27 | * passed to the listener. For example, we coalesce consecutive string events together.
28 | */
29 | class DefaultEventHandler implements IParserEventHandler
30 | {
31 | /**
32 | * Constructor.
33 | */
34 | public DefaultEventHandler(IRtfListener listener)
35 | {
36 | this.listener = listener;
37 | }
38 |
39 | /**
40 | * If we've reached the end of the document, flush all queued events to
41 | * the listener and pass on the document end event.
42 | * If we have received consecutive string events, coalesce them into
43 | * a single event in the buffer.
44 | * If the buffer has reached its maximum size, remove the event from the
45 | * front of the buffer and pass this to the listener.
46 | */
47 | @Override
48 | public void handleEvent(IParserEvent event)
49 | {
50 | if (event.getType() == ParserEventType.DOCUMENT_END_EVENT)
51 | {
52 | flushEvents();
53 | event.fire(listener);
54 | }
55 | else
56 | {
57 | IParserEvent lastEvent = events.peekLast();
58 | if (lastEvent != null && lastEvent.getType() == ParserEventType.STRING_EVENT && event.getType() == ParserEventType.STRING_EVENT)
59 | {
60 | event = mergeStringEvents((StringEvent) event);
61 | }
62 |
63 | events.add(event);
64 |
65 | if (events.size() > MAX_EVENTS)
66 | {
67 | events.removeFirst().fire(listener);
68 | }
69 | }
70 | }
71 |
72 | /**
73 | * It's always valid for this handler to continue processing events,
74 | * so we always return false.
75 | */
76 | @Override
77 | public boolean isComplete()
78 | {
79 | return false;
80 | }
81 |
82 | /**
83 | * Allows the caller to see the event at the end of the buffer.
84 | */
85 | @Override
86 | public IParserEvent getLastEvent()
87 | {
88 | return events.getLast();
89 | }
90 |
91 | /**
92 | * Allows the caller to remove the last event from the buffer.
93 | */
94 | @Override
95 | public void removeLastEvent()
96 | {
97 | events.removeLast();
98 | }
99 |
100 | /**
101 | * Removes the string event from the end of the buffer, merges it with the string
102 | * event we've just received, and adds the new event to the end of the buffer.
103 | */
104 | private IParserEvent mergeStringEvents(StringEvent event)
105 | {
106 | StringEvent lastEvent = (StringEvent) events.removeLast();
107 | StringEvent newEvent = new StringEvent(lastEvent.getString() + event.getString());
108 | return newEvent;
109 | }
110 |
111 | /**
112 | * Passes any remaining events in the buffer to the listener and clears the event buffer,
113 | */
114 | private void flushEvents()
115 | {
116 | for (IParserEvent event : events)
117 | {
118 | event.fire(listener);
119 | }
120 | events.clear();
121 | }
122 |
123 | private static final int MAX_EVENTS = 5;
124 |
125 | private final IRtfListener listener;
126 | private final Deque events = new ArrayDeque();
127 | }
128 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
4 | 4.0.0
5 | com.github.joniles
6 | rtfparserkit
7 | 1.16.0
8 | jar
9 |
10 | RTF Parser Kit
11 | Modular RTF parser
12 | https://github.com/joniles/rtfparserkit
13 | 2013
14 |
15 |
16 | https://github.com/joniles/rtfparserkit/issues
17 | GitHub Issues
18 |
19 |
20 |
21 | https://github.com/joniles/rtfparserkit
22 | scm:git:git://github.com/joniles/rtfparserkit.git
23 | scm:git:git@github.com:joniles/rtfparserkit.git
24 |
25 |
26 |
27 |
28 | joniles
29 | Jon Iles
30 | jon.iles@bcs.org.uk
31 | Packwood Software
32 |
33 |
34 |
35 |
36 |
37 | Apache License, Version 2.0
38 | http://www.apache.org/licenses/LICENSE-2.0.txt
39 | repo
40 | A business-friendly OSS license
41 |
42 |
43 |
44 |
45 |
46 | junit
47 | junit
48 | 4.13.1
49 | test
50 |
51 |
52 |
53 |
54 |
55 |
56 | ossrh
57 | https://oss.sonatype.org/content/repositories/snapshots
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 | org.apache.maven.plugins
67 | maven-compiler-plugin
68 | 3.0
69 |
70 | 1.6
71 | 1.6
72 |
73 |
74 |
75 |
76 |
77 | org.sonatype.plugins
78 | nexus-staging-maven-plugin
79 | 1.6.3
80 | true
81 |
82 | ossrh
83 | https://oss.sonatype.org/
84 | true
85 |
86 |
87 |
88 |
89 |
90 | org.apache.maven.plugins
91 | maven-source-plugin
92 | 2.2.1
93 |
94 |
95 | attach-sources
96 |
97 | jar-no-fork
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 | org.apache.maven.plugins
106 | maven-javadoc-plugin
107 | 2.9.1
108 |
109 | -Xdoclint:none
110 | true
111 |
112 |
113 |
114 | attach-javadocs
115 |
116 | jar
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 | org.apache.maven.plugins
125 | maven-gpg-plugin
126 | 1.5
127 |
128 |
129 | sign-artifacts
130 | verify
131 |
132 | sign
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/test437Encoding.xml:
--------------------------------------------------------------------------------
1 | Times New RomanTimes;Helvetica-BoldHelvetica;Helvetica;LucidaGrandeLucida Grande;;;The quick brown fox jumped over the lazy dogs.footnote reference;The quick brown fox jumped over the lazy dogs.endnote reference;Some text goes here so you can see what your style will look like.footnote text;Sample text for Foot/End Notes Styleendnote text;Test DocumentTest Title: (Test Subtitle)Test Heading: Test Text.
--------------------------------------------------------------------------------
/src/test/java/com/rtfparserkit/utils/TestUtilities.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.utils;
18 |
19 | import static org.junit.Assert.assertEquals;
20 |
21 | import java.io.File;
22 | import java.io.FileInputStream;
23 | import java.io.FileOutputStream;
24 | import java.io.IOException;
25 | import java.io.InputStream;
26 | import java.io.OutputStream;
27 |
28 | import com.rtfparserkit.parser.IRtfParser;
29 | import com.rtfparserkit.parser.RtfStreamSource;
30 | import com.rtfparserkit.parser.standard.StandardRtfParserTest;
31 |
32 | public class TestUtilities
33 | {
34 |
35 | public static String readStreamToString(InputStream is) throws IOException
36 | {
37 | StringBuilder result = new StringBuilder();
38 |
39 | byte[] buffer = new byte[1024];
40 | while (true)
41 | {
42 | int bytesRead = is.read(buffer);
43 | if (bytesRead == -1)
44 | {
45 | break;
46 | }
47 |
48 | result.append(new String(buffer, 0, bytesRead));
49 | }
50 |
51 | return result.toString();
52 | }
53 |
54 | public static void assertRtfParserDumpMatches(Object parentTest, IRtfParser parser, String filename) throws Exception
55 | {
56 |
57 | File outputFile = File.createTempFile(filename, ".xml");
58 | outputFile.deleteOnExit();
59 |
60 | InputStream is = null;
61 | OutputStream os = null;
62 |
63 | try
64 | {
65 | is = parentTest.getClass().getResourceAsStream("data/" + filename + ".rtf");
66 | os = new FileOutputStream(outputFile);
67 | parser.parse(new RtfStreamSource(is), new RtfDumpListener(os));
68 | }
69 |
70 | finally
71 | {
72 | if (is != null)
73 | {
74 | try
75 | {
76 | is.close();
77 | }
78 |
79 | catch (Exception ex)
80 | {
81 | // Ignored
82 | }
83 | }
84 |
85 | if (os != null)
86 | {
87 | try
88 | {
89 | os.close();
90 | }
91 |
92 | catch (Exception ex)
93 | {
94 | // Ignored
95 | }
96 | }
97 | }
98 |
99 | InputStream actualStream = null;
100 | InputStream expectedStream = null;
101 |
102 | try
103 | {
104 | actualStream = new FileInputStream(outputFile);
105 | expectedStream = parentTest.getClass().getResourceAsStream("data/" + filename + ".xml");
106 | String expectedText = TestUtilities.readStreamToString(expectedStream);
107 | String actualText = TestUtilities.readStreamToString(actualStream);
108 | assertEquals(expectedText, actualText);
109 | }
110 |
111 | finally
112 | {
113 | if (actualStream != null)
114 | {
115 | try
116 | {
117 | actualStream.close();
118 | }
119 |
120 | catch (Exception ex)
121 | {
122 | // Ignored
123 | }
124 | }
125 |
126 | if (expectedStream != null)
127 | {
128 | try
129 | {
130 | expectedStream.close();
131 | }
132 |
133 | catch (Exception ex)
134 | {
135 | // Ignored
136 | }
137 | }
138 |
139 | }
140 | }
141 |
142 | public static void dump(IRtfParser parser, String filename, String outputFilename) throws Exception
143 | {
144 | File outputFile = new File(outputFilename);
145 |
146 | InputStream is = null;
147 | OutputStream os = null;
148 |
149 | try
150 | {
151 | is = StandardRtfParserTest.class.getResourceAsStream("data/" + filename + ".rtf");
152 | os = new FileOutputStream(outputFile);
153 | parser.parse(new RtfStreamSource(is), new RtfDumpListener(os));
154 | }
155 |
156 | finally
157 | {
158 | if (is != null)
159 | {
160 | try
161 | {
162 | is.close();
163 | }
164 |
165 | finally
166 | {
167 | // Ignored
168 | }
169 | }
170 |
171 | if (os != null)
172 | {
173 | try
174 | {
175 | os.close();
176 | }
177 |
178 | finally
179 | {
180 | // Ignored
181 | }
182 | }
183 | }
184 | }
185 | }
186 |
--------------------------------------------------------------------------------
/src/test/java/com/rtfparserkit/parser/standard/StandardRtfParserTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import org.junit.Test;
20 |
21 | import com.rtfparserkit.utils.TestUtilities;
22 |
23 | public class StandardRtfParserTest
24 | {
25 |
26 | @Test
27 | public void testEncodingParse() throws Exception
28 | {
29 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testEncodingParse");
30 | }
31 |
32 | @Test
33 | public void testDefaultEncodingParse() throws Exception
34 | {
35 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testDefaultEncodingParse");
36 | }
37 |
38 | @Test
39 | public void testStylesParse() throws Exception
40 | {
41 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testStyles");
42 | }
43 |
44 | @Test
45 | public void testUnicode() throws Exception
46 | {
47 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testUnicode");
48 | }
49 |
50 | @Test
51 | public void testNegativeUnicode() throws Exception
52 | {
53 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testNegativeUnicode");
54 | }
55 |
56 | @Test
57 | public void testUpr() throws Exception
58 | {
59 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testUpr");
60 | }
61 |
62 | @Test
63 | public void testHex() throws Exception
64 | {
65 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testHex");
66 | }
67 |
68 | @Test
69 | public void testMultiByteHex() throws Exception
70 | {
71 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testMultiByteHex");
72 | }
73 |
74 | @Test
75 | public void testSpecialChars() throws Exception
76 | {
77 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testSpecialChars");
78 | }
79 |
80 | @Test
81 | public void testGitHubIssue6() throws Exception
82 | {
83 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testGitHubIssue6");
84 | }
85 |
86 | @Test
87 | public void testTurkishEncoding() throws Exception
88 | {
89 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testTurkishEncoding");
90 | }
91 |
92 | @Test
93 | public void testGreekEncoding() throws Exception
94 | {
95 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testGreekEncoding");
96 | }
97 |
98 | @Test
99 | public void test437Encoding() throws Exception
100 | {
101 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "test437Encoding");
102 | }
103 |
104 | @Test
105 | public void test874Encoding() throws Exception
106 | {
107 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "test874Encoding");
108 | }
109 |
110 | @Test
111 | public void test950Encoding() throws Exception
112 | {
113 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "test950Encoding");
114 | }
115 |
116 | @Test
117 | public void test10001Encoding() throws Exception
118 | {
119 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "test10001Encoding");
120 | }
121 |
122 | @Test
123 | public void test10007Encoding() throws Exception
124 | {
125 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "test10007Encoding");
126 | }
127 |
128 | @Test
129 | public void testKoreanEncoding() throws Exception
130 | {
131 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testKoreanEncoding");
132 | }
133 |
134 | @Test
135 | public void testJapaneseJisEncoding() throws Exception
136 | {
137 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testJapaneseJisEncoding");
138 | }
139 |
140 | @Test
141 | public void testJapaneseUtf8Encoding() throws Exception
142 | {
143 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testJapaneseUtf8Encoding");
144 | }
145 |
146 | @Test
147 | public void testJapaneseJisEncodingTwoFonts() throws Exception
148 | {
149 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testJapaneseJisEncodingTwoFonts");
150 | }
151 |
152 | @Test
153 | public void testNecCharacters() throws Exception
154 | {
155 | TestUtilities.assertRtfParserDumpMatches(this, new StandardRtfParser(), "testNecCharacters");
156 | }
157 | }
158 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/utils/RtfDumpListener.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.utils;
18 |
19 | import java.io.OutputStream;
20 |
21 | import javax.xml.stream.XMLOutputFactory;
22 | import javax.xml.stream.XMLStreamException;
23 | import javax.xml.stream.XMLStreamWriter;
24 |
25 | import com.rtfparserkit.parser.IRtfListener;
26 | import com.rtfparserkit.rtf.Command;
27 |
28 | /**
29 | * Trivial class used to convert events generated by an RTF parser into an XML document.
30 | * The primary purpose of this code is to debug the parser output, and provide a
31 | * convenient method for comparing expected and actual parser behaviour in test cases.
32 | */
33 | public class RtfDumpListener implements IRtfListener
34 | {
35 | /**
36 | * Constructor.
37 | */
38 | public RtfDumpListener(OutputStream stream)
39 | throws XMLStreamException
40 | {
41 | writer = XMLOutputFactory.newInstance().createXMLStreamWriter(stream, "UTF-8");
42 | }
43 |
44 | /**
45 | * Create the document header.
46 | */
47 | @Override
48 | public void processDocumentStart()
49 | {
50 | try
51 | {
52 | writer.writeStartDocument("UTF-8", "1.0");
53 | writer.writeStartElement("rtf");
54 | }
55 | catch (XMLStreamException ex)
56 | {
57 | throw new RuntimeException(ex);
58 | }
59 | }
60 |
61 | /**
62 | * Create the document trailer.
63 | */
64 | @Override
65 | public void processDocumentEnd()
66 | {
67 | try
68 | {
69 | writer.writeEndElement();
70 | writer.writeEndDocument();
71 | }
72 | catch (XMLStreamException ex)
73 | {
74 | throw new RuntimeException(ex);
75 | }
76 | }
77 |
78 | /**
79 | * Write character bytes - note that we cheat, we just convert them
80 | * directly to a string for output with no regard to the encoding.
81 | */
82 | @Override
83 | public void processCharacterBytes(byte[] data)
84 | {
85 | try
86 | {
87 | if (data.length != 0)
88 | {
89 | writer.writeStartElement("chars");
90 | writer.writeCharacters(new String(data));
91 | writer.writeEndElement();
92 | }
93 | }
94 |
95 | catch (XMLStreamException ex)
96 | {
97 | throw new RuntimeException(ex);
98 | }
99 | }
100 |
101 | /**
102 | * Write binary data as hex.
103 | */
104 | @Override
105 | public void processBinaryBytes(byte[] data)
106 | {
107 | try
108 | {
109 | writer.writeStartElement("bytes");
110 | for (byte b : data)
111 | {
112 | writer.writeCharacters(Integer.toHexString(b));
113 | }
114 | writer.writeEndElement();
115 | }
116 |
117 | catch (XMLStreamException ex)
118 | {
119 | throw new RuntimeException(ex);
120 | }
121 | }
122 |
123 | /**
124 | * Write a group start tag.
125 | */
126 | @Override
127 | public void processGroupStart()
128 | {
129 | try
130 | {
131 | writer.writeStartElement("group");
132 | }
133 |
134 | catch (XMLStreamException ex)
135 | {
136 | throw new RuntimeException(ex);
137 | }
138 | }
139 |
140 | /**
141 | * Write a group end tag.
142 | */
143 | @Override
144 | public void processGroupEnd()
145 | {
146 | try
147 | {
148 | writer.writeEndElement();
149 | }
150 |
151 | catch (XMLStreamException ex)
152 | {
153 | throw new RuntimeException(ex);
154 | }
155 | }
156 |
157 | /**
158 | * Write a command tag.
159 | */
160 | @Override
161 | public void processCommand(Command command, int parameter, boolean hasParameter, boolean optional)
162 | {
163 | try
164 | {
165 | writer.writeEmptyElement("command");
166 | writer.writeAttribute("name", command.getCommandName());
167 |
168 | if (hasParameter)
169 | {
170 | writer.writeAttribute("parameter", Integer.toString(parameter));
171 | }
172 |
173 | if (optional)
174 | {
175 | writer.writeAttribute("optional", "true");
176 | }
177 | }
178 | catch (XMLStreamException ex)
179 | {
180 | throw new RuntimeException(ex);
181 | }
182 | }
183 |
184 | /**
185 | * Write string data.
186 | */
187 | @Override
188 | public void processString(String string)
189 | {
190 | try
191 | {
192 | writer.writeStartElement("chars");
193 | writer.writeCharacters(string);
194 | writer.writeEndElement();
195 | }
196 | catch (XMLStreamException ex)
197 | {
198 | throw new RuntimeException(ex);
199 | }
200 | }
201 |
202 | private final XMLStreamWriter writer;
203 | }
204 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/UprHandler.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import java.util.ArrayList;
20 | import java.util.List;
21 |
22 | import com.rtfparserkit.rtf.Command;
23 |
24 | /**
25 | * The upr command is used to wrap two different versions of the same set of
26 | * formatting commands. The first set of formatting commands uses ANSI encoding,
27 | * the second set uses Unicode. The upr command is expected to appear
28 | * in its own group, so this handler can be used to consume all of the RTF events
29 | * received up to the end of the group It can then pass the Unicode version of
30 | * the command it wraps to the listener, discarding the ANSI version.
31 | */
32 | public class UprHandler implements IParserEventHandler
33 | {
34 | /**
35 | * Constructor.
36 | */
37 | public UprHandler(IParserEventHandler handler)
38 | {
39 | this.handler = handler;
40 | }
41 |
42 | /**
43 | * Buffers events until the end of the group containing the upr command is reached.
44 | * Once the end of the group is reached, the buffered events representing the
45 | * Unicode content is sent to the listener.
46 | */
47 | @Override
48 | public void handleEvent(IParserEvent event)
49 | {
50 | events.add(event);
51 | switch (event.getType())
52 | {
53 | case GROUP_START_EVENT:
54 | {
55 | ++groupCount;
56 | break;
57 | }
58 |
59 | case GROUP_END_EVENT:
60 | {
61 | --groupCount;
62 | break;
63 | }
64 |
65 | default:
66 | break;
67 | }
68 |
69 | if (groupCount == 0)
70 | {
71 | processCommands();
72 | }
73 | }
74 |
75 | /**
76 | * Retrieve the last event seen by the handler.
77 | */
78 | @Override
79 | public IParserEvent getLastEvent()
80 | {
81 | return events.get(events.size() - 1);
82 | }
83 |
84 | /**
85 | * Assumes the handler is buffering events, and removes the last event from this buffer.
86 | */
87 | @Override
88 | public void removeLastEvent()
89 | {
90 | events.remove(events.size() - 1);
91 | }
92 |
93 | /**
94 | * Returns true once the end of the group containing the upr command as been reached.
95 | */
96 | @Override
97 | public boolean isComplete()
98 | {
99 | return complete;
100 | }
101 |
102 | /**
103 | * Extracts the Unicode version of the commands wrapped by the upr
104 | * command and passes them to the listener.
105 | */
106 | private void processCommands()
107 | {
108 | int index = 0;
109 | while (true)
110 | {
111 | if (index == events.size())
112 | {
113 | throw new RuntimeException("UPR command: structure not recognised");
114 | }
115 | IParserEvent event = events.get(index);
116 | if (event.getType() == ParserEventType.COMMAND_EVENT)
117 | {
118 | CommandEvent command = (CommandEvent) event;
119 | if (command.getCommand() == Command.ud)
120 | {
121 | break;
122 | }
123 | }
124 | ++index;
125 | }
126 |
127 | if (index == events.size())
128 | {
129 | throw new RuntimeException("UPR command: structure not recognised: unable to locate UD command");
130 | }
131 |
132 | ++index;
133 | if (events.get(index).getType() != ParserEventType.GROUP_START_EVENT)
134 | {
135 | throw new RuntimeException("UPR command: expecting group start, found: " + events.get(index).getType());
136 | }
137 |
138 | ++index;
139 | int endIndex = index;
140 | int groupCount = 1;
141 | while (true)
142 | {
143 | if (endIndex == events.size())
144 | {
145 | break;
146 | }
147 |
148 | IParserEvent event = events.get(endIndex);
149 | switch (event.getType())
150 | {
151 | case GROUP_START_EVENT:
152 | {
153 | ++groupCount;
154 | break;
155 | }
156 |
157 | case GROUP_END_EVENT:
158 | {
159 | --groupCount;
160 | break;
161 | }
162 |
163 | default:
164 | break;
165 | }
166 |
167 | if (groupCount == 0)
168 | {
169 | break;
170 | }
171 | ++endIndex;
172 | }
173 |
174 | if (index == events.size())
175 | {
176 | throw new RuntimeException("UPR command: structure not recognised: unable to locate UD group end");
177 | }
178 |
179 | while (index <= endIndex)
180 | {
181 | handler.handleEvent(events.get(index));
182 | ++index;
183 | }
184 |
185 | complete = true;
186 | }
187 |
188 | private final IParserEventHandler handler;
189 | private int groupCount = 1;
190 | private boolean complete;
191 | private final List events = new ArrayList();
192 | }
193 |
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testGitHubIssue6.xml:
--------------------------------------------------------------------------------
1 | Times New Roman;Arial;Courier;;;Normal;heading 3;heading 2;heading 1;iText 2.1.7 by 1T3XTPAGE . INNEN. KÜCHE - TAGEin Absatz mit Line-Separator:Der geht hier auf einer neuen Zeile weiter.INNEN. KÜCHE - TAGHier ist die zweite Szene.
--------------------------------------------------------------------------------
/src/test/resources/com/rtfparserkit/parser/standard/data/testUpr.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\adeflang1025\ansi\ansicpg10000\uc1\adeff31507\deff0\stshfdbch31506\stshfloch31506\stshfhich31506\stshfbi31507\deflang1033\deflangfe1033\themelang1033\themelangfe0\themelangcs0{\upr{\fonttbl{\f0\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f4\fbidi \fnil\fcharset0\fprq2{\*\panose 02000500000000000000}Times;}
{\f23\fbidi \fnil\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria;}{\f24\fbidi \fnil\fcharset0\fprq2{\*\panose 02000503060000020004}Optima;}{\f25\fbidi \fnil\fcharset0\fprq2 Lucida Grande;}
{\flomajor\f31500\fbidi \fnil\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}{\fdbmajor\f31501\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
{\fhimajor\f31502\fbidi \fnil\fcharset0\fprq2{\*\panose 020f0502020204030204}Calibri;}{\fbimajor\f31503\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
{\flominor\f31504\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fdbminor\f31505\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
{\fhiminor\f31506\fbidi \fnil\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria;}{\fbiminor\f31507\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
}{\*\ud{\fonttbl{\f0\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f4\fbidi \fnil\fcharset0\fprq2{\*\panose 02000500000000000000}Times;}{\f23\fbidi \fnil\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria;}
{\f24\fbidi \fnil\fcharset0\fprq2{\*\panose 02000503060000020004}Optima;}{\f25\fbidi \fnil\fcharset0\fprq2 Lucida Grande;}{\flomajor\f31500\fbidi \fnil\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
{\fdbmajor\f31501\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhimajor\f31502\fbidi \fnil\fcharset0\fprq2{\*\panose 020f0502020204030204}Calibri;}
{\fbimajor\f31503\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\flominor\f31504\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
{\fdbminor\f31505\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhiminor\f31506\fbidi \fnil\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria;}
{\fbiminor\f31507\fbidi \fnil\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}}}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;
\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;\red240\green145\blue0;}
{\*\defchp \f31506\fs24 }{\*\defpap \ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 }\noqfpromote {\stylesheet{\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1
\af31507\afs24\alang1033 \ltrch\fcs0 \f31506\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \snext0 Normal;}{\s1\ql \li0\ri0\widctlpar\tqr\tx9026\wrapdefault\aspalpha\aspnum\faauto\outlinelevel0\adjustright\rin0\lin0\itap0 \rtlch\fcs1
\af31507\afs24\alang1033 \ltrch\fcs0 \b\f24\fs32\cf17\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \sbasedon18 \snext0 \sautoupd \slink17 heading 1;}{\s2\ql \li0\ri0\sb200\sl276\slmult1
\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel1\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \ab\af31503\afs26\alang1033 \ltrch\fcs0 \b\fs26\cf17\lang1033\langfe1033\loch\f24\hich\af24\dbch\af31501\cgrid\langnp1033\langfenp1033
\sbasedon0 \snext0 \sautoupd \slink16 heading 2;}{\s3\ql \li0\ri0\sb200\sl276\slmult1\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel2\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \ab\af31503\afs24\alang1033 \ltrch\fcs0
\b\fs24\cf17\lang1033\langfe1033\loch\f24\hich\af24\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \sautoupd \slink15 heading 3;}{\*\cs10 \additive \ssemihidden Default Paragraph Font;}{\*
\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tblind0\tblindtype3\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv
\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31507\afs24\alang1033 \ltrch\fcs0 \f31506\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \snext11 \ssemihidden Normal Table;}{\*\cs15 \additive
\rtlch\fcs1 \ab\af31503 \ltrch\fcs0 \b\cf17\loch\f24\hich\af24\dbch\af31501 \sbasedon10 \slink3 \slocked Heading 3 Char;}{\*\cs16 \additive \rtlch\fcs1 \ab\af31503 \ltrch\fcs0 \b\fs26\cf17\loch\f24\hich\af24\dbch\af31501 \sbasedon10 \slink2 \slocked
Heading 2 Char;}{\*\cs17 \additive \rtlch\fcs1 \af0\alang1033 \ltrch\fcs0 \b\f24\fs32\cf17\lang1033\langfe0\langnp1033 \sbasedon10 \slink1 \slocked Heading 1 Char;}{\s18\ql \li0\ri0\widctlpar
\tqc\tx4320\tqr\tx8640\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31507\afs24\alang1033 \ltrch\fcs0 \f31506\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext18 \slink19 \ssemihidden header;}{\*
\cs19 \additive \rtlch\fcs1 \af0 \ltrch\fcs0 \sbasedon10 \slink18 \slocked \ssemihidden Header Char;}{\s20\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\lisb1\lisa1 \rtlch\fcs1 \af0\afs20\alang1033 \ltrch\fcs0
\f4\fs20\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext20 \styrsid6106910 Normal (Web);}{\*\cs21 \additive \rtlch\fcs1 \af0 \ltrch\fcs0 \sbasedon10 \styrsid6106910 apple-converted-space;}}{\*\pgptbl {\pgp\ipgp0\itap0\li0\ri0\sb0\sa0}}
{\*\rsidtbl \rsid6106910}{\mmathPr\mmathFont34\mbrkBin0\mbrkBinSub0\msmallFrac0\mdispDef0\mlMargin0\mrMargin0\mwrapRight0\mintLim0\mnaryLim0}{\info{\author Test}{\operator Test}{\creatim\yr2010\mo6\dy8\hr14\min55}
{\revtim\yr2010\mo6\dy8\hr15\min8}{\version1}{\edmins10}{\nofpages1}{\nofwords0}{\nofchars0}{\*\company TEST}{\nofcharsws0}{\vern33033}{\*\saveprevpict}}{\*\xmlnstbl {\xmlns1 http://schemas.microsoft.com/office/word/2003/wordml}}
\paperw11900\paperh16840\margl1800\margr1800\margt1440\margb1440\gutter0\ltrsect
\ftnbj\aenddoc\trackmoves0\trackformatting1\donotembedsysfont0\relyonvml0\donotembedlingdata0\grfdocevents0\validatexml1\showplaceholdtext0\ignoremixedcontent0\saveinvalidxml0\showxmlerrors1\noxlattoyen
\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace360\dgvspace360\dghorigin1800\dgvorigin1440\dghshow0\dgvshow0
\jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nobrkwrptbl\snaptogridincell\allowfieldendsel\wrppunct
\asianbrkrule\rsidroot6106910\newtblstyruls\nogrowautofit\usenormstyforlist\noindnmbrts\felnbrelev\nocxsptable\indrlsweleven\afelev\utinl\hwelev\notvatxbx \nouicompat \fet0{\*\wgrffmtfilter 013f}\nofeaturethrottle1\ilfomacatclnup0\stylesortmethod0\ltrpar
\sectd \ltrsect\linex0\headery708\footery708\colsx708\endnhere\sectdefaultcl\sectrsid6106910\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl3
\pndec\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}
{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}\pard\plain \ltrpar
\s20\ql \li280\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin280\itap0\pararsid6106910 \rtlch\fcs1 \af0\afs20\alang1033 \ltrch\fcs0 \f4\fs20\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \af0 \ltrch\fcs0
\b\f25\fs34\insrsid6106910 TEST}{\rtlch\fcs1 \af0 \ltrch\fcs0 \f25\fs34\insrsid6106910
\par }{\rtlch\fcs1 \af0 \ltrch\fcs0 \b\f25\fs34\insrsid6106910 End User License Agreement}{\rtlch\fcs1 \af0 \ltrch\fcs0 \f25\fs34\insrsid6106910
\par }{\rtlch\fcs1 \af0 \ltrch\fcs0 \f25\fs22\insrsid6106910
\par }\pard \ltrpar\s20\qj \li280\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin280\itap0\pararsid6106910 {\rtlch\fcs1 \af0 \ltrch\fcs0 \b\f25\fs22\insrsid6106910
TEST}{\rtlch\fcs1 \af0 \ltrch\fcs0
\f25\fs22\insrsid6106910
\par }\pard \ltrpar\s20\ql \fi-280\li560\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin560\itap0\pararsid6106910 {\rtlch\fcs1 \af0 \ltrch\fcs0 \f25\fs22\insrsid6106910
\par
\par }}
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/raw/RawRtfParser.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.raw;
18 |
19 | import java.io.EOFException;
20 | import java.io.IOException;
21 |
22 | import com.rtfparserkit.parser.IRtfListener;
23 | import com.rtfparserkit.parser.IRtfParser;
24 | import com.rtfparserkit.parser.IRtfSource;
25 | import com.rtfparserkit.rtf.Command;
26 | import com.rtfparserkit.utils.HexUtils;
27 |
28 | /**
29 | * This class implements a low level RTF parser. It performs the minimum amount
30 | * of processing on the data read from an RTF file, just passing the caller a
31 | * stream of events representing the commands and character bytes. In particular
32 | * the parser doesn't not deal with character encodings or the various
33 | * Unicode related commands which may be present in an RTF file.
34 | *
35 | * This code is based on the approach outlined in the sample C code provided in
36 | * the RTF Specification 1.9.1 (http://www.microsoft.com/en-gb/download/details.aspx?id=10725)
37 | */
38 | public class RawRtfParser implements IRtfParser
39 | {
40 | /**
41 | * Parse RTF data from an input source.
42 | */
43 | @Override
44 | public void parse(IRtfSource source, IRtfListener listener) throws IOException
45 | {
46 | this.source = source;
47 | this.listener = listener;
48 | groupDepth = 0;
49 | parsingHex = false;
50 | buffer = new ByteBuffer();
51 |
52 | listener.processDocumentStart();
53 |
54 | int ch;
55 | parsingHex = false;
56 |
57 | while (true)
58 | {
59 |
60 | ch = source.read();
61 | if (ch == -1)
62 | {
63 | break;
64 | }
65 |
66 | if (groupDepth < 0)
67 | {
68 | throw new IllegalStateException("Group stack underflow");
69 | }
70 |
71 | switch (ch)
72 | {
73 | case '{':
74 | {
75 | handleGroupStart();
76 | break;
77 | }
78 |
79 | case '}':
80 | {
81 | handleGroupEnd();
82 | break;
83 | }
84 |
85 | case '\\':
86 | {
87 | handleCommand();
88 | break;
89 | }
90 |
91 | case '\r':
92 | case '\n':
93 | {
94 | break;
95 | }
96 |
97 | case '\t':
98 | {
99 | handleCharacterData();
100 | listener.processCommand(Command.tab, 0, false, false);
101 | break;
102 | }
103 |
104 | default:
105 | {
106 | handleCharacterByte(ch);
107 | break;
108 | }
109 | }
110 | }
111 |
112 | if (groupDepth < 0)
113 | {
114 | throw new IllegalStateException("Group stack underflow");
115 | }
116 |
117 | if (groupDepth > 0)
118 | {
119 | throw new IllegalStateException("Unmatched brace");
120 | }
121 |
122 | listener.processDocumentEnd();
123 | }
124 |
125 | /**
126 | * Process a single character byte, or hex encoded character byte.
127 | */
128 | private void handleCharacterByte(int ch) throws IOException
129 | {
130 | if (parsingHex)
131 | {
132 | int b = HexUtils.parseHexDigit(ch) << 4;
133 | ch = source.read();
134 | if (ch == -1)
135 | {
136 | throw new IllegalStateException("Unexpected end of file");
137 | }
138 |
139 | // Have encountered malformed RTF where only a single hex digit
140 | // has been supplied. e.g. \'AA\'B\'CC so we hit the next \
141 | // rather than getting a hex digit. Try to handle this specific
142 | // case gracefully by unreading the next character and working with
143 | // the single digit we have.
144 | if (ch == '\\')
145 | {
146 | b = b >> 4;
147 | source.unread(ch);
148 | }
149 | else
150 | {
151 | b += HexUtils.parseHexDigit(ch);
152 | }
153 |
154 | buffer.add(b);
155 | parsingHex = false;
156 | }
157 | else
158 | {
159 | buffer.add(ch);
160 | }
161 | }
162 |
163 | /**
164 | * Read and process an RTF command.
165 | */
166 | private void handleCommand() throws IOException
167 | {
168 | boolean commandHasParameter = false;
169 | boolean parameterIsNegative = false;
170 | int parameterValue = 0;
171 | StringBuilder commandText = new StringBuilder();
172 | StringBuilder parameterText = new StringBuilder();
173 |
174 | int ch = source.read();
175 | if (ch == -1)
176 | {
177 | throw new EOFException();
178 | }
179 |
180 | commandText.append((char) ch);
181 |
182 | if (!Character.isLetter(ch))
183 | {
184 | handleCommand(commandText, 0, commandHasParameter);
185 | return;
186 | }
187 |
188 | while (true)
189 | {
190 | ch = source.read();
191 | if (ch == -1 || !Character.isLetter(ch))
192 | {
193 | break;
194 | }
195 | commandText.append((char) ch);
196 | if (commandText.length() > MAX_COMMAND_LENGTH)
197 | {
198 | break;
199 | }
200 | }
201 |
202 | if (ch == -1)
203 | {
204 | throw new EOFException();
205 | }
206 |
207 | if (commandText.length() > MAX_COMMAND_LENGTH)
208 | {
209 | throw new IllegalArgumentException("Invalid keyword: " + commandText.toString());
210 | }
211 |
212 | if (ch == '-')
213 | {
214 | parameterIsNegative = true;
215 | ch = source.read();
216 | if (ch == -1)
217 | {
218 | throw new EOFException();
219 | }
220 | }
221 | if (Character.isDigit(ch))
222 | {
223 | commandHasParameter = true;
224 | parameterText.append((char) ch);
225 | while (true)
226 | {
227 | ch = source.read();
228 | if (ch == -1 || !Character.isDigit(ch))
229 | {
230 | break;
231 | }
232 | parameterText.append((char) ch);
233 | if (parameterText.length() > MAX_PARAMETER_LENGTH)
234 | {
235 | break;
236 | }
237 | }
238 |
239 | if (parameterText.length() > MAX_PARAMETER_LENGTH)
240 | {
241 | throw new IllegalArgumentException("Invalid parameter: " + parameterText.toString());
242 | }
243 |
244 | parameterValue = Integer.parseInt(parameterText.toString());
245 | if (parameterIsNegative)
246 | {
247 | parameterValue = -parameterValue;
248 | }
249 | }
250 |
251 | if (ch != ' ')
252 | {
253 | source.unread(ch);
254 | }
255 |
256 | handleCommand(commandText, parameterValue, commandHasParameter);
257 | }
258 |
259 | /**
260 | * Determine what to do with the extracted command.
261 | */
262 | private void handleCommand(StringBuilder commandBuffer, int parameter, boolean hasParameter) throws IOException
263 | {
264 | String commandName = commandBuffer.toString();
265 | Command command = Command.getInstance(commandName);
266 |
267 | //
268 | // Note that we silently ignore commands that we don't recognise
269 | //
270 | if (command != null)
271 | {
272 | if (command != Command.hex)
273 | {
274 | handleCharacterData();
275 | }
276 |
277 | switch (command)
278 | {
279 | case bin:
280 | {
281 | handleBinaryData(parameter);
282 | break;
283 | }
284 |
285 | case hex:
286 | {
287 | parsingHex = true;
288 | break;
289 | }
290 |
291 | default:
292 | {
293 | listener.processCommand(command, parameter, hasParameter, false);
294 | break;
295 | }
296 | }
297 | }
298 | }
299 |
300 | /**
301 | * Pass accumulated character data to the listener.
302 | */
303 | private void handleCharacterData()
304 | {
305 | byte[] data = buffer.toArray();
306 | buffer.clear();
307 | listener.processCharacterBytes(data);
308 | }
309 |
310 | /**
311 | * Pass binary data to the listener.
312 | */
313 | private void handleBinaryData(int size) throws IOException
314 | {
315 | byte[] data = new byte[size];
316 | int bytesRead = source.read(data);
317 | if (bytesRead != size)
318 | {
319 | throw new EOFException();
320 | }
321 | listener.processBinaryBytes(data);
322 | }
323 |
324 | /**
325 | * Inform the listener of a group start.
326 | */
327 | private void handleGroupStart()
328 | {
329 | handleCharacterData();
330 | groupDepth++;
331 | listener.processGroupStart();
332 | }
333 |
334 | /**
335 | * Inform the listener of a group end.
336 | */
337 | private void handleGroupEnd()
338 | {
339 | handleCharacterData();
340 | listener.processGroupEnd();
341 | groupDepth--;
342 | }
343 |
344 | private IRtfSource source;
345 | private int groupDepth;
346 | private boolean parsingHex;
347 | private ByteBuffer buffer;
348 | private IRtfListener listener;
349 |
350 | private static final int MAX_PARAMETER_LENGTH = 20;
351 | private static final int MAX_COMMAND_LENGTH = 30;
352 | }
353 |
--------------------------------------------------------------------------------
/licence.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/StandardRtfParser.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import java.io.IOException;
20 | import java.io.UnsupportedEncodingException;
21 | import java.util.ArrayDeque;
22 | import java.util.Deque;
23 | import java.util.HashMap;
24 | import java.util.Map;
25 |
26 | import com.rtfparserkit.parser.IRtfListener;
27 | import com.rtfparserkit.parser.IRtfParser;
28 | import com.rtfparserkit.parser.IRtfSource;
29 | import com.rtfparserkit.parser.raw.RawRtfParser;
30 | import com.rtfparserkit.rtf.Command;
31 | import com.rtfparserkit.rtf.CommandType;
32 |
33 | /**
34 | * This class builds on the RawRtfParser to provide a parser which can
35 | * deal with character encodings and Unicode. All of the character data it reads
36 | * is presented back to the client as Unicode strings to make it as simple as
37 | * possible to deal with.
38 | */
39 | public class StandardRtfParser implements IRtfParser, IRtfListener
40 | {
41 | /**
42 | * Main entry point: parse RTF data from the input stream, and pass events based on
43 | * the RTF content to the listener.
44 | */
45 | @Override
46 | public void parse(IRtfSource source, IRtfListener listener) throws IOException
47 | {
48 | handler = new DefaultEventHandler(listener);
49 | IRtfParser reader = new RawRtfParser();
50 | reader.parse(source, this);
51 | }
52 |
53 | /**
54 | * Handle event from the RawRtfParser.
55 | */
56 | @Override
57 | public void processGroupStart()
58 | {
59 | handleEvent(GROUP_START);
60 | stack.push(state);
61 | state = new ParserState(state);
62 | }
63 |
/**
 * Called by the RawRtfParser when a group is closed. The group end event is
 * handled first, then the parser state saved by the matching group start is
 * taken off the stack and made current again.
 */
@Override
public void processGroupEnd()
{
   handleEvent(GROUP_END);
   // Discards the state of the group being closed.
   state = stack.pop();
}
73 |
74 | /**
75 | * Handle event from the RawRtfParser.
76 | */
77 | @Override
78 | public void processCharacterBytes(byte[] data)
79 | {
80 | try
81 | {
82 | if (data.length != 0)
83 | {
84 | if (skipBytes < data.length)
85 | {
86 | handleEvent(new StringEvent(new String(data, skipBytes, data.length - skipBytes, currentEncoding())));
87 | }
88 | skipBytes = 0;
89 | }
90 | }
91 |
92 | catch (UnsupportedEncodingException ex)
93 | {
94 | throw new RuntimeException(ex);
95 | }
96 | }
97 |
98 | /**
99 | * Determine which encoding to use, one defined by the current font, or the current default encoding.
100 | */
101 | private String currentEncoding()
102 | {
103 | // Assume font 0 if a font has not been set explicitly
104 | if (!state.currentFontExplicitlySet)
105 | {
106 | state.currentFontExplicitlySet = true;
107 | state.currentFontEncoding = m_fontEncodings.get(Integer.valueOf(0));
108 | }
109 | return state.currentFontEncoding == null ? state.currentEncoding : state.currentFontEncoding;
110 | }
111 |
112 | /**
113 | * Handle event from the RawRtfParser.
114 | */
115 | @Override
116 | public void processDocumentStart()
117 | {
118 | handleEvent(DOCUMENT_START);
119 | }
120 |
121 | /**
122 | * Handle event from the RawRtfParser.
123 | */
124 | @Override
125 | public void processDocumentEnd()
126 | {
127 | handleEvent(DOCUMENT_END);
128 | }
129 |
130 | /**
131 | * Handle event from the RawRtfParser.
132 | */
133 | @Override
134 | public void processBinaryBytes(byte[] data)
135 | {
136 | handleEvent(new BinaryBytesEvent(data));
137 | }
138 |
139 | /**
140 | * Handle event from the RawRtfParser.
141 | */
142 | @Override
143 | public void processString(String string)
144 | {
145 | handleEvent(new StringEvent(string));
146 | }
147 |
148 | /**
149 | * Handle event from the RawRtfParser.
150 | */
151 | @Override
152 | public void processCommand(Command command, int parameter, boolean hasParameter, boolean optional)
153 | {
154 | if (command.getCommandType() == CommandType.Encoding)
155 | {
156 | processEncoding(command, hasParameter, parameter);
157 | }
158 | else
159 | {
160 | boolean optionalFlag = false;
161 |
162 | IParserEvent lastEvent = handler.getLastEvent();
163 | if (lastEvent.getType() == ParserEventType.COMMAND_EVENT)
164 | {
165 | if (((CommandEvent) lastEvent).getCommand() == Command.optionalcommand)
166 | {
167 | handler.removeLastEvent();
168 | optionalFlag = true;
169 | }
170 | }
171 |
172 | switch (command)
173 | {
174 | case u:
175 | {
176 | processUnicode(parameter);
177 | break;
178 | }
179 |
180 | case uc:
181 | {
182 | processUnicodeAlternateSkipCount(parameter);
183 | break;
184 | }
185 |
186 | case upr:
187 | {
188 | processUpr(new CommandEvent(command, parameter, hasParameter, optionalFlag));
189 | break;
190 | }
191 |
192 | case emdash:
193 | {
194 | processCharacter('\u2014');
195 | break;
196 | }
197 |
198 | case endash:
199 | {
200 | processCharacter('\u2013');
201 | break;
202 | }
203 |
204 | case emspace:
205 | {
206 | processCharacter('\u2003');
207 | break;
208 | }
209 |
210 | case enspace:
211 | {
212 | processCharacter('\u2002');
213 | break;
214 | }
215 |
216 | case qmspace:
217 | {
218 | processCharacter('\u2005');
219 | break;
220 | }
221 |
222 | case bullet:
223 | {
224 | processCharacter('\u2022');
225 | break;
226 | }
227 |
228 | case lquote:
229 | {
230 | processCharacter('\u2018');
231 | break;
232 | }
233 |
234 | case rquote:
235 | {
236 | processCharacter('\u2019');
237 | break;
238 | }
239 |
240 | case ldblquote:
241 | {
242 | processCharacter('\u201c');
243 | break;
244 | }
245 |
246 | case rdblquote:
247 | {
248 | processCharacter('\u201d');
249 | break;
250 | }
251 |
252 | case backslash:
253 | {
254 | processCharacter('\\');
255 | break;
256 | }
257 |
258 | case opencurly:
259 | {
260 | processCharacter('{');
261 | break;
262 | }
263 |
264 | case closecurly:
265 | {
266 | processCharacter('}');
267 | break;
268 | }
269 |
270 | case f:
271 | {
272 | processFont(parameter);
273 | handleCommand(command, parameter, hasParameter, optionalFlag);
274 | break;
275 | }
276 |
277 | case fcharset:
278 | {
279 | processFontCharset(parameter);
280 | handleCommand(command, parameter, hasParameter, optionalFlag);
281 | break;
282 | }
283 |
284 | case cpg:
285 | {
286 | processFontCodepage(parameter);
287 | handleCommand(command, parameter, hasParameter, optionalFlag);
288 | break;
289 | }
290 |
291 | default:
292 | {
293 | handleCommand(command, parameter, hasParameter, optionalFlag);
294 | break;
295 | }
296 | }
297 | }
298 | }
299 |
300 | /**
301 | * Set the current font and current font encoding in the state.
302 | */
303 | private void processFont(int parameter)
304 | {
305 | state.currentFontExplicitlySet = true;
306 | state.currentFont = parameter;
307 | state.currentFontEncoding = m_fontEncodings.get(Integer.valueOf(parameter));
308 | }
309 |
310 | /**
311 | * Set the charset for the current font.
312 | */
313 | private void processFontCharset(int parameter)
314 | {
315 | setFontEncoding(FontCharset.getCharset(parameter));
316 | }
317 |
318 | private void processFontCodepage(int parameter)
319 | {
320 | setFontEncoding(Integer.toString(parameter));
321 | }
322 |
323 | private void setFontEncoding(String charset)
324 | {
325 | if (charset != null)
326 | {
327 | String encoding = Encoding.LOCALEID_MAPPING.get(charset);
328 | if (encoding != null)
329 | {
330 | m_fontEncodings.put(Integer.valueOf(state.currentFont), encoding);
331 | }
332 | }
333 | }
334 |
335 | /**
336 | * Switch the encoding based on the RTF command received.
337 | */
338 | private void processEncoding(Command command, boolean hasParameter, int parameter)
339 | {
340 | String encoding = null;
341 | switch (command)
342 | {
343 | case ansi:
344 | {
345 | encoding = Encoding.ANSI_ENCODING;
346 | break;
347 | }
348 |
349 | case pc:
350 | {
351 | encoding = Encoding.PC_ENCODING;
352 | break;
353 | }
354 |
355 | case pca:
356 | {
357 | encoding = Encoding.PCA_ENCODING;
358 | break;
359 | }
360 |
361 | case mac:
362 | {
363 | encoding = Encoding.MAC_ENCODING;
364 | break;
365 | }
366 |
367 | case ansicpg:
368 | {
369 | encoding = hasParameter ? Encoding.LOCALEID_MAPPING.get(Integer.toString(unsignedValue(parameter))) : null;
370 | break;
371 | }
372 |
373 | default:
374 | {
375 | encoding = null;
376 | break;
377 | }
378 | }
379 |
380 | if (encoding == null)
381 | {
382 | throw new IllegalArgumentException("Unsupported encoding command " + command.getCommandName() + (hasParameter ? parameter : ""));
383 | }
384 |
385 | state.currentEncoding = encoding;
386 | }
387 |
388 | /**
389 | * Process an RTF command parameter representing a Unicode character.
390 | */
391 | private void processUnicode(int parameter)
392 | {
393 | processCharacter((char) unsignedValue(parameter));
394 | skipBytes = state.unicodeAlternateSkipCount;
395 | }
396 |
397 | /**
398 | * Set the number of bytes to skip after a Unicode character.
399 | */
400 | private void processUnicodeAlternateSkipCount(int parameter)
401 | {
402 | state.unicodeAlternateSkipCount = parameter;
403 | }
404 |
405 | /**
406 | * Process a upr command: consume all of the RTF commands relating to this
407 | * and emit events representing the Unicode content.
408 | * @param command
409 | */
410 | private void processUpr(IParserEvent command)
411 | {
412 | IParserEventHandler uprHandler = new UprHandler(handler);
413 | uprHandler.handleEvent(command);
414 |
415 | handlerStack.push(handler);
416 | handler = uprHandler;
417 | }
418 |
419 | /**
420 | * Process a single character.
421 | */
422 | private void processCharacter(char c)
423 | {
424 | handleEvent(new StringEvent(Character.toString(c)));
425 | }
426 |
427 | /**
428 | * Process an RTF command.
429 | */
430 | private void handleCommand(Command command, int parameter, boolean hasParameter, boolean optional)
431 | {
432 | handleEvent(new CommandEvent(command, parameter, hasParameter, optional));
433 | }
434 |
435 | /**
436 | * Pass an event to the event handler, pop the event handler stack if the current
437 | * event handler has consumed all of the events it can.
438 | */
439 | private void handleEvent(IParserEvent event)
440 | {
441 | handler.handleEvent(event);
442 | if (handler.isComplete())
443 | {
444 | handler = handlerStack.pop();
445 | }
446 | }
447 |
448 | private int unsignedValue(int parameter)
449 | {
450 | if (parameter < 0)
451 | {
452 | parameter += 65536;
453 | }
454 | return parameter;
455 | }
456 |
457 | private IParserEventHandler handler;
458 | private final Deque handlerStack = new ArrayDeque();
459 |
460 | private ParserState state = new ParserState();
461 | private final Deque stack = new ArrayDeque();
462 | private int skipBytes;
463 | private Map m_fontEncodings = new HashMap();
464 |
465 | private static final IParserEvent DOCUMENT_START = new DocumentStartEvent();
466 | private static final IParserEvent DOCUMENT_END = new DocumentEndEvent();
467 | private static final IParserEvent GROUP_START = new GroupStartEvent();
468 | private static final IParserEvent GROUP_END = new GroupEndEvent();
469 | }
470 |
--------------------------------------------------------------------------------
/src/main/java/com/rtfparserkit/parser/standard/Encoding.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Jon Iles
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.rtfparserkit.parser.standard;
18 |
19 | import java.util.HashMap;
20 | import java.util.Map;
21 |
/**
 * Represents character encodings which may be encountered in an RTF file.
 *
 * The class only holds constants: the Java encoding names used for the basic
 * RTF character set commands, and a lookup table from Windows code page or
 * locale identifiers to Java encoding names.
 */
class Encoding
{
   public static final String ANSI_ENCODING = "Cp1252";
   public static final String PC_ENCODING = "Cp437";
   public static final String PCA_ENCODING = "Cp850";
   public static final String MAC_ENCODING = "MacRoman";

   /**
    * Maps an identifier, expressed as a decimal string, to the name of the Java
    * encoding used to decode text. Identifiers with no entry, and the entry for
    * 65000 (UTF-7), yield null from get.
    */
   public static final Map<String, String> LOCALEID_MAPPING = new HashMap<String, String>();
   static
   {
      // Comment lines based on: https://msdn.microsoft.com/en-us/library/windows/desktop/dd317756(v=vs.85).aspx

      // 037 IBM037 IBM EBCDIC US-Canada
      LOCALEID_MAPPING.put("437", "Cp437"); // IBM437 OEM United States
      // 500 IBM500 IBM EBCDIC International
      // 708 ASMO-708 Arabic (ASMO 708)
      // 709 Arabic (ASMO-449+, BCON V4)
      // 710 Arabic - Transparent Arabic
      // 720 DOS-720 Arabic (Transparent ASMO); Arabic (DOS)
      // 737 ibm737 OEM Greek (formerly 437G); Greek (DOS)
      // 775 ibm775 OEM Baltic; Baltic (DOS)
      // 850 ibm850 OEM Multilingual Latin 1; Western European (DOS)
      // 852 ibm852 OEM Latin 2; Central European (DOS)
      // 855 IBM855 OEM Cyrillic (primarily Russian)
      // 857 ibm857 OEM Turkish; Turkish (DOS)
      // 858 IBM00858 OEM Multilingual Latin 1 + Euro symbol
      // 860 IBM860 OEM Portuguese; Portuguese (DOS)
      // 861 ibm861 OEM Icelandic; Icelandic (DOS)
      // 862 DOS-862 OEM Hebrew; Hebrew (DOS)
      // 863 IBM863 OEM French Canadian; French Canadian (DOS)
      // 864 IBM864 OEM Arabic; Arabic (864)
      // 865 IBM865 OEM Nordic; Nordic (DOS)
      // 866 cp866 OEM Russian; Cyrillic (DOS)
      // 869 ibm869 OEM Modern Greek; Greek, Modern (DOS)
      // 870 IBM870 IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
      LOCALEID_MAPPING.put("874", "Cp874"); // windows-874 ANSI/OEM Thai (ISO 8859-11); Thai (Windows)
      // 875 cp875 IBM EBCDIC Greek Modern
      LOCALEID_MAPPING.put("932", "MS932"); // Japanese
      LOCALEID_MAPPING.put("936", "Cp936"); // Simplified Chinese
      LOCALEID_MAPPING.put("949", "Cp949"); // Korean
      LOCALEID_MAPPING.put("950", "Cp950"); // ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
      LOCALEID_MAPPING.put("1025", "Cp1256"); // Arabic (Saudi Arabia)
      LOCALEID_MAPPING.put("1026", "Cp1251"); // Bulgarian
      LOCALEID_MAPPING.put("1028", "Cp950"); // Chinese (Taiwan)
      LOCALEID_MAPPING.put("1029", "Cp1250"); // Czech
      LOCALEID_MAPPING.put("1032", "Cp1253"); // Greek
      LOCALEID_MAPPING.put("1037", "Cp1255"); // Hebrew
      LOCALEID_MAPPING.put("1038", "Cp1250"); // Hungarian
      LOCALEID_MAPPING.put("1041", "SJIS"); // Japanese
      LOCALEID_MAPPING.put("1042", "Cp949"); // Korean
      LOCALEID_MAPPING.put("1045", "Cp1250"); // Polish
      // 1047 IBM01047 IBM EBCDIC Latin 1/Open System
      LOCALEID_MAPPING.put("1048", "Cp1250"); // Romanian
      LOCALEID_MAPPING.put("1049", "Cp1251"); // Russian
      LOCALEID_MAPPING.put("1050", "Cp1250"); // Croatian
      LOCALEID_MAPPING.put("1051", "Cp1250"); // Slovak
      LOCALEID_MAPPING.put("1052", "Cp1250"); // Albanian
      LOCALEID_MAPPING.put("1054", "Cp874"); // Thai
      LOCALEID_MAPPING.put("1055", "Cp1254"); // Turkish
      LOCALEID_MAPPING.put("1056", "Cp1256"); // Urdu
      LOCALEID_MAPPING.put("1058", "Cp1251"); // Ukrainian
      LOCALEID_MAPPING.put("1059", "Cp1251"); // Belarusian
      LOCALEID_MAPPING.put("1060", "Cp1250"); // Slovenian
      LOCALEID_MAPPING.put("1061", "Cp1257"); // Estonian
      LOCALEID_MAPPING.put("1062", "Cp1257"); // Latvian
      LOCALEID_MAPPING.put("1063", "Cp1257"); // Lithuanian
      LOCALEID_MAPPING.put("1065", "Cp1256"); // Farsi
      LOCALEID_MAPPING.put("1066", "Cp1258"); // Vietnamese
      LOCALEID_MAPPING.put("1068", "Cp1254"); // Azeri (Latin)
      LOCALEID_MAPPING.put("1071", "Cp1251"); // FYRO Macedonian
      LOCALEID_MAPPING.put("1087", "Cp1251"); // Kazakh
      LOCALEID_MAPPING.put("1088", "Cp1251"); // Kyrgyz (Cyrillic)
      LOCALEID_MAPPING.put("1091", "Cp1254"); // Uzbek (Latin)
      LOCALEID_MAPPING.put("1092", "Cp1251"); // Tatar
      LOCALEID_MAPPING.put("1104", "Cp1251"); // Mongolian (Cyrillic)
      // 1140 IBM01140 IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
      // 1141 IBM01141 IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
      // 1142 IBM01142 IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
      // 1143 IBM01143 IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
      // 1144 IBM01144 IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
      // 1145 IBM01145 IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
      // 1146 IBM01146 IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
      // 1147 IBM01147 IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
      // 1148 IBM01148 IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
      // 1149 IBM01149 IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
      // 1200 utf-16 Unicode UTF-16, little endian byte order (BMP of ISO 10646)
      // 1201 unicodeFFFE Unicode UTF-16, big endian byte order
      LOCALEID_MAPPING.put("1250", "Cp1250"); // Windows Latin 2 (Central Europe)
      LOCALEID_MAPPING.put("1251", "Cp1251"); // Cyrillic
      LOCALEID_MAPPING.put("1252", "Cp1252"); // Latin
      LOCALEID_MAPPING.put("1253", "Cp1253"); // Greek
      LOCALEID_MAPPING.put("1254", "Cp1254"); // Turkish
      LOCALEID_MAPPING.put("1255", "Cp1255"); // Windows Hebrew
      LOCALEID_MAPPING.put("1256", "Cp1256"); // Arabic (Iraq)
      LOCALEID_MAPPING.put("1257", "Cp1257"); // Baltic
      LOCALEID_MAPPING.put("1258", "Cp1258"); // Vietnamese
      // 1361 Johab Korean (Johab)
      LOCALEID_MAPPING.put("2049", "Cp1256"); // Arabic (Iraq)
      LOCALEID_MAPPING.put("2052", "MS936"); // Chinese (PRC)
      LOCALEID_MAPPING.put("2074", "Cp1250"); // Serbian (Latin)
      LOCALEID_MAPPING.put("2092", "Cp1251"); // Azeri (Cyrillic)
      LOCALEID_MAPPING.put("2115", "Cp1251"); // Uzbek (Cyrillic)
      LOCALEID_MAPPING.put("3073", "Cp1256"); // Arabic (Egypt)
      LOCALEID_MAPPING.put("3076", "Cp950"); // Chinese (Hong Kong S.A.R.)
      LOCALEID_MAPPING.put("3098", "Cp1251"); // Serbian (Cyrillic)
      LOCALEID_MAPPING.put("4097", "Cp1256"); // Arabic (Libya)
      LOCALEID_MAPPING.put("4100", "MS936"); // Chinese (Singapore)
      LOCALEID_MAPPING.put("5121", "Cp1256"); // Arabic (Algeria)
      LOCALEID_MAPPING.put("5124", "Cp950"); // Chinese (Macau S.A.R.)
      LOCALEID_MAPPING.put("6145", "Cp1256"); // Arabic (Morocco)
      LOCALEID_MAPPING.put("7169", "Cp1256"); // Arabic (Tunisia)
      LOCALEID_MAPPING.put("8193", "Cp1256"); // Arabic (Oman)
      LOCALEID_MAPPING.put("9217", "Cp1256"); // Arabic (Yemen)
      LOCALEID_MAPPING.put("10000", "MacRoman"); // Mac Roman
      LOCALEID_MAPPING.put("10001", "Shift_JIS"); // x-mac-japanese Japanese (Mac)
      // 10002 x-mac-chinesetrad MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
      // 10003 x-mac-korean Korean (Mac)
      LOCALEID_MAPPING.put("10004", "x-MacArabic"); // x-mac-arabic Arabic (Mac)
      LOCALEID_MAPPING.put("10005", "x-MacHebrew"); // x-mac-hebrew Hebrew (Mac)
      LOCALEID_MAPPING.put("10006", "x-MacGreek"); // x-mac-greek Greek (Mac)
      LOCALEID_MAPPING.put("10007", "x-MacCyrillic"); // x-mac-cyrillic Cyrillic (Mac)
      // 10008 x-mac-chinesesimp MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
      LOCALEID_MAPPING.put("10010", "x-MacRomania"); // x-mac-romanian Romanian (Mac)
      LOCALEID_MAPPING.put("10017", "x-MacUkraine"); // x-mac-ukrainian Ukrainian (Mac)
      LOCALEID_MAPPING.put("10021", "x-MacThai"); // x-mac-thai Thai (Mac)
      LOCALEID_MAPPING.put("10029", "x-MacCentralEurope"); // x-mac-ce MAC Latin 2; Central European (Mac)
      LOCALEID_MAPPING.put("10079", "x-MacIceland"); // x-mac-icelandic Icelandic (Mac)
      LOCALEID_MAPPING.put("10081", "x-MacTurkish"); // x-mac-turkish Turkish (Mac)
      LOCALEID_MAPPING.put("10082", "x-MacCroatian"); // x-mac-croatian Croatian (Mac)
      LOCALEID_MAPPING.put("10241", "Cp1256"); // Arabic (Syria)
      LOCALEID_MAPPING.put("11265", "Cp1256"); // Arabic (Jordan)
      // 12000 utf-32 Unicode UTF-32, little endian byte order
      // 12001 utf-32BE Unicode UTF-32, big endian byte order
      LOCALEID_MAPPING.put("12289", "Cp1256"); // Arabic (Lebanon)
      LOCALEID_MAPPING.put("13313", "Cp1256"); // Arabic (Kuwait)
      LOCALEID_MAPPING.put("14337", "Cp1256"); // Arabic (U.A.E.)
      LOCALEID_MAPPING.put("15361", "Cp1256"); // Arabic (Bahrain)
      LOCALEID_MAPPING.put("16385", "Cp1256"); // Arabic (Qatar)
      // 20000 x-Chinese_CNS CNS Taiwan; Chinese Traditional (CNS)
      // 20001 x-cp20001 TCA Taiwan
      // 20002 x_Chinese-Eten Eten Taiwan; Chinese Traditional (Eten)
      // 20003 x-cp20003 IBM5550 Taiwan
      // 20004 x-cp20004 TeleText Taiwan
      // 20005 x-cp20005 Wang Taiwan
      // 20105 x-IA5 IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
      // 20106 x-IA5-German IA5 German (7-bit)
      // 20107 x-IA5-Swedish IA5 Swedish (7-bit)
      // 20108 x-IA5-Norwegian IA5 Norwegian (7-bit)
      // 20127 us-ascii US-ASCII (7-bit)
      // 20261 x-cp20261 T.61
      // 20269 x-cp20269 ISO 6937 Non-Spacing Accent
      // 20273 IBM273 IBM EBCDIC Germany
      // 20277 IBM277 IBM EBCDIC Denmark-Norway
      // 20278 IBM278 IBM EBCDIC Finland-Sweden
      // 20280 IBM280 IBM EBCDIC Italy
      // 20284 IBM284 IBM EBCDIC Latin America-Spain
      // 20285 IBM285 IBM EBCDIC United Kingdom
      // 20290 IBM290 IBM EBCDIC Japanese Katakana Extended
      // 20297 IBM297 IBM EBCDIC France
      // 20420 IBM420 IBM EBCDIC Arabic
      // 20423 IBM423 IBM EBCDIC Greek
      // 20424 IBM424 IBM EBCDIC Hebrew
      // 20833 x-EBCDIC-KoreanExtended IBM EBCDIC Korean Extended
      // 20838 IBM-Thai IBM EBCDIC Thai
      // 20866 koi8-r Russian (KOI8-R); Cyrillic (KOI8-R)
      // 20871 IBM871 IBM EBCDIC Icelandic
      // 20880 IBM880 IBM EBCDIC Cyrillic Russian
      // 20905 IBM905 IBM EBCDIC Turkish
      // 20924 IBM00924 IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
      // 20932 EUC-JP Japanese (JIS 0208-1990 and 0212-1990)
      // 20936 x-cp20936 Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
      // 20949 x-cp20949 Korean Wansung
      // 21025 cp1025 IBM EBCDIC Cyrillic Serbian-Bulgarian
      // 21027 (deprecated)
      // 21866 koi8-u Ukrainian (KOI8-U); Cyrillic (KOI8-U)
      // 28591 iso-8859-1 ISO 8859-1 Latin 1; Western European (ISO)
      // 28592 iso-8859-2 ISO 8859-2 Central European; Central European (ISO)
      // 28593 iso-8859-3 ISO 8859-3 Latin 3
      // 28594 iso-8859-4 ISO 8859-4 Baltic
      // 28595 iso-8859-5 ISO 8859-5 Cyrillic
      // 28596 iso-8859-6 ISO 8859-6 Arabic
      // 28597 iso-8859-7 ISO 8859-7 Greek
      // 28598 iso-8859-8 ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
      // 28599 iso-8859-9 ISO 8859-9 Turkish
      // 28603 iso-8859-13 ISO 8859-13 Estonian
      // 28605 iso-8859-15 ISO 8859-15 Latin 9
      // 29001 x-Europa Europa 3
      // 38598 iso-8859-8-i ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
      // 50220 iso-2022-jp ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
      // 50221 csISO2022JP ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
      // 50222 iso-2022-jp ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
      // 50225 iso-2022-kr ISO 2022 Korean
      // 50227 x-cp50227 ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
      // 50229 ISO 2022 Traditional Chinese
      // 50930 EBCDIC Japanese (Katakana) Extended
      // 50931 EBCDIC US-Canada and Japanese
      // 50933 EBCDIC Korean Extended and Korean
      // 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese
      // 50936 EBCDIC Simplified Chinese
      // 50937 EBCDIC US-Canada and Traditional Chinese
      // 50939 EBCDIC Japanese (Latin) Extended and Japanese
      // 51932 euc-jp EUC Japanese
      // 51936 EUC-CN EUC Simplified Chinese; Chinese Simplified (EUC)
      // 51949 euc-kr EUC Korean
      // 51950 EUC Traditional Chinese
      // 52936 hz-gb-2312 HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
      // 54936 GB18030 Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
      // 57002 x-iscii-de ISCII Devanagari
      // 57003 x-iscii-be ISCII Bangla
      // 57004 x-iscii-ta ISCII Tamil
      // 57005 x-iscii-te ISCII Telugu
      // 57006 x-iscii-as ISCII Assamese
      // 57007 x-iscii-or ISCII Odia
      // 57008 x-iscii-ka ISCII Kannada
      // 57009 x-iscii-ma ISCII Malayalam
      // 57010 x-iscii-gu ISCII Gujarati
      // 57011 x-iscii-pa ISCII Punjabi
      LOCALEID_MAPPING.put("65000", null); // UTF-7 - not a supported Java encoding, see: http://stackoverflow.com/questions/19861987/java-io-unsupportedencodingexception-unicode-1-1-utf-7
      LOCALEID_MAPPING.put("65001", "UTF-8"); // UTF-8
   }

   /**
    * Constants holder, not intended to be instantiated.
    */
   private Encoding()
   {
   }
}
246 |
--------------------------------------------------------------------------------