├── src
├── test
│ ├── resources
│ │ ├── v3
│ │ │ ├── 4e00-62ff.hwp
│ │ │ ├── empty-v3.hwp
│ │ │ └── han_special_char_3.0.hwp
│ │ ├── v5
│ │ │ ├── han_grammar.hwp
│ │ │ ├── han_special_char.hwp
│ │ │ └── test-distribute.hwp
│ │ └── log4j.properties
│ └── java
│ │ └── com
│ │ └── argo
│ │ └── hwp
│ │ ├── v3
│ │ ├── TestHwpV3Extractor.java
│ │ └── TestMapLoad.java
│ │ └── v5
│ │ └── TestHwpV5Extractor.java
└── main
│ └── java
│ └── com
│ └── argo
│ └── hwp
│ ├── HwpTextExtractor.java
│ ├── v3
│ ├── Hnc2String.java
│ └── HwpTextExtractorV3.java
│ ├── utils
│ └── HwpStreamReader.java
│ └── v5
│ └── HwpTextExtractorV5.java
├── target
├── classes
│ └── com
│ │ └── argo
│ │ └── hwp
│ │ ├── v3
│ │ ├── Hnc2String.class
│ │ └── HwpTextExtractorV3.class
│ │ ├── HwpTextExtractor.class
│ │ ├── utils
│ │ └── HwpStreamReader.class
│ │ └── v5
│ │ ├── HwpTextExtractorV5.class
│ │ ├── HwpTextExtractorV5$TagInfo.class
│ │ ├── HwpTextExtractorV5$FileHeader.class
│ │ └── HwpTextExtractorV5$HwpVersion.class
├── test-classes
│ ├── com
│ │ └── argo
│ │ │ └── hwp
│ │ │ ├── v3
│ │ │ └── TestHwpV3Extractor.class
│ │ │ └── v5
│ │ │ └── TestHwpV5Extractor.class
│ └── log4j.properties
└── surefire-reports
│ ├── com.argo.hwp.v3.TestHwpV3Extractor.txt
│ ├── com.argo.hwp.v5.TestHwpV5Extractor.txt
│ ├── TEST-com.argo.hwp.v5.TestHwpV5Extractor.xml
│ └── TEST-com.argo.hwp.v3.TestHwpV3Extractor.xml
├── .gitattributes
├── README.md
├── pom.xml
├── .gitignore
└── LICENSE-2.0.txt
/src/test/resources/v3/4e00-62ff.hwp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v3/4e00-62ff.hwp
--------------------------------------------------------------------------------
/src/test/resources/v3/empty-v3.hwp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v3/empty-v3.hwp
--------------------------------------------------------------------------------
/src/test/resources/v5/han_grammar.hwp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v5/han_grammar.hwp
--------------------------------------------------------------------------------
/src/test/resources/v5/han_special_char.hwp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v5/han_special_char.hwp
--------------------------------------------------------------------------------
/src/test/resources/v5/test-distribute.hwp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v5/test-distribute.hwp
--------------------------------------------------------------------------------
/src/test/resources/v3/han_special_char_3.0.hwp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v3/han_special_char_3.0.hwp
--------------------------------------------------------------------------------
/target/classes/com/argo/hwp/v3/Hnc2String.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v3/Hnc2String.class
--------------------------------------------------------------------------------
/target/classes/com/argo/hwp/HwpTextExtractor.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/HwpTextExtractor.class
--------------------------------------------------------------------------------
/target/classes/com/argo/hwp/utils/HwpStreamReader.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/utils/HwpStreamReader.class
--------------------------------------------------------------------------------
/target/classes/com/argo/hwp/v3/HwpTextExtractorV3.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v3/HwpTextExtractorV3.class
--------------------------------------------------------------------------------
/target/classes/com/argo/hwp/v5/HwpTextExtractorV5.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v5/HwpTextExtractorV5.class
--------------------------------------------------------------------------------
/target/test-classes/com/argo/hwp/v3/TestHwpV3Extractor.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/test-classes/com/argo/hwp/v3/TestHwpV3Extractor.class
--------------------------------------------------------------------------------
/target/test-classes/com/argo/hwp/v5/TestHwpV5Extractor.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/test-classes/com/argo/hwp/v5/TestHwpV5Extractor.class
--------------------------------------------------------------------------------
/target/classes/com/argo/hwp/v5/HwpTextExtractorV5$TagInfo.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v5/HwpTextExtractorV5$TagInfo.class
--------------------------------------------------------------------------------
/target/surefire-reports/com.argo.hwp.v3.TestHwpV3Extractor.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/surefire-reports/com.argo.hwp.v3.TestHwpV3Extractor.txt
--------------------------------------------------------------------------------
/target/classes/com/argo/hwp/v5/HwpTextExtractorV5$FileHeader.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v5/HwpTextExtractorV5$FileHeader.class
--------------------------------------------------------------------------------
/target/classes/com/argo/hwp/v5/HwpTextExtractorV5$HwpVersion.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v5/HwpTextExtractorV5$HwpVersion.class
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #log4j.rootLogger = DEBUG, stdout, dailyfile
2 | log4j.rootLogger = DEBUG, stdout
3 |
4 | log4j.appender.stdout = org.apache.log4j.ConsoleAppender
5 | log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
6 | log4j.appender.stdout.layout.ConversionPattern=%5p ({%t} %F[%M]:%L) [%d] - %m%n
7 |
8 |
--------------------------------------------------------------------------------
/target/test-classes/log4j.properties:
--------------------------------------------------------------------------------
1 | #log4j.rootLogger = DEBUG, stdout, dailyfile
2 | log4j.rootLogger = DEBUG, stdout
3 |
4 | log4j.appender.stdout = org.apache.log4j.ConsoleAppender
5 | log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
6 | log4j.appender.stdout.layout.ConversionPattern=%5p ({%t} %F[%M]:%L) [%d] - %m%n
7 |
8 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | *.sln merge=union
7 | *.csproj merge=union
8 | *.vbproj merge=union
9 | *.fsproj merge=union
10 | *.dbproj merge=union
11 |
12 | # Standard to msysgit
13 | *.doc diff=astextplain
14 | *.DOC diff=astextplain
15 | *.docx diff=astextplain
16 | *.DOCX diff=astextplain
17 | *.dot diff=astextplain
18 | *.DOT diff=astextplain
19 | *.pdf diff=astextplain
20 | *.PDF diff=astextplain
21 | *.rtf diff=astextplain
22 | *.RTF diff=astextplain
23 |
--------------------------------------------------------------------------------
/target/surefire-reports/com.argo.hwp.v5.TestHwpV5Extractor.txt:
--------------------------------------------------------------------------------
1 | -------------------------------------------------------------------------------
2 | Test set: com.argo.hwp.v5.TestHwpV5Extractor
3 | -------------------------------------------------------------------------------
4 | Tests run: 1, Failures: 1, Errors: 0, Skipped: 0, Time elapsed: 0 sec <<< FAILURE!
5 | com.argo.hwp.v5.TestHwpV5Extractor.testObjectStreams() Time elapsed: 0 sec <<< FAILURE!
6 | java.io.FileNotFoundException
7 | at com.argo.hwp.v5.HwpTextExtractorV5.extractText(HwpTextExtractorV5.java:67)
8 | at com.argo.hwp.v5.TestHwpV5Extractor.extract(TestHwpV5Extractor.java:16)
9 | at com.argo.hwp.v5.TestHwpV5Extractor.testObjectStreams(TestHwpV5Extractor.java:47)
10 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | java-hwp
2 | ========
3 |
4 | [![](https://jitpack.io/v/ddoleye/java-hwp.svg)](https://jitpack.io/#ddoleye/java-hwp)
5 |
6 |
7 | 본 제품은 한글과컴퓨터의 한글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다.
8 |
9 | 개발에 많은 도움을 주신 [cogniti](https://github.com/cogniti)님과 [libhwp Google Group](https://groups.google.com/forum/#!forum/libhwp) 그룹에 감사드립니다.
10 |
11 | HWP 파일에서 텍스트를 추출하는 자바 라이브러리이며 [ruby-hwp](https://github.com/cogniti/ruby-hwp) 의 자바 버전입니다.
12 | ruby-hwp의 로직을 대부분 그대로 사용하며 ruby-hwp의 문자매핑 정보(hnc2unicode.rb) 파일을 사용합니다.
13 |
14 | HWP 5.0 버전의 Compound File은 [Apache-POI의 POIFS File System](http://poi.apache.org/poifs/fileformat.html)을 사용하여 처리합니다.
15 |
16 |
17 | ## 사용방법
18 |
19 | File hwp = new File("hangul.hwp"); // 텍스트를 추출할 HWP 파일
20 | Writer writer = new StringWriter(); // 추출된 텍스트를 출력할 버퍼
21 | HwpTextExtractor.extract(hwp, writer); // 파일로부터 텍스트 추출
22 | String text = writer.toString(); // 추출된 텍스트
23 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 | com.argo
5 | java-hwp
6 | 0.1
7 |
8 |
9 | UTF-8
10 | 1.7
11 |
12 |
13 |
14 |
15 | org.apache.poi
16 | poi
17 | 3.9
18 |
19 |
20 |
21 | org.slf4j
22 | slf4j-log4j12
23 | 1.7.5
24 | provided
25 |
26 |
27 | junit
28 | junit
29 | 4.12
30 |
31 |
32 |
33 |
34 |
35 |
36 | ${project.artifactId}
37 |
38 |
39 | maven-compiler-plugin
40 |
41 | ${project.source.version}
42 | ${project.source.version}
43 | ${project.source.encoding}
44 |
45 |
46 |
47 | maven-resources-plugin
48 |
49 | ${project.source.encoding}
50 |
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/src/main/java/com/argo/hwp/HwpTextExtractor.java:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright [2015] argonet.co.kr
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | /*
17 | * This software has been developed with reference to
18 | * the HWP file format open specification by Hancom, Inc.
19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3
20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다.
21 | *
22 | * 본 제품은 다음의 소스를 참조하였습니다.
23 | * https://github.com/cogniti/ruby-hwp/
24 | */
25 | package com.argo.hwp;
26 |
27 | import java.io.File;
28 | import java.io.FileNotFoundException;
29 | import java.io.IOException;
30 | import java.io.Writer;
31 |
32 | import org.slf4j.Logger;
33 | import org.slf4j.LoggerFactory;
34 |
35 | import com.argo.hwp.v3.HwpTextExtractorV3;
36 | import com.argo.hwp.v5.HwpTextExtractorV5;
37 |
38 | public abstract class HwpTextExtractor {
39 | protected static Logger log = LoggerFactory.getLogger(HwpTextExtractor.class);
40 |
41 | public static boolean extract(File source, Writer writer)
42 | throws FileNotFoundException, IOException {
43 | if (source == null || writer == null)
44 | throw new IllegalArgumentException();
45 | if (!source.exists())
46 | throw new FileNotFoundException();
47 |
48 | // 먼저 V5 부터 시도
49 | boolean success = HwpTextExtractorV5.extractText(source, writer);
50 |
51 | // 아니라면 V3 시도
52 | if (!success)
53 | success = HwpTextExtractorV3.extractText(source, writer);
54 |
55 | return success;
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/test/java/com/argo/hwp/v3/TestHwpV3Extractor.java:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright [2015] argonet.co.kr
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | /*
17 | * This software has been developed with reference to
18 | * the HWP file format open specification by Hancom, Inc.
19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3
20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다.
21 | *
22 | * 본 제품은 다음의 소스를 참조하였습니다.
23 | * https://github.com/cogniti/ruby-hwp/
24 | */
25 | package com.argo.hwp.v3;
26 |
27 | import java.io.File;
28 | import java.io.FileNotFoundException;
29 | import java.io.IOException;
30 | import java.io.StringWriter;
31 | import java.util.StringTokenizer;
32 |
33 | import org.junit.Test;
34 |
35 | public class TestHwpV3Extractor {
36 | private String extract(String path) throws FileNotFoundException,
37 | IOException {
38 | // File file = new File(path);
39 | File file = new File(getClass().getResource("/" + path).getFile());
40 | // System.out.println(file.getAbsolutePath());
41 | StringWriter writer = new StringWriter(4096);
42 | HwpTextExtractorV3.extractText(file, writer);
43 | return writer.toString();
44 | }
45 |
46 | private String tokenize(String t) {
47 | StringWriter writer = new StringWriter(4096);
48 | StringTokenizer token = new StringTokenizer(t);
49 | while (token.hasMoreTokens()) {
50 | writer.append(token.nextToken()).append("\n");
51 | }
52 | return writer.toString();
53 | }
54 |
55 | private String extractIgnoreException(String path) {
56 | try {
57 | return extract(path);
58 | } catch (Exception e) {
59 | System.out.println(e.getMessage());
60 | return null;
61 | }
62 | }
63 |
64 | @Test
65 | public void testCharConversion() {
66 | // 조합형
67 | System.out.println(Hnc2String.convert(Integer.parseInt(
68 | "1000010001100010", 2)));
69 | }
70 |
71 | @Test
72 | public void testExtractText() throws IOException, ClassNotFoundException {
73 | System.out.println(extract("v3/4e00-62ff.hwp"));
74 | System.out.println(extract("v3/han_special_char_3.0.hwp"));
75 | }
76 | }
--------------------------------------------------------------------------------
/src/test/java/com/argo/hwp/v5/TestHwpV5Extractor.java:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright [2015] argonet.co.kr
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | /*
17 | * This software has been developed with reference to
18 | * the HWP file format open specification by Hancom, Inc.
19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3
20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다.
21 | *
22 | * 본 제품은 다음의 소스를 참조하였습니다.
23 | * https://github.com/cogniti/ruby-hwp/
24 | */
25 | package com.argo.hwp.v5;
26 |
27 | import java.io.File;
28 | import java.io.FileNotFoundException;
29 | import java.io.IOException;
30 | import java.io.StringWriter;
31 |
32 | import org.junit.Test;
33 |
34 | public class TestHwpV5Extractor {
35 | private String extract(String path) throws FileNotFoundException,
36 | IOException {
37 | // File file = new File(path);
38 | File file = new File(getClass().getResource("/"+path).getFile());
39 | StringWriter writer = new StringWriter(4096);
40 | HwpTextExtractorV5.extractText(file, writer);
41 | return writer.toString();
42 | }
43 |
44 | /**
45 | * 디버그.. 문자와 코드값 출력
46 | *
47 | * @param t
48 | * @return
49 | */
50 | private String withCode(String t) {
51 | StringWriter writer = new StringWriter(4096);
52 | for (int ii = 0; ii < t.length(); ii++) {
53 | char ch = t.charAt(ii);
54 | if (ch == ' ' || ch == '\n')
55 | continue;
56 | writer.append(ch);
57 | if (ch >= 128) {
58 | writer.append("\t").append(String.format("0x%1$04x", (int) ch));
59 | }
60 | writer.append("\n");
61 | }
62 | return writer.toString();
63 | }
64 |
65 | private String extractIgnoreException(String path) {
66 | try {
67 | return extract(path);
68 | } catch (Exception e) {
69 | System.out.println(e.getMessage());
70 | return null;
71 | }
72 | }
73 |
74 | @Test
75 | public void testExtractText() throws IOException, ClassNotFoundException {
76 | // System.out.println(extract("v5/han_grammar.hwp"));
77 | // System.out.println(extract("v5/han_special_char.hwp"));
78 |
79 | System.out.println(extract("v5/test-distribute.hwp"));
80 | }
81 | }
--------------------------------------------------------------------------------
/src/test/java/com/argo/hwp/v3/TestMapLoad.java:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright [2015] argonet.co.kr
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | /*
17 | * This software has been developed with reference to
18 | * the HWP file format open specification by Hancom, Inc.
19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3
20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다.
21 | *
22 | * 본 제품은 다음의 소스를 참조하였습니다.
23 | * https://github.com/cogniti/ruby-hwp/
24 | */
25 | package com.argo.hwp.v3;
26 |
27 | import java.io.BufferedReader;
28 | import java.io.IOException;
29 | import java.io.InputStream;
30 | import java.io.InputStreamReader;
31 | import java.util.regex.Matcher;
32 | import java.util.regex.Pattern;
33 |
34 | import org.junit.Assert;
35 | import org.junit.Before;
36 | import org.junit.Test;
37 |
38 | public class TestMapLoad {
39 | private static String[] map = new String[65536];
40 |
41 | @Before
42 | public void test() throws IOException {
43 | Pattern P = Pattern
44 | .compile("0x([0-9a-f]{4})\\s*=>\\s*\\[" + "0x([0-9a-f]{4})"
45 | + "(?:,\\s*0x([0-9a-f]{4}))?"
46 | + "(?:,\\s*0x([0-9a-f]{4}))?" + "\\]",
47 | Pattern.CASE_INSENSITIVE);
48 |
49 | InputStream resource = Hnc2String.class.getClassLoader()
50 | .getResourceAsStream("hnc2unicode.rb");
51 | int lineNumber = 0;
52 | char[] chars = new char[3];
53 |
54 | try {
55 | BufferedReader reader = new BufferedReader(new InputStreamReader(
56 | resource, "UTF-8"));
57 |
58 | for (;;) {
59 | String line = reader.readLine();
60 | if (line == null)
61 | break;
62 |
63 | lineNumber++;
64 | line = line.trim();
65 | if (line.length() == 0)
66 | continue;
67 | if (line.startsWith("#"))
68 | continue;
69 |
70 | Matcher matcher = P.matcher(line);
71 | if (matcher.find()) {
72 | int code = Integer.parseInt(matcher.group(1), 16);
73 | int len;
74 |
75 | for (len = 1; len < matcher.groupCount(); len++) {
76 | String hex = matcher.group(len + 1);
77 | if (hex == null)
78 | break;
79 |
80 | chars[len - 1] = (char) Integer.parseInt(hex, 16);
81 | }
82 |
83 | map[code] = new String(chars, 0, len - 1);
84 | } else {
85 | System.out.println("[" + lineNumber + "]>>>" + line);
86 | }
87 | }
88 | } finally {
89 | try {
90 | resource.close();
91 | } catch (IOException e) {
92 | // ignore ?
93 | }
94 | }
95 | }
96 |
97 | @Test
98 | public void test1() {
99 | Assert.assertEquals("가", map[0x8861]);
100 | Assert.assertEquals(new String(new char[] { 0xf7fe, 0xf863, 0xf8e6 }),
101 | map[(int) 0xfff8]);
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/src/main/java/com/argo/hwp/v3/Hnc2String.java:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright [2015] argonet.co.kr
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | /*
17 | * This software has been developed with reference to
18 | * the HWP file format open specification by Hancom, Inc.
19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3
20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다.
21 | *
22 | * 본 제품은 다음의 소스를 참조하였습니다.
23 | * https://github.com/cogniti/ruby-hwp/
24 | */
25 | package com.argo.hwp.v3;
26 |
27 | import java.io.BufferedReader;
28 | import java.io.IOException;
29 | import java.io.InputStream;
30 | import java.io.InputStreamReader;
31 | import java.io.UnsupportedEncodingException;
32 | import java.util.regex.Matcher;
33 | import java.util.regex.Pattern;
34 |
35 | import org.slf4j.Logger;
36 | import org.slf4j.LoggerFactory;
37 |
38 | class Hnc2String {
39 | static Logger log = LoggerFactory.getLogger(Hnc2String.class);
40 | static final String[] map = new String[65536]; // max : 0xFFFF
41 |
42 | static {
43 | Pattern P = Pattern
44 | .compile("0x([0-9a-f]{4})\\s*=>\\s*\\[" + "0x([0-9a-f]{4})"
45 | + "(?:,\\s*0x([0-9a-f]{4}))?"
46 | + "(?:,\\s*0x([0-9a-f]{4}))?" + "\\]",
47 | Pattern.CASE_INSENSITIVE);
48 |
49 | InputStream resource = Hnc2String.class.getClassLoader()
50 | .getResourceAsStream("hnc2unicode.rb");
51 | int lineNumber = 0;
52 | char[] chars = new char[3];
53 |
54 | try {
55 | BufferedReader reader = new BufferedReader(new InputStreamReader(
56 | resource, "UTF-8"));
57 |
58 | for (;;) {
59 | String line = reader.readLine();
60 | if (line == null)
61 | break;
62 |
63 | lineNumber++;
64 | line = line.trim();
65 | if (line.length() == 0)
66 | continue;
67 | if (line.startsWith("#"))
68 | continue;
69 |
70 | Matcher matcher = P.matcher(line);
71 | if (matcher.find()) {
72 | int code = Integer.parseInt(matcher.group(1), 16);
73 | int len;
74 |
75 | for (len = 1; len < matcher.groupCount(); len++) {
76 | String hex = matcher.group(len + 1);
77 | if (hex == null)
78 | break;
79 |
80 | chars[len - 1] = (char) Integer.parseInt(hex, 16);
81 | }
82 |
83 | map[code] = new String(chars, 0, len - 1);
84 | } else {
85 | System.out.println("[" + lineNumber + "]>>>" + line);
86 | }
87 | }
88 | } catch (UnsupportedEncodingException e) {
89 | throw new RuntimeException(e);
90 | } catch (IOException e) {
91 | throw new RuntimeException(e);
92 | } finally {
93 | try {
94 | resource.close();
95 | } catch (IOException e) {
96 | // ignore ?
97 | }
98 | }
99 | }
100 |
101 | static String convert(int c) {
102 | assert c >= 0 && c < 0xFFFF;
103 |
104 | return map[c];
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #################
2 | ## Eclipse
3 | #################
4 |
5 | *.pydevproject
6 | .project
7 | .metadata
8 | bin/
9 | tmp/
10 | *.tmp
11 | *.bak
12 | *.swp
13 | *~.nib
14 | local.properties
15 | .classpath
16 | .settings/
17 | .loadpath
18 |
19 | # External tool builders
20 | .externalToolBuilders/
21 |
22 | # Locally stored "Eclipse launch configurations"
23 | *.launch
24 |
25 | # CDT-specific
26 | .cproject
27 |
28 | # PDT-specific
29 | .buildpath
30 |
31 |
32 | #################
33 | ## Visual Studio
34 | #################
35 |
36 | ## Ignore Visual Studio temporary files, build results, and
37 | ## files generated by popular Visual Studio add-ons.
38 |
39 | # User-specific files
40 | *.suo
41 | *.user
42 | *.sln.docstates
43 |
44 | # Build results
45 |
46 | [Dd]ebug/
47 | [Rr]elease/
48 | x64/
49 | build/
50 | [Bb]in/
51 | [Oo]bj/
52 |
53 | # MSTest test Results
54 | [Tt]est[Rr]esult*/
55 | [Bb]uild[Ll]og.*
56 |
57 | *_i.c
58 | *_p.c
59 | *.ilk
60 | *.meta
61 | *.obj
62 | *.pch
63 | *.pdb
64 | *.pgc
65 | *.pgd
66 | *.rsp
67 | *.sbr
68 | *.tlb
69 | *.tli
70 | *.tlh
71 | *.tmp
72 | *.tmp_proj
73 | *.log
74 | *.vspscc
75 | *.vssscc
76 | .builds
77 | *.pidb
78 | *.log
79 | *.scc
80 |
81 | # Visual C++ cache files
82 | ipch/
83 | *.aps
84 | *.ncb
85 | *.opensdf
86 | *.sdf
87 | *.cachefile
88 |
89 | # Visual Studio profiler
90 | *.psess
91 | *.vsp
92 | *.vspx
93 |
94 | # Guidance Automation Toolkit
95 | *.gpState
96 |
97 | # ReSharper is a .NET coding add-in
98 | _ReSharper*/
99 | *.[Rr]e[Ss]harper
100 |
101 | # TeamCity is a build add-in
102 | _TeamCity*
103 |
104 | # DotCover is a Code Coverage Tool
105 | *.dotCover
106 |
107 | # NCrunch
108 | *.ncrunch*
109 | .*crunch*.local.xml
110 |
111 | # Installshield output folder
112 | [Ee]xpress/
113 |
114 | # DocProject is a documentation generator add-in
115 | DocProject/buildhelp/
116 | DocProject/Help/*.HxT
117 | DocProject/Help/*.HxC
118 | DocProject/Help/*.hhc
119 | DocProject/Help/*.hhk
120 | DocProject/Help/*.hhp
121 | DocProject/Help/Html2
122 | DocProject/Help/html
123 |
124 | # Click-Once directory
125 | publish/
126 |
127 | # Publish Web Output
128 | *.Publish.xml
129 | *.pubxml
130 |
131 | # NuGet Packages Directory
132 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line
133 | #packages/
134 |
135 | # Windows Azure Build Output
136 | csx
137 | *.build.csdef
138 |
139 | # Windows Store app package directory
140 | AppPackages/
141 |
142 | # Others
143 | sql/
144 | *.Cache
145 | ClientBin/
146 | [Ss]tyle[Cc]op.*
147 | ~$*
148 | *~
149 | *.dbmdl
150 | *.[Pp]ublish.xml
151 | *.pfx
152 | *.publishsettings
153 |
154 | # RIA/Silverlight projects
155 | Generated_Code/
156 |
157 | # Backup & report files from converting an old project file to a newer
158 | # Visual Studio version. Backup files are not needed, because we have git ;-)
159 | _UpgradeReport_Files/
160 | Backup*/
161 | UpgradeLog*.XML
162 | UpgradeLog*.htm
163 |
164 | # SQL Server files
165 | App_Data/*.mdf
166 | App_Data/*.ldf
167 |
168 | #############
169 | ## Windows detritus
170 | #############
171 |
172 | # Windows image file caches
173 | Thumbs.db
174 | ehthumbs.db
175 |
176 | # Folder config file
177 | Desktop.ini
178 |
179 | # Recycle Bin used on file shares
180 | $RECYCLE.BIN/
181 |
182 | # Mac crap
183 | .DS_Store
184 |
185 |
186 | #############
187 | ## Python
188 | #############
189 |
190 | *.py[co]
191 |
192 | # Packages
193 | *.egg
194 | *.egg-info
195 | dist/
196 | build/
197 | eggs/
198 | parts/
199 | var/
200 | sdist/
201 | develop-eggs/
202 | .installed.cfg
203 |
204 | # Installer logs
205 | pip-log.txt
206 |
207 | # Unit test / coverage reports
208 | .coverage
209 | .tox
210 |
211 | #Translations
212 | *.mo
213 |
214 | #Mr Developer
215 | .mr.developer.cfg
216 | /target/
217 |
--------------------------------------------------------------------------------
/src/main/java/com/argo/hwp/utils/HwpStreamReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright [2015] argonet.co.kr
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | /*
17 | * This software has been developed with reference to
18 | * the HWP file format open specification by Hancom, Inc.
19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3
20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다.
21 | *
22 | * 본 제품은 다음의 소스를 참조하였습니다.
23 | * https://github.com/cogniti/ruby-hwp/
24 | */
25 | package com.argo.hwp.utils;
26 |
27 | import java.io.EOFException;
28 | import java.io.IOException;
29 | import java.io.InputStream;
30 |
31 | import org.apache.poi.util.LittleEndian;
32 | import org.slf4j.Logger;
33 | import org.slf4j.LoggerFactory;
34 |
35 | public class HwpStreamReader {
36 | private Logger log = LoggerFactory.getLogger(getClass());
37 | private InputStream input;
38 | private byte[] buf;
39 |
40 | public HwpStreamReader(InputStream inputStream) {
41 | this.input = inputStream;
42 | buf = new byte[4];
43 | }
44 |
45 | /**
46 | * 읽을 데이터가 더 있는가?
47 | *
48 | * @return
49 | * @throws IOException
50 | */
51 | public boolean available() throws IOException {
52 | return input.available() > 0;
53 | }
54 |
55 | /**
56 | * unsigned 1 byte
57 | *
58 | * @return
59 | * @throws IOException
60 | */
61 | public short uint8() throws IOException {
62 | if (ensure(1) == 0)
63 | return -1;
64 |
65 | return LittleEndian.getUByte(buf);
66 | }
67 |
68 | /**
69 | * unsigned 2 byte
70 | *
71 | * @return
72 | * @throws IOException
73 | */
74 | public int uint16() throws IOException {
75 | if (ensure(2) == 0)
76 | return -1;
77 |
78 | return LittleEndian.getUShort(buf);
79 | }
80 |
81 | /**
82 | * unsigned 2 byte array
83 | *
84 | * @param i
85 | * @return
86 | * @throws IOException
87 | */
88 | public int[] uint16(int i) throws IOException {
89 | if (i <= 0)
90 | throw new IllegalArgumentException();
91 |
92 | int[] uints = new int[i];
93 | for (int ii = 0; ii < i; ii++) {
94 | if (ensure(2) == 0)
95 | throw new EOFException();
96 |
97 | uints[ii] = LittleEndian.getUShort(buf);
98 | }
99 |
100 | return uints;
101 | }
102 |
103 | /**
104 | * unsigned 4 byte
105 | *
106 | * @return
107 | * @throws IOException
108 | */
109 | public long uint32() throws IOException {
110 | if (ensure(4) == 0)
111 | return -1;
112 |
113 | return LittleEndian.getUInt(buf);
114 | }
115 |
116 | /**
117 | *
118 | * @param n
119 | * @return
120 | * @throws IOException
121 | */
122 | public long skip(long n) throws IOException {
123 | return input.skip(n);
124 | }
125 |
126 | /**
127 | * n만큼 skip 하지 못할 경우 IOException 을 발생한다
128 | *
129 | * @param n
130 | * @throws IOException
131 | */
132 | public void ensureSkip(long n) throws IOException {
133 | long skipped = skip(n);
134 | if (n != skipped) {
135 | log.error("Skip failed {} => {}", n, skipped);
136 | throw new IOException();
137 | }
138 | }
139 |
140 | /**
141 | * count만큼 바이트를 읽는다. InflaterInputStream의 경우 한번에 count만큼 read가 안되는 경우가 있다.
142 | * 그래서 count만큼 읽을 때까지 루프를 실행한다
143 | *
144 | * @param count
145 | * @return
146 | * @throws IOException
147 | * @throws EOFException
148 | */
149 | private int ensure(int count) throws IOException, EOFException {
150 | int total = 0;
151 | while (total < count) {
152 | // if (total > 0) {
153 | // log.warn("한번에 읽기 실패 {}/{}. 다시 읽기 시도함 {}", total, count, input);
154 | // }
155 |
156 | int read = input.read(buf, total, count - total);
157 | if (read <= 0)
158 | break;
159 |
160 | total += read;
161 | }
162 |
163 | if (total == 0) {
164 | // end
165 | } else if (total < count) {
166 | // unexpected end
167 | throw new EOFException();
168 | }
169 |
170 | return total;
171 | }
172 | }
--------------------------------------------------------------------------------
/target/surefire-reports/TEST-com.argo.hwp.v5.TestHwpV5Extractor.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 | java.io.FileNotFoundException
66 | at com.argo.hwp.v5.HwpTextExtractorV5.extractText(HwpTextExtractorV5.java:67)
67 | at com.argo.hwp.v5.TestHwpV5Extractor.extract(TestHwpV5Extractor.java:16)
68 | at com.argo.hwp.v5.TestHwpV5Extractor.testObjectStreams(TestHwpV5Extractor.java:47)
69 |
70 |
71 |
--------------------------------------------------------------------------------
/target/surefire-reports/TEST-com.argo.hwp.v3.TestHwpV3Extractor.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 | java.io.FileNotFoundException: empty-v3.hwp (지정된 파일을 찾을 수 없습니다)
66 | at java.io.FileInputStream.open(Native Method)
67 | at java.io.FileInputStream.<init>(FileInputStream.java:138)
68 | at com.argo.hwp.v3.HwpTextExtractorV3.extractText(HwpTextExtractorV3.java:35)
69 | at com.argo.hwp.v3.TestHwpV3Extractor.extract(TestHwpV3Extractor.java:18)
70 | at com.argo.hwp.v3.TestHwpV3Extractor.testObjectStreams(TestHwpV3Extractor.java:76)
71 |
72 |
73 |
--------------------------------------------------------------------------------
/src/main/java/com/argo/hwp/v3/HwpTextExtractorV3.java:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright [2015] argonet.co.kr
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | /*
17 | * This software has been developed with reference to
18 | * the HWP file format open specification by Hancom, Inc.
19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3
20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다.
21 | *
22 | * 본 제품은 다음의 소스를 참조하였습니다.
23 | * https://github.com/cogniti/ruby-hwp/
24 | */
25 | package com.argo.hwp.v3;
26 |
27 | import java.io.File;
28 | import java.io.FileInputStream;
29 | import java.io.IOException;
30 | import java.io.InputStream;
31 | import java.io.Writer;
32 | import java.util.Arrays;
33 | import java.util.zip.Inflater;
34 | import java.util.zip.InflaterInputStream;
35 |
36 | import org.slf4j.Logger;
37 | import org.slf4j.LoggerFactory;
38 |
39 | import com.argo.hwp.utils.HwpStreamReader;
40 |
41 | public abstract class HwpTextExtractorV3 {
42 | private static Logger log = LoggerFactory
43 | .getLogger(HwpTextExtractorV3.class);
44 |
45 | // 1byte 문자들..
46 | private static final byte[] HWP_V3_SIGNATURE = ("HWP Document File V3.00"
47 | + " \u001A\u0001\u0002\u0003\u0004\u0005").getBytes();
48 |
49 | public static boolean extractText(File source, Writer writer)
50 | throws IOException {
51 | InputStream input = new FileInputStream(source);
52 |
53 | try {
54 | // 한글V3 시그니처 확인
55 | try {
56 | byte[] buf = new byte[HWP_V3_SIGNATURE.length];
57 | int read = input.read(buf);
58 | if (read < HWP_V3_SIGNATURE.length)
59 | return false;
60 |
61 | // 시그니처 확인
62 | if (!Arrays.equals(HWP_V3_SIGNATURE, buf))
63 | return false;
64 | } catch (IOException e) {
65 | log.warn("파일정보 확인 중 오류. HWP 포맷이 아닌 것으로 간주함", e);
66 | return false;
67 | }
68 |
69 | extractText(input, writer);
70 |
71 | return true;
72 | } finally {
73 | try {
74 | // from javadoc. If this file has an associated channel then the
75 | // channel is closed as well.
76 | input.close();
77 | } catch (IOException e) {
78 | log.warn("exception while file.close", e);
79 | }
80 | }
81 | }
82 |
83 | private static void extractText(InputStream inputStream, Writer writer)
84 | throws IOException {
85 | // 시그니처를 위해서 30바이트 읽은 상태
86 |
87 | HwpStreamReader input = new HwpStreamReader(inputStream);
88 |
89 | // 문서 정보 p.72
90 |
91 | // 암호 걸린 파일 확인
92 | input.ensureSkip(96);
93 | int t = input.uint16();
94 | if (t != 0)
95 | throw new IOException("암호화된 문서는 해석할 수 없습니다");
96 |
97 | // 압축 확인
98 | input.ensureSkip(26); // 124
99 | boolean compressed = input.uint8() != 0;
100 | log.debug("압축 확인 : {}", compressed);
101 |
102 | // 정보 블럭 길이
103 | input.ensureSkip(1);
104 | int blockSize = input.uint16();
105 |
106 | // 문서 요약 건너뛰기
107 | input.ensureSkip(1008);
108 | // 정보 블럭 건너뛰기
109 | input.ensureSkip(blockSize);
110 |
111 | // 압축 풀기
112 | if (compressed) {
113 | log.info("본문 압축 해제");
114 | input = new HwpStreamReader(new InflaterInputStream(inputStream,
115 | new Inflater(true)));
116 | }
117 |
118 | // p.73 글꼴이름 건너뛰기
119 | for (int ii = 0; ii < 7; ii++)
120 | input.ensureSkip(input.uint16() * 40);
121 |
122 | // p.74 스타일 건너뛰기
123 | input.ensureSkip(input.uint16() * (20 + 31 + 187));
124 |
125 | // <문단 리스트> ::= <문단>+ <빈문단>
126 | // int paraCount = 0;
127 | while (input.available()) {
128 | // paraCount++;
129 | // log.debug("문단 {}", paraCount);
130 | if (!writeParaText(input, writer))
131 | break;
132 | }
133 | }
134 |
135 | private static boolean writeParaText(HwpStreamReader input, Writer writer)
136 | throws IOException {
137 | // # 문단 정보
138 | short prev_paragraph_shape = input.uint8();
139 | int n_chars = input.uint16();
140 | int n_lines = input.uint16();
141 | short char_shape_included = input.uint8();
142 |
143 | StringBuilder buf = new StringBuilder();
144 |
145 | // p.77 기타 플래그부터..
146 | input.ensureSkip(1 + 4 + 1 + 31);
147 | // # 여기까지 43 bytes
148 | if (prev_paragraph_shape == 0 && n_chars > 0)
149 | input.ensureSkip(187);
150 |
151 | // # 빈문단이면 false 반환
152 | if (n_chars == 0) {
153 | // log.debug("빈문단");
154 | return false;
155 | }
156 |
157 | // # 줄 정보
158 | input.ensureSkip(n_lines * 14);
159 |
160 | // # 글자 모양 정보 p.78
161 | if (char_shape_included != 0) {
162 | for (int ii = 0; ii < n_chars; ii++) {
163 | short flag = input.uint8();
164 | if (flag != 1)
165 | input.ensureSkip(31);
166 | }
167 | }
168 |
169 | log.trace("n_chars = {}", n_chars);
170 |
171 | // # 글자들
172 | int n_chars_read = 0;
173 |
174 | while (n_chars_read < n_chars) {
175 | int c = input.uint16(); // # 2바이트씩 읽는다.
176 | // log.debug("구분 : {}", Integer.toHexString(c));
177 | n_chars_read++;
178 |
179 | switch (c) {
180 | case 5: // 필드코드(덧말, 계산식, 환경정보, 누름틀)
181 | {
182 | long len = input.uint32(); // 정보 길이
183 | input.uint16(); // 5
184 | n_chars_read += 3;
185 | input.ensureSkip(len);
186 | }
187 | break;
188 | case 6: // 책갈피
189 | n_chars_read += 3;
190 | input.ensureSkip(6 + 34);
191 | break;
192 | case 9: // tab
193 | n_chars_read += 3;
194 | input.ensureSkip(6);
195 | writer.write('\t');
196 | break;
197 | case 10: // 표
198 | n_chars_read += 3;
199 | input.ensureSkip(6);
200 |
201 | // # 테이블 식별 정보 84 바이트
202 | input.ensureSkip(80);
203 | int n_cells = input.uint16();
204 | input.ensureSkip(2);
205 | input.ensureSkip(27 * n_cells);
206 |
207 | // # <셀 문단 리스트>+
208 | for (int ii = 0; ii < n_cells; ii++) {
209 | // # <셀 문단 리스트> ::= <셀 문단>+ <빈문단>
210 | // log.debug("셀 {}/{}", ii, n_cells);
211 | while (writeParaText(input, writer))
212 | ;
213 | }
214 | // # <캡션 문단 리스트> ::= <캡션 문단>+ <빈문단>
215 | while (writeParaText(input, writer))
216 | ;
217 | break;
218 |
219 | case 11: // 그림
220 | {
221 | n_chars_read += 3;
222 | input.ensureSkip(6);
223 | long len = input.uint32();
224 | input.ensureSkip(344);
225 | input.ensureSkip(len);
226 | // # <캡션 문단 리스트> ::= <캡션 문단>+ <빈문단>
227 | while (writeParaText(input, writer))
228 | ;
229 | }
230 | break;
231 | case 13: // # 글자들 끝
232 | writer.write('\n');
233 | break;
234 | case 16: // # 머리말/꼬리말
235 | n_chars_read += 3;
236 | input.ensureSkip(6);
237 | input.ensureSkip(10);
238 |
239 | // # <문단 리스트> ::= <문단>+ <빈문단>
240 | while (writeParaText(input, writer))
241 | ;
242 | break;
243 |
244 | case 17: // # 각주/미주
245 | n_chars_read += 3;
246 | input.ensureSkip(6);
247 | // # 각주/미주 정보 건너 뛰기
248 | input.ensureSkip(14);
249 | while (writeParaText(input, writer))
250 | ;
251 | break;
252 | case 18:
253 | case 19:
254 | case 20:
255 | case 21:
256 | n_chars_read += 3;
257 | input.ensureSkip(6);
258 | break;
259 | case 23: // # 글자 겹침
260 | n_chars_read += 4;
261 | input.ensureSkip(8);
262 | break;
263 | case 24:
264 | case 25:
265 | n_chars_read += 2;
266 | input.ensureSkip(4);
267 | break;
268 | case 28: // # 개요 모양/번호
269 | n_chars_read += 31;
270 | input.ensureSkip(62);
271 | break;
272 | case 30:
273 | case 31:
274 | n_chars_read += 1;
275 | input.ensureSkip(2);
276 | break;
277 | default:
278 | if (c >= 0x0020 && c <= 0xffff) {// # hnc code range
279 | String s = Hnc2String.convert(c);
280 | if (s == null) {
281 | log.warn("매핑 문자 없음 {}", Integer.toHexString(c));
282 | writer.write(unknown(c));
283 | } else {
284 | buf.append(s);
285 | writer.write(s);
286 | }
287 | } else {
288 | log.error("특수 문자 ? : {}", Integer.toHexString(c));
289 | // throw new NotImplementedException();
290 | }
291 | }
292 | }
293 |
294 | log.trace(">>> {}", buf.toString());
295 |
296 | return true;
297 | }
298 |
299 | private static String unknown(int c) {
300 | return String.format("?+0x%1$04x", c);
301 | }
302 | }
303 |
--------------------------------------------------------------------------------
/LICENSE-2.0.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/src/main/java/com/argo/hwp/v5/HwpTextExtractorV5.java:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright [2015] argonet.co.kr
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | /*
17 | * This software has been developed with reference to
18 | * the HWP file format open specification by Hancom, Inc.
19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3
20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다.
21 | *
22 | * 본 제품은 다음의 소스를 참조하였습니다.
23 | * https://github.com/cogniti/ruby-hwp/
24 | */
25 | package com.argo.hwp.v5;
26 |
27 | import java.io.File;
28 | import java.io.FileNotFoundException;
29 | import java.io.IOException;
30 | import java.io.InputStream;
31 | import java.io.Writer;
32 | import java.security.InvalidKeyException;
33 | import java.security.Key;
34 | import java.security.NoSuchAlgorithmException;
35 | import java.util.Arrays;
36 | import java.util.Iterator;
37 | import java.util.zip.Inflater;
38 | import java.util.zip.InflaterInputStream;
39 |
40 | import javax.crypto.Cipher;
41 | import javax.crypto.CipherInputStream;
42 | import javax.crypto.NoSuchPaddingException;
43 | import javax.crypto.spec.SecretKeySpec;
44 |
45 | import org.apache.poi.poifs.filesystem.DirectoryEntry;
46 | import org.apache.poi.poifs.filesystem.DirectoryNode;
47 | import org.apache.poi.poifs.filesystem.DocumentEntry;
48 | import org.apache.poi.poifs.filesystem.DocumentInputStream;
49 | import org.apache.poi.poifs.filesystem.Entry;
50 | import org.apache.poi.poifs.filesystem.NDocumentInputStream;
51 | import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
52 | import org.apache.poi.util.LittleEndian;
53 | import org.slf4j.Logger;
54 | import org.slf4j.LoggerFactory;
55 |
56 | import com.argo.hwp.utils.HwpStreamReader;
57 |
58 | public abstract class HwpTextExtractorV5 {
// Logger shared by the static extraction methods of this class.
protected static Logger log = LoggerFactory
        .getLogger(HwpTextExtractorV5.class);

// Bytes the "FileHeader" stream must start with; getHeader() compares the
// beginning of the header against this value.
private static final byte[] HWP_V5_SIGNATURE = "HWP Document File"
        .getBytes();

// Character code tables. NOTE(review): judging by the names these classify
// the codes below 32 into control, inline and extended characters; the code
// that uses them is not in this part of the file - confirm against the usage.
private static final int[] HWP_CONTROL_CHARS = new int[] { 0, 10, 13, 24,
        25, 26, 27, 28, 29, 30, 31 };
private static final int[] HWP_INLINE_CHARS = new int[] { 4, 5, 6, 7, 8, 9,
        19, 20 };
private static final int[] HWP_EXTENDED_CHARS = new int[] { 1, 2, 3, 11,
        12, 14, 15, 16, 17, 18, 21, 22, 23 };

// NOTE(review): presumably the base value that record tag ids are offsets
// from - confirm against the record parsing code.
private static final int HWPTAG_BEGIN = 0x010;
73 |
74 | /**
75 | * HWP 파일에서 텍스트 추출
76 | *
77 | * @param source
78 | * @param writer
79 | * @return
80 | * @throws FileNotFoundException
81 | * @throws IOException
82 | */
83 | public static boolean extractText(File source, Writer writer)
84 | throws FileNotFoundException, IOException {
85 | if (source == null)
86 | throw new IllegalArgumentException();
87 | if (!source.exists())
88 | throw new FileNotFoundException();
89 |
90 | NPOIFSFileSystem fs = null;
91 | try {
92 | FileHeader header;
93 |
94 | // HWP Document가 맞는지 확인한다
95 | try {
96 | // 우선은 Compound File
97 | fs = new NPOIFSFileSystem(source);
98 | header = getHeader(fs);
99 | } catch (IOException e) {
100 | log.warn("파일정보 확인 중 오류. HWP 포맷이 아닌 것으로 간주함", e);
101 | return false;
102 | }
103 |
104 | if (header == null)
105 | return false;
106 |
107 | // 여기까지 왔다면 HWP 문서가 맞다고 본다
108 | // 이제부터의 IOException 은 HWP 읽는 중 오류이다.
109 |
110 | // 배포용 문서.. BodyText 가 아닌 ViewText에 Section 이 존재
111 | // https://groups.google.com/forum/#!msg/hwp-foss/d2KL2ypR89Q/lCTkebPcIYYJ
112 | if (header.viewtext) {
113 | extractViewText(header, fs, writer);
114 | } else {
115 | extractBodyText(header, fs, writer);
116 | }
117 |
118 | return true;
119 | } finally {
120 | if (fs != null) {
121 | try {
122 | fs.close();
123 | } catch (IOException e) {
124 | log.warn("Exception", e);
125 | }
126 | }
127 | }
128 | }
129 |
130 | /**
131 | * HWP의 FileHeader 추출
132 | *
133 | * @param fs
134 | * @return
135 | * @throws IOException
136 | */
137 | private static FileHeader getHeader(NPOIFSFileSystem fs) throws IOException {
138 | DirectoryNode root = fs.getRoot();
139 |
140 | // 파일인식정보 p.18
141 |
142 | // FileHeader 존재 여부
143 | Entry headerEntry = root.getEntry("FileHeader");
144 | if (!headerEntry.isDocumentEntry())
145 | return null;
146 |
147 | // 시그니처 확인
148 | byte[] header = new byte[256]; // FileHeader 길이는 256
149 | DocumentInputStream headerStream = new DocumentInputStream(
150 | (DocumentEntry) headerEntry);
151 | try {
152 | int read = headerStream.read(header);
153 | if (read != 256
154 | || !Arrays.equals(HWP_V5_SIGNATURE, Arrays.copyOfRange(
155 | header, 0, HWP_V5_SIGNATURE.length)))
156 | return null;
157 | } finally {
158 | headerStream.close();
159 | }
160 |
161 | FileHeader fileHeader = new FileHeader();
162 |
163 | // 버전. debug
164 | fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt(
165 | header, 32));
166 | long flags = LittleEndian.getUInt(header, 36);
167 | log.debug("Flags={}", Long.toBinaryString(flags).replace(' ', '0'));
168 |
169 | fileHeader.compressed = (flags & 0x01) == 0x01;
170 | fileHeader.encrypted = (flags & 0x02) == 0x02;
171 | fileHeader.viewtext = (flags & 0x04) == 0x04;
172 |
173 | return fileHeader;
174 | }
175 |
176 | /**
177 | * 텍스트 추출
178 | *
179 | * @param writer
180 | * @param source
181 | *
182 | * @return
183 | * @throws IOException
184 | */
185 | private static void extractBodyText(FileHeader header, NPOIFSFileSystem fs,
186 | Writer writer) throws IOException {
187 | DirectoryNode root = fs.getRoot();
188 |
189 | // BodyText 읽기
190 | Entry bodyText = root.getEntry("BodyText");
191 | if (bodyText == null || !bodyText.isDirectoryEntry())
192 | throw new IOException("Invalid BodyText");
193 |
194 | Iterator iterator = ((DirectoryEntry) bodyText).getEntries();
195 | while (iterator.hasNext()) {
196 | Entry entry = iterator.next();
197 | if (entry.getName().startsWith("Section")
198 | && entry instanceof DocumentEntry) {
199 | log.debug("extract {}", entry.getName());
200 |
201 | InputStream input = new NDocumentInputStream(
202 | (DocumentEntry) entry);
203 | try {
204 | if (header.compressed)
205 | input = new InflaterInputStream(input, new Inflater(
206 | true));
207 |
208 | HwpStreamReader sectionStream = new HwpStreamReader(input);
209 |
210 | extractText(sectionStream, writer);
211 | } finally {
212 | // 닫을 필요는 없을 것이다
213 | try {
214 | input.close();
215 | } catch (IOException e) {
216 | log.error("있을 수 없는 일?", e);
217 | }
218 | }
219 | } else {
220 | log.warn("알수없는 Entry '{}'({})", entry.getName(), entry);
221 | }
222 | }
223 | }
224 |
225 | /**
226 | * 텍스트 추출
227 | *
228 | * @param writer
229 | * @param source
230 | *
231 | * @return
232 | * @throws IOException
233 | */
234 | private static void extractViewText(FileHeader header, NPOIFSFileSystem fs,
235 | Writer writer) throws IOException {
236 | DirectoryNode root = fs.getRoot();
237 |
238 | // BodyText 읽기
239 | Entry bodyText = root.getEntry("ViewText");
240 | if (bodyText == null || !bodyText.isDirectoryEntry())
241 | throw new IOException("Invalid ViewText");
242 |
243 | Iterator iterator = ((DirectoryEntry) bodyText).getEntries();
244 | while (iterator.hasNext()) {
245 | Entry entry = iterator.next();
246 | if (entry.getName().startsWith("Section")
247 | && entry instanceof DocumentEntry) {
248 | log.debug("extract {}", entry.getName());
249 |
250 | InputStream input = new NDocumentInputStream(
251 | (DocumentEntry) entry);
252 |
253 | // FIXME 섹션마다 키가 있는가?
254 | Key key = readKey(input);
255 | try {
256 | input = createDecryptStream(input, key);
257 | if (header.compressed)
258 | input = new InflaterInputStream(input, new Inflater(
259 | true));
260 |
261 | HwpStreamReader sectionStream = new HwpStreamReader(input);
262 | extractText(sectionStream, writer);
263 | } catch (InvalidKeyException e) {
264 | throw new IOException(e);
265 | } catch (NoSuchAlgorithmException e) {
266 | throw new IOException(e);
267 | } catch (NoSuchPaddingException e) {
268 | throw new IOException(e);
269 | } finally {
270 | // 닫을 필요는 없을 것이다
271 | try {
272 | input.close();
273 | } catch (IOException e) {
274 | log.error("있을 수 없는 일?", e);
275 | }
276 | }
277 | } else {
278 | log.warn("알수없는 Entry '{}'({})", entry.getName(), entry);
279 | }
280 | }
281 | }
282 |
283 | // https://groups.google.com/forum/#!msg/hwp-foss/d2KL2ypR89Q/lCTkebPcIYYJ
284 | private static class SRand {
285 | private int random_seed;
286 |
287 | private SRand(int seed) {
288 | random_seed = seed;
289 | }
290 |
291 | private int rand() {
292 | random_seed = (random_seed * 214013 + 2531011) & 0xFFFFFFFF;
293 | return (random_seed >> 16) & 0x7FFF;
294 | }
295 | }
296 |
297 | private static Key readKey(InputStream input) throws IOException {
298 | byte[] data = new byte[260];
299 |
300 | input.read(data, 0, 4); // TAG,
301 | // HWPTAG_DISTRIBUTE_DOC_DATA 확인
302 | // long recordHeader = LittleEndian.getUInt(data);
303 | // log.debug("TAG: {}", recordHeader & 0x3FF);
304 | // log.debug("LEVEL: {}", (recordHeader >> 10) & 0x3FF);
305 | // log.debug("SIZE: {}", (recordHeader >> 20) & 0xFFF);
306 |
307 | // https://groups.google.com/forum/#!msg/hwp-foss/d2KL2ypR89Q/lCTkebPcIYYJ
308 |
309 | input.read(data, 0, 256);
310 |
311 | SRand srand = new SRand(LittleEndian.getInt(data));
312 | byte xor = 0;
313 | for (int i = 0, n = 0; i < 256; i++, n--) {
314 | if (n == 0) {
315 | xor = (byte) (srand.rand() & 0xFF);
316 | n = (int) ((srand.rand() & 0xF) + 1);
317 | }
318 | if (i >= 4) {
319 | data[i] = (byte) ((data[i]) ^ (xor));
320 | }
321 | }
322 |
323 | int offset = 4 + (data[0] & 0xF);
324 | byte[] key = Arrays.copyOfRange(data, offset, offset + 16);
325 |
326 | SecretKeySpec secretKey = new SecretKeySpec(key, "AES");
327 | return secretKey;
328 | }
329 |
330 | public static InputStream createDecryptStream(InputStream input, Key key)
331 | throws IOException, NoSuchAlgorithmException,
332 | NoSuchPaddingException, InvalidKeyException {
333 | Cipher cipher = null;
334 |
335 | cipher = Cipher.getInstance("AES/ECB/NoPadding");
336 | cipher.init(Cipher.DECRYPT_MODE, key);
337 |
338 | return new CipherInputStream(input, cipher);
339 | }
340 |
341 | /**
342 | * Section 스트림에서 문자를 추출
343 | *
344 | * @param sectionStream
345 | * @param writer
346 | * @throws IOException
347 | */
348 | private static void extractText(HwpStreamReader sectionStream, Writer writer)
349 | throws IOException {
350 | StringBuffer buf = new StringBuffer(1024);
351 | TagInfo tag = new TagInfo();
352 |
353 | while (true) {
354 | if (!readTag(sectionStream, tag))
355 | break;
356 |
357 | buf.setLength(0);
358 | if (HWPTAG_BEGIN + 50 == tag.id) {
359 | writeParaHeader(sectionStream, tag.length, buf);
360 | } else if (HWPTAG_BEGIN + 51 == tag.id) {
361 | if (tag.length % 2 != 0)
362 | throw new IOException("Invalid block size");
363 |
364 | writeParaText(sectionStream, tag.length, buf);
365 |
366 | if (buf.length() > 0) // 줄바꿈 추가?
367 | writer.append(buf.toString()).append('\n');
368 | } else {
369 | sectionStream.ensureSkip(tag.length);
370 | }
371 |
372 | if (buf.length() > 0) {
373 | log.debug("TAG[{}]({}):{} [{}]", new Object[] { tag.id,
374 | tag.level, tag.length, buf });
375 | }
376 | }
377 | }
378 |
379 | private static void writeParaHeader(HwpStreamReader sectionStream,
380 | long length, StringBuffer buf) throws IOException {
381 | // log.debug("text={}", sectionStream.uint32());
382 | // log.debug("control mask={}", sectionStream.uint32());
383 | // log.debug("문단모양아이디참조값={}", sectionStream.uint16());
384 | // log.debug("문단스타일아이디참조값={}", sectionStream.uint8());
385 | // log.debug("단나누기종류={}", sectionStream.uint8());
386 | // log.debug("글자모양정보수={}", sectionStream.uint16());
387 | // log.debug("range tag정보수={}", sectionStream.uint16());
388 | // log.debug("각줄에 대한 align정보수={}", sectionStream.uint16());
389 | // log.debug("문단 Instance ID={}", sectionStream.uint32());
390 | // sectionStream.ensureSkip(2);
391 |
392 | sectionStream.ensureSkip(length);
393 | }
394 |
395 | /**
396 | * HWPTAG_PARA_TEXT 의 문자스트림을 문자열로 변환
397 | *
398 | * @param sectionStream
399 | * @param datasize
400 | * @param buf
401 | * @throws IOException
402 | */
403 | private static void writeParaText(HwpStreamReader sectionStream,
404 | long datasize, StringBuffer buf) throws IOException {
405 | int[] chars = sectionStream.uint16((int) (datasize / 2));
406 |
407 | for (int index = 0; index < chars.length; index++) {
408 | int ch = chars[index];
409 | if (Arrays.binarySearch(HWP_INLINE_CHARS, ch) >= 0) {
410 | if (ch == 9) {
411 | buf.append('\t');
412 | }
413 | index += 7;
414 | } else if (Arrays.binarySearch(HWP_EXTENDED_CHARS, ch) >= 0) {
415 | index += 7;
416 | } else if (Arrays.binarySearch(HWP_CONTROL_CHARS, ch) >= 0) {
417 | buf.append(' ');
418 | } else {
419 | buf.append((char) ch);
420 | }
421 | }
422 | }
423 |
424 | private static boolean readTag(HwpStreamReader sectionStream, TagInfo tag)
425 | throws IOException {
426 | // p.24
427 |
428 | long recordHeader = sectionStream.uint32();
429 | if (recordHeader == -1)
430 | return false;
431 |
432 | // log.debug("Record Header={} [{}]", recordHeader,
433 | // Long.toHexString(recordHeader));
434 |
435 | tag.id = recordHeader & 0x3FF;
436 | tag.level = (recordHeader >> 10) & 0x3FF;
437 | tag.length = (recordHeader >> 20) & 0xFFF;
438 |
439 | // 확장 데이터 레코드 p.24
440 | if (tag.length == 0xFFF)
441 | tag.length = sectionStream.uint32();
442 |
443 | return true;
444 | }
445 |
446 | static class FileHeader {
447 | HwpVersion version;
448 | boolean compressed; // bit 0
449 | boolean encrypted; // bit 1
450 | boolean viewtext; // bit 2
451 | }
452 |
453 | static class TagInfo {
454 | long id;
455 | long level;
456 | long length;
457 | }
458 |
459 | static class HwpVersion {
460 | int m;
461 | int n;
462 | int p;
463 | int r;
464 |
465 | public String toString() {
466 | return String.format("%d.%d.%d.%d", m, n, p, r);
467 | }
468 |
469 | public static HwpVersion parseVersion(long longVersion) {
470 | HwpVersion version = new HwpVersion();
471 | version.m = (int) ((longVersion & 0xFF000000L) >> 24);
472 | version.n = (int) ((longVersion & 0x00FF0000L) >> 16);
473 | version.p = (int) ((longVersion & 0x0000FF00L) >> 8);
474 | version.r = (int) ((longVersion & 0x000000FFL));
475 | return version;
476 | }
477 | }
478 |
479 | }
480 |
--------------------------------------------------------------------------------