├── src ├── test │ ├── resources │ │ ├── v3 │ │ │ ├── 4e00-62ff.hwp │ │ │ ├── empty-v3.hwp │ │ │ └── han_special_char_3.0.hwp │ │ ├── v5 │ │ │ ├── han_grammar.hwp │ │ │ ├── han_special_char.hwp │ │ │ └── test-distribute.hwp │ │ └── log4j.properties │ └── java │ │ └── com │ │ └── argo │ │ └── hwp │ │ ├── v3 │ │ ├── TestHwpV3Extractor.java │ │ └── TestMapLoad.java │ │ └── v5 │ │ └── TestHwpV5Extractor.java └── main │ └── java │ └── com │ └── argo │ └── hwp │ ├── HwpTextExtractor.java │ ├── v3 │ ├── Hnc2String.java │ └── HwpTextExtractorV3.java │ ├── utils │ └── HwpStreamReader.java │ └── v5 │ └── HwpTextExtractorV5.java ├── target ├── classes │ └── com │ │ └── argo │ │ └── hwp │ │ ├── v3 │ │ ├── Hnc2String.class │ │ └── HwpTextExtractorV3.class │ │ ├── HwpTextExtractor.class │ │ ├── utils │ │ └── HwpStreamReader.class │ │ └── v5 │ │ ├── HwpTextExtractorV5.class │ │ ├── HwpTextExtractorV5$TagInfo.class │ │ ├── HwpTextExtractorV5$FileHeader.class │ │ └── HwpTextExtractorV5$HwpVersion.class ├── test-classes │ ├── com │ │ └── argo │ │ │ └── hwp │ │ │ ├── v3 │ │ │ └── TestHwpV3Extractor.class │ │ │ └── v5 │ │ │ └── TestHwpV5Extractor.class │ └── log4j.properties └── surefire-reports │ ├── com.argo.hwp.v3.TestHwpV3Extractor.txt │ ├── com.argo.hwp.v5.TestHwpV5Extractor.txt │ ├── TEST-com.argo.hwp.v5.TestHwpV5Extractor.xml │ └── TEST-com.argo.hwp.v3.TestHwpV3Extractor.xml ├── .gitattributes ├── README.md ├── pom.xml ├── .gitignore └── LICENSE-2.0.txt /src/test/resources/v3/4e00-62ff.hwp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v3/4e00-62ff.hwp -------------------------------------------------------------------------------- /src/test/resources/v3/empty-v3.hwp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v3/empty-v3.hwp -------------------------------------------------------------------------------- /src/test/resources/v5/han_grammar.hwp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v5/han_grammar.hwp -------------------------------------------------------------------------------- /src/test/resources/v5/han_special_char.hwp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v5/han_special_char.hwp -------------------------------------------------------------------------------- /src/test/resources/v5/test-distribute.hwp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v5/test-distribute.hwp -------------------------------------------------------------------------------- /src/test/resources/v3/han_special_char_3.0.hwp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/src/test/resources/v3/han_special_char_3.0.hwp -------------------------------------------------------------------------------- /target/classes/com/argo/hwp/v3/Hnc2String.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v3/Hnc2String.class -------------------------------------------------------------------------------- /target/classes/com/argo/hwp/HwpTextExtractor.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/HwpTextExtractor.class -------------------------------------------------------------------------------- /target/classes/com/argo/hwp/utils/HwpStreamReader.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/utils/HwpStreamReader.class -------------------------------------------------------------------------------- /target/classes/com/argo/hwp/v3/HwpTextExtractorV3.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v3/HwpTextExtractorV3.class -------------------------------------------------------------------------------- /target/classes/com/argo/hwp/v5/HwpTextExtractorV5.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v5/HwpTextExtractorV5.class -------------------------------------------------------------------------------- /target/test-classes/com/argo/hwp/v3/TestHwpV3Extractor.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/test-classes/com/argo/hwp/v3/TestHwpV3Extractor.class -------------------------------------------------------------------------------- /target/test-classes/com/argo/hwp/v5/TestHwpV5Extractor.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/test-classes/com/argo/hwp/v5/TestHwpV5Extractor.class -------------------------------------------------------------------------------- /target/classes/com/argo/hwp/v5/HwpTextExtractorV5$TagInfo.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v5/HwpTextExtractorV5$TagInfo.class -------------------------------------------------------------------------------- /target/surefire-reports/com.argo.hwp.v3.TestHwpV3Extractor.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/surefire-reports/com.argo.hwp.v3.TestHwpV3Extractor.txt -------------------------------------------------------------------------------- /target/classes/com/argo/hwp/v5/HwpTextExtractorV5$FileHeader.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v5/HwpTextExtractorV5$FileHeader.class -------------------------------------------------------------------------------- /target/classes/com/argo/hwp/v5/HwpTextExtractorV5$HwpVersion.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddoleye/java-hwp/HEAD/target/classes/com/argo/hwp/v5/HwpTextExtractorV5$HwpVersion.class -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | #log4j.rootLogger = DEBUG, stdout, dailyfile 2 | log4j.rootLogger = DEBUG, stdout 3 | 4 | log4j.appender.stdout = org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout = org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%5p ({%t} %F[%M]:%L) [%d] - %m%n 7 | 8 | -------------------------------------------------------------------------------- /target/test-classes/log4j.properties: -------------------------------------------------------------------------------- 1 | #log4j.rootLogger = DEBUG, stdout, dailyfile 2 | log4j.rootLogger = DEBUG, stdout 3 | 4 | log4j.appender.stdout = org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout = org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%5p ({%t} %F[%M]:%L) [%d] - %m%n 7 | 8 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /target/surefire-reports/com.argo.hwp.v5.TestHwpV5Extractor.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Test set: com.argo.hwp.v5.TestHwpV5Extractor 3 | ------------------------------------------------------------------------------- 4 | Tests run: 1, Failures: 1, Errors: 0, Skipped: 0, Time elapsed: 0 sec <<< FAILURE! 5 | com.argo.hwp.v5.TestHwpV5Extractor.testObjectStreams() Time elapsed: 0 sec <<< FAILURE! 6 | java.io.FileNotFoundException 7 | at com.argo.hwp.v5.HwpTextExtractorV5.extractText(HwpTextExtractorV5.java:67) 8 | at com.argo.hwp.v5.TestHwpV5Extractor.extract(TestHwpV5Extractor.java:16) 9 | at com.argo.hwp.v5.TestHwpV5Extractor.testObjectStreams(TestHwpV5Extractor.java:47) 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | java-hwp 2 | ======== 3 | 4 | [![](https://jitpack.io/v/ddoleye/java-hwp.svg)](https://jitpack.io/#ddoleye/java-hwp) 5 | 6 | 7 | 본 제품은 한글과컴퓨터의 한글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다. 8 | 9 | 개발에 많은 도움을 주신 [cogniti](https://github.com/cogniti)님과 [libhwp Google Group](https://groups.google.com/forum/#!forum/libhwp) 그룹에 감사드립니다. 10 | 11 | HWP 파일에서 텍스트를 추출하는 자바 라이브러리이며 [ruby-hwp](https://github.com/cogniti/ruby-hwp) 의 자바 버전입니다. 12 | ruby-hwp의 로직을 대부분 그대로 사용하며 ruby-hwp의 문자매핑 정보(hnc2unicode.rb) 파일을 사용합니다. 13 | 14 | HWP 5.0 버전의 Compound File은 [Apache-POI의 POIFS File System](http://poi.apache.org/poifs/fileformat.html)을 사용하여 처리합니다. 15 | 16 | 17 | ## 사용방법 18 | 19 | File hwp = new File("hangul.hwp"); // 텍스트를 추출할 HWP 파일 20 | Writer writer = new StringWriter(); // 추출된 텍스트를 출력할 버퍼 21 | HwpTextExtractor.extract(hwp, writer); // 파일로부터 텍스트 추출 22 | String text = writer.toString(); // 추출된 텍스트 23 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.argo 5 | java-hwp 6 | 0.1 7 | 8 | 9 | UTF-8 10 | 1.7 11 | 12 | 13 | 14 | 15 | org.apache.poi 16 | poi 17 | 3.9 18 | 19 | 20 | 21 | org.slf4j 22 | slf4j-log4j12 23 | 1.7.5 24 | provided 25 | 26 | 27 | junit 28 | junit 29 | 4.12 30 | 31 | 32 | 33 | 34 | 35 | 36 | ${project.artifactId} 37 | 38 | 39 | maven-compiler-plugin 40 | 41 | ${project.source.version} 42 | ${project.source.version} 43 | ${project.source.encoding} 44 | 45 | 46 | 47 | maven-resources-plugin 48 | 49 | ${project.source.encoding} 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /src/main/java/com/argo/hwp/HwpTextExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright [2015] argonet.co.kr 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | /* 17 | * This software has been developed with reference to 18 | * the HWP file format open specification by Hancom, Inc. 19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3 20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다. 21 | * 22 | * 본 제품은 다음의 소스를 참조하였습니다. 23 | * https://github.com/cogniti/ruby-hwp/ 24 | */ 25 | package com.argo.hwp; 26 | 27 | import java.io.File; 28 | import java.io.FileNotFoundException; 29 | import java.io.IOException; 30 | import java.io.Writer; 31 | 32 | import org.slf4j.Logger; 33 | import org.slf4j.LoggerFactory; 34 | 35 | import com.argo.hwp.v3.HwpTextExtractorV3; 36 | import com.argo.hwp.v5.HwpTextExtractorV5; 37 | 38 | public abstract class HwpTextExtractor { 39 | protected static Logger log = LoggerFactory.getLogger(HwpTextExtractor.class); 40 | 41 | public static boolean extract(File source, Writer writer) 42 | throws FileNotFoundException, IOException { 43 | if (source == null || writer == null) 44 | throw new IllegalArgumentException(); 45 | if (!source.exists()) 46 | throw new FileNotFoundException(); 47 | 48 | // 먼저 V5 부터 시도 49 | boolean success = HwpTextExtractorV5.extractText(source, writer); 50 | 51 | // 아니라면 V3 시도 52 | if (!success) 53 | success = HwpTextExtractorV3.extractText(source, writer); 54 | 55 | return success; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/test/java/com/argo/hwp/v3/TestHwpV3Extractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright [2015] argonet.co.kr 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | /* 17 | * This software has been developed with reference to 18 | * the HWP file format open specification by Hancom, Inc. 19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3 20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다. 21 | * 22 | * 본 제품은 다음의 소스를 참조하였습니다. 23 | * https://github.com/cogniti/ruby-hwp/ 24 | */ 25 | package com.argo.hwp.v3; 26 | 27 | import java.io.File; 28 | import java.io.FileNotFoundException; 29 | import java.io.IOException; 30 | import java.io.StringWriter; 31 | import java.util.StringTokenizer; 32 | 33 | import org.junit.Test; 34 | 35 | public class TestHwpV3Extractor { 36 | private String extract(String path) throws FileNotFoundException, 37 | IOException { 38 | // File file = new File(path); 39 | File file = new File(getClass().getResource("/" + path).getFile()); 40 | // System.out.println(file.getAbsolutePath()); 41 | StringWriter writer = new StringWriter(4096); 42 | HwpTextExtractorV3.extractText(file, writer); 43 | return writer.toString(); 44 | } 45 | 46 | private String tokenize(String t) { 47 | StringWriter writer = new StringWriter(4096); 48 | StringTokenizer token = new StringTokenizer(t); 49 | while (token.hasMoreTokens()) { 50 | writer.append(token.nextToken()).append("\n"); 51 | } 52 | return writer.toString(); 53 | } 54 | 55 | private String extractIgnoreException(String path) { 56 | try { 57 | return extract(path); 58 | } catch (Exception e) { 59 | System.out.println(e.getMessage()); 60 | return null; 61 | } 62 | } 63 | 64 | @Test 65 | public void testCharConversion() { 66 | // 조합형 67 | System.out.println(Hnc2String.convert(Integer.parseInt( 68 | "1000010001100010", 2))); 69 | } 70 | 71 | @Test 72 | public void testExtractText() throws IOException, ClassNotFoundException { 73 | System.out.println(extract("v3/4e00-62ff.hwp")); 74 | System.out.println(extract("v3/han_special_char_3.0.hwp")); 75 | } 76 | } -------------------------------------------------------------------------------- /src/test/java/com/argo/hwp/v5/TestHwpV5Extractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright [2015] argonet.co.kr 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | /* 17 | * This software has been developed with reference to 18 | * the HWP file format open specification by Hancom, Inc. 19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3 20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다. 21 | * 22 | * 본 제품은 다음의 소스를 참조하였습니다. 23 | * https://github.com/cogniti/ruby-hwp/ 24 | */ 25 | package com.argo.hwp.v5; 26 | 27 | import java.io.File; 28 | import java.io.FileNotFoundException; 29 | import java.io.IOException; 30 | import java.io.StringWriter; 31 | 32 | import org.junit.Test; 33 | 34 | public class TestHwpV5Extractor { 35 | private String extract(String path) throws FileNotFoundException, 36 | IOException { 37 | // File file = new File(path); 38 | File file = new File(getClass().getResource("/"+path).getFile()); 39 | StringWriter writer = new StringWriter(4096); 40 | HwpTextExtractorV5.extractText(file, writer); 41 | return writer.toString(); 42 | } 43 | 44 | /** 45 | * 디버그.. 문자와 코드값 출력 46 | * 47 | * @param t 48 | * @return 49 | */ 50 | private String withCode(String t) { 51 | StringWriter writer = new StringWriter(4096); 52 | for (int ii = 0; ii < t.length(); ii++) { 53 | char ch = t.charAt(ii); 54 | if (ch == ' ' || ch == '\n') 55 | continue; 56 | writer.append(ch); 57 | if (ch >= 128) { 58 | writer.append("\t").append(String.format("0x%1$04x", (int) ch)); 59 | } 60 | writer.append("\n"); 61 | } 62 | return writer.toString(); 63 | } 64 | 65 | private String extractIgnoreException(String path) { 66 | try { 67 | return extract(path); 68 | } catch (Exception e) { 69 | System.out.println(e.getMessage()); 70 | return null; 71 | } 72 | } 73 | 74 | @Test 75 | public void testExtractText() throws IOException, ClassNotFoundException { 76 | // System.out.println(extract("v5/han_grammar.hwp")); 77 | // System.out.println(extract("v5/han_special_char.hwp")); 78 | 79 | System.out.println(extract("v5/test-distribute.hwp")); 80 | } 81 | } -------------------------------------------------------------------------------- /src/test/java/com/argo/hwp/v3/TestMapLoad.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright [2015] argonet.co.kr 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | /* 17 | * This software has been developed with reference to 18 | * the HWP file format open specification by Hancom, Inc. 19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3 20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다. 21 | * 22 | * 본 제품은 다음의 소스를 참조하였습니다. 23 | * https://github.com/cogniti/ruby-hwp/ 24 | */ 25 | package com.argo.hwp.v3; 26 | 27 | import java.io.BufferedReader; 28 | import java.io.IOException; 29 | import java.io.InputStream; 30 | import java.io.InputStreamReader; 31 | import java.util.regex.Matcher; 32 | import java.util.regex.Pattern; 33 | 34 | import org.junit.Assert; 35 | import org.junit.Before; 36 | import org.junit.Test; 37 | 38 | public class TestMapLoad { 39 | private static String[] map = new String[65536]; 40 | 41 | @Before 42 | public void test() throws IOException { 43 | Pattern P = Pattern 44 | .compile("0x([0-9a-f]{4})\\s*=>\\s*\\[" + "0x([0-9a-f]{4})" 45 | + "(?:,\\s*0x([0-9a-f]{4}))?" 46 | + "(?:,\\s*0x([0-9a-f]{4}))?" + "\\]", 47 | Pattern.CASE_INSENSITIVE); 48 | 49 | InputStream resource = Hnc2String.class.getClassLoader() 50 | .getResourceAsStream("hnc2unicode.rb"); 51 | int lineNumber = 0; 52 | char[] chars = new char[3]; 53 | 54 | try { 55 | BufferedReader reader = new BufferedReader(new InputStreamReader( 56 | resource, "UTF-8")); 57 | 58 | for (;;) { 59 | String line = reader.readLine(); 60 | if (line == null) 61 | break; 62 | 63 | lineNumber++; 64 | line = line.trim(); 65 | if (line.length() == 0) 66 | continue; 67 | if (line.startsWith("#")) 68 | continue; 69 | 70 | Matcher matcher = P.matcher(line); 71 | if (matcher.find()) { 72 | int code = Integer.parseInt(matcher.group(1), 16); 73 | int len; 74 | 75 | for (len = 1; len < matcher.groupCount(); len++) { 76 | String hex = matcher.group(len + 1); 77 | if (hex == null) 78 | break; 79 | 80 | chars[len - 1] = (char) Integer.parseInt(hex, 16); 81 | } 82 | 83 | map[code] = new String(chars, 0, len - 1); 84 | } else { 85 | System.out.println("[" + lineNumber + "]>>>" + line); 86 | } 87 | } 88 | } finally { 89 | try { 90 | resource.close(); 91 | } catch (IOException e) { 92 | // ignore ? 93 | } 94 | } 95 | } 96 | 97 | @Test 98 | public void test1() { 99 | Assert.assertEquals("가", map[0x8861]); 100 | Assert.assertEquals(new String(new char[] { 0xf7fe, 0xf863, 0xf8e6 }), 101 | map[(int) 0xfff8]); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/argo/hwp/v3/Hnc2String.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright [2015] argonet.co.kr 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | /* 17 | * This software has been developed with reference to 18 | * the HWP file format open specification by Hancom, Inc. 19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3 20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다. 21 | * 22 | * 본 제품은 다음의 소스를 참조하였습니다. 23 | * https://github.com/cogniti/ruby-hwp/ 24 | */ 25 | package com.argo.hwp.v3; 26 | 27 | import java.io.BufferedReader; 28 | import java.io.IOException; 29 | import java.io.InputStream; 30 | import java.io.InputStreamReader; 31 | import java.io.UnsupportedEncodingException; 32 | import java.util.regex.Matcher; 33 | import java.util.regex.Pattern; 34 | 35 | import org.slf4j.Logger; 36 | import org.slf4j.LoggerFactory; 37 | 38 | class Hnc2String { 39 | static Logger log = LoggerFactory.getLogger(Hnc2String.class); 40 | static final String[] map = new String[65536]; // max : 0xFFFF 41 | 42 | static { 43 | Pattern P = Pattern 44 | .compile("0x([0-9a-f]{4})\\s*=>\\s*\\[" + "0x([0-9a-f]{4})" 45 | + "(?:,\\s*0x([0-9a-f]{4}))?" 46 | + "(?:,\\s*0x([0-9a-f]{4}))?" + "\\]", 47 | Pattern.CASE_INSENSITIVE); 48 | 49 | InputStream resource = Hnc2String.class.getClassLoader() 50 | .getResourceAsStream("hnc2unicode.rb"); 51 | int lineNumber = 0; 52 | char[] chars = new char[3]; 53 | 54 | try { 55 | BufferedReader reader = new BufferedReader(new InputStreamReader( 56 | resource, "UTF-8")); 57 | 58 | for (;;) { 59 | String line = reader.readLine(); 60 | if (line == null) 61 | break; 62 | 63 | lineNumber++; 64 | line = line.trim(); 65 | if (line.length() == 0) 66 | continue; 67 | if (line.startsWith("#")) 68 | continue; 69 | 70 | Matcher matcher = P.matcher(line); 71 | if (matcher.find()) { 72 | int code = Integer.parseInt(matcher.group(1), 16); 73 | int len; 74 | 75 | for (len = 1; len < matcher.groupCount(); len++) { 76 | String hex = matcher.group(len + 1); 77 | if (hex == null) 78 | break; 79 | 80 | chars[len - 1] = (char) Integer.parseInt(hex, 16); 81 | } 82 | 83 | map[code] = new String(chars, 0, len - 1); 84 | } else { 85 | System.out.println("[" + lineNumber + "]>>>" + line); 86 | } 87 | } 88 | } catch (UnsupportedEncodingException e) { 89 | throw new RuntimeException(e); 90 | } catch (IOException e) { 91 | throw new RuntimeException(e); 92 | } finally { 93 | try { 94 | resource.close(); 95 | } catch (IOException e) { 96 | // ignore ? 97 | } 98 | } 99 | } 100 | 101 | static String convert(int c) { 102 | assert c >= 0 && c < 0xFFFF; 103 | 104 | return map[c]; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | 31 | 32 | ################# 33 | ## Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | 46 | [Dd]ebug/ 47 | [Rr]elease/ 48 | x64/ 49 | build/ 50 | [Bb]in/ 51 | [Oo]bj/ 52 | 53 | # MSTest test Results 54 | [Tt]est[Rr]esult*/ 55 | [Bb]uild[Ll]og.* 56 | 57 | *_i.c 58 | *_p.c 59 | *.ilk 60 | *.meta 61 | *.obj 62 | *.pch 63 | *.pdb 64 | *.pgc 65 | *.pgd 66 | *.rsp 67 | *.sbr 68 | *.tlb 69 | *.tli 70 | *.tlh 71 | *.tmp 72 | *.tmp_proj 73 | *.log 74 | *.vspscc 75 | *.vssscc 76 | .builds 77 | *.pidb 78 | *.log 79 | *.scc 80 | 81 | # Visual C++ cache files 82 | ipch/ 83 | *.aps 84 | *.ncb 85 | *.opensdf 86 | *.sdf 87 | *.cachefile 88 | 89 | # Visual Studio profiler 90 | *.psess 91 | *.vsp 92 | *.vspx 93 | 94 | # Guidance Automation Toolkit 95 | *.gpState 96 | 97 | # ReSharper is a .NET coding add-in 98 | _ReSharper*/ 99 | *.[Rr]e[Ss]harper 100 | 101 | # TeamCity is a build add-in 102 | _TeamCity* 103 | 104 | # DotCover is a Code Coverage Tool 105 | *.dotCover 106 | 107 | # NCrunch 108 | *.ncrunch* 109 | .*crunch*.local.xml 110 | 111 | # Installshield output folder 112 | [Ee]xpress/ 113 | 114 | # DocProject is a documentation generator add-in 115 | DocProject/buildhelp/ 116 | DocProject/Help/*.HxT 117 | DocProject/Help/*.HxC 118 | DocProject/Help/*.hhc 119 | DocProject/Help/*.hhk 120 | DocProject/Help/*.hhp 121 | DocProject/Help/Html2 122 | DocProject/Help/html 123 | 124 | # Click-Once directory 125 | publish/ 126 | 127 | # Publish Web Output 128 | *.Publish.xml 129 | *.pubxml 130 | 131 | # NuGet Packages Directory 132 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 133 | #packages/ 134 | 135 | # Windows Azure Build Output 136 | csx 137 | *.build.csdef 138 | 139 | # Windows Store app package directory 140 | AppPackages/ 141 | 142 | # Others 143 | sql/ 144 | *.Cache 145 | ClientBin/ 146 | [Ss]tyle[Cc]op.* 147 | ~$* 148 | *~ 149 | *.dbmdl 150 | *.[Pp]ublish.xml 151 | *.pfx 152 | *.publishsettings 153 | 154 | # RIA/Silverlight projects 155 | Generated_Code/ 156 | 157 | # Backup & report files from converting an old project file to a newer 158 | # Visual Studio version. Backup files are not needed, because we have git ;-) 159 | _UpgradeReport_Files/ 160 | Backup*/ 161 | UpgradeLog*.XML 162 | UpgradeLog*.htm 163 | 164 | # SQL Server files 165 | App_Data/*.mdf 166 | App_Data/*.ldf 167 | 168 | ############# 169 | ## Windows detritus 170 | ############# 171 | 172 | # Windows image file caches 173 | Thumbs.db 174 | ehthumbs.db 175 | 176 | # Folder config file 177 | Desktop.ini 178 | 179 | # Recycle Bin used on file shares 180 | $RECYCLE.BIN/ 181 | 182 | # Mac crap 183 | .DS_Store 184 | 185 | 186 | ############# 187 | ## Python 188 | ############# 189 | 190 | *.py[co] 191 | 192 | # Packages 193 | *.egg 194 | *.egg-info 195 | dist/ 196 | build/ 197 | eggs/ 198 | parts/ 199 | var/ 200 | sdist/ 201 | develop-eggs/ 202 | .installed.cfg 203 | 204 | # Installer logs 205 | pip-log.txt 206 | 207 | # Unit test / coverage reports 208 | .coverage 209 | .tox 210 | 211 | #Translations 212 | *.mo 213 | 214 | #Mr Developer 215 | .mr.developer.cfg 216 | /target/ 217 | -------------------------------------------------------------------------------- /src/main/java/com/argo/hwp/utils/HwpStreamReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright [2015] argonet.co.kr 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | /* 17 | * This software has been developed with reference to 18 | * the HWP file format open specification by Hancom, Inc. 19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3 20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다. 21 | * 22 | * 본 제품은 다음의 소스를 참조하였습니다. 23 | * https://github.com/cogniti/ruby-hwp/ 24 | */ 25 | package com.argo.hwp.utils; 26 | 27 | import java.io.EOFException; 28 | import java.io.IOException; 29 | import java.io.InputStream; 30 | 31 | import org.apache.poi.util.LittleEndian; 32 | import org.slf4j.Logger; 33 | import org.slf4j.LoggerFactory; 34 | 35 | public class HwpStreamReader { 36 | private Logger log = LoggerFactory.getLogger(getClass()); 37 | private InputStream input; 38 | private byte[] buf; 39 | 40 | public HwpStreamReader(InputStream inputStream) { 41 | this.input = inputStream; 42 | buf = new byte[4]; 43 | } 44 | 45 | /** 46 | * 읽을 데이터가 더 있는가? 47 | * 48 | * @return 49 | * @throws IOException 50 | */ 51 | public boolean available() throws IOException { 52 | return input.available() > 0; 53 | } 54 | 55 | /** 56 | * unsigned 1 byte 57 | * 58 | * @return 59 | * @throws IOException 60 | */ 61 | public short uint8() throws IOException { 62 | if (ensure(1) == 0) 63 | return -1; 64 | 65 | return LittleEndian.getUByte(buf); 66 | } 67 | 68 | /** 69 | * unsigned 2 byte 70 | * 71 | * @return 72 | * @throws IOException 73 | */ 74 | public int uint16() throws IOException { 75 | if (ensure(2) == 0) 76 | return -1; 77 | 78 | return LittleEndian.getUShort(buf); 79 | } 80 | 81 | /** 82 | * unsigned 2 byte array 83 | * 84 | * @param i 85 | * @return 86 | * @throws IOException 87 | */ 88 | public int[] uint16(int i) throws IOException { 89 | if (i <= 0) 90 | throw new IllegalArgumentException(); 91 | 92 | int[] uints = new int[i]; 93 | for (int ii = 0; ii < i; ii++) { 94 | if (ensure(2) == 0) 95 | throw new EOFException(); 96 | 97 | uints[ii] = LittleEndian.getUShort(buf); 98 | } 99 | 100 | return uints; 101 | } 102 | 103 | /** 104 | * unsigned 4 byte 105 | * 106 | * @return 107 | * @throws IOException 108 | */ 109 | public long uint32() throws IOException { 110 | if (ensure(4) == 0) 111 | return -1; 112 | 113 | return LittleEndian.getUInt(buf); 114 | } 115 | 116 | /** 117 | * 118 | * @param n 119 | * @return 120 | * @throws IOException 121 | */ 122 | public long skip(long n) throws IOException { 123 | return input.skip(n); 124 | } 125 | 126 | /** 127 | * n만큼 skip 하지 못할 경우 IOException 을 발생한다 128 | * 129 | * @param n 130 | * @throws IOException 131 | */ 132 | public void ensureSkip(long n) throws IOException { 133 | long skipped = skip(n); 134 | if (n != skipped) { 135 | log.error("Skip failed {} => {}", n, skipped); 136 | throw new IOException(); 137 | } 138 | } 139 | 140 | /** 141 | * count만큼 바이트를 읽는다. InflaterInputStream의 경우 한번에 count만큼 read가 안되는 경우가 있다. 142 | * 그래서 count만큼 읽을 때까지 루프를 실행한다 143 | * 144 | * @param count 145 | * @return 146 | * @throws IOException 147 | * @throws EOFException 148 | */ 149 | private int ensure(int count) throws IOException, EOFException { 150 | int total = 0; 151 | while (total < count) { 152 | // if (total > 0) { 153 | // log.warn("한번에 읽기 실패 {}/{}. 다시 읽기 시도함 {}", total, count, input); 154 | // } 155 | 156 | int read = input.read(buf, total, count - total); 157 | if (read <= 0) 158 | break; 159 | 160 | total += read; 161 | } 162 | 163 | if (total == 0) { 164 | // end 165 | } else if (total < count) { 166 | // unexpected end 167 | throw new EOFException(); 168 | } 169 | 170 | return total; 171 | } 172 | } -------------------------------------------------------------------------------- /target/surefire-reports/TEST-com.argo.hwp.v5.TestHwpV5Extractor.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | java.io.FileNotFoundException 66 | at com.argo.hwp.v5.HwpTextExtractorV5.extractText(HwpTextExtractorV5.java:67) 67 | at com.argo.hwp.v5.TestHwpV5Extractor.extract(TestHwpV5Extractor.java:16) 68 | at com.argo.hwp.v5.TestHwpV5Extractor.testObjectStreams(TestHwpV5Extractor.java:47) 69 | 70 | 71 | -------------------------------------------------------------------------------- /target/surefire-reports/TEST-com.argo.hwp.v3.TestHwpV3Extractor.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | java.io.FileNotFoundException: empty-v3.hwp (지정된 파일을 찾을 수 없습니다) 66 | at java.io.FileInputStream.open(Native Method) 67 | at java.io.FileInputStream.<init>(FileInputStream.java:138) 68 | at com.argo.hwp.v3.HwpTextExtractorV3.extractText(HwpTextExtractorV3.java:35) 69 | at com.argo.hwp.v3.TestHwpV3Extractor.extract(TestHwpV3Extractor.java:18) 70 | at com.argo.hwp.v3.TestHwpV3Extractor.testObjectStreams(TestHwpV3Extractor.java:76) 71 | 72 | 73 | -------------------------------------------------------------------------------- /src/main/java/com/argo/hwp/v3/HwpTextExtractorV3.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright [2015] argonet.co.kr 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | /* 17 | * This software has been developed with reference to 18 | * the HWP file format open specification by Hancom, Inc. 19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3 20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다. 21 | * 22 | * 본 제품은 다음의 소스를 참조하였습니다. 23 | * https://github.com/cogniti/ruby-hwp/ 24 | */ 25 | package com.argo.hwp.v3; 26 | 27 | import java.io.File; 28 | import java.io.FileInputStream; 29 | import java.io.IOException; 30 | import java.io.InputStream; 31 | import java.io.Writer; 32 | import java.util.Arrays; 33 | import java.util.zip.Inflater; 34 | import java.util.zip.InflaterInputStream; 35 | 36 | import org.slf4j.Logger; 37 | import org.slf4j.LoggerFactory; 38 | 39 | import com.argo.hwp.utils.HwpStreamReader; 40 | 41 | public abstract class HwpTextExtractorV3 { 42 | private static Logger log = LoggerFactory 43 | .getLogger(HwpTextExtractorV3.class); 44 | 45 | // 1byte 문자들.. 46 | private static final byte[] HWP_V3_SIGNATURE = ("HWP Document File V3.00" 47 | + " \u001A\u0001\u0002\u0003\u0004\u0005").getBytes(); 48 | 49 | public static boolean extractText(File source, Writer writer) 50 | throws IOException { 51 | InputStream input = new FileInputStream(source); 52 | 53 | try { 54 | // 한글V3 시그니처 확인 55 | try { 56 | byte[] buf = new byte[HWP_V3_SIGNATURE.length]; 57 | int read = input.read(buf); 58 | if (read < HWP_V3_SIGNATURE.length) 59 | return false; 60 | 61 | // 시그니처 확인 62 | if (!Arrays.equals(HWP_V3_SIGNATURE, buf)) 63 | return false; 64 | } catch (IOException e) { 65 | log.warn("파일정보 확인 중 오류. HWP 포맷이 아닌 것으로 간주함", e); 66 | return false; 67 | } 68 | 69 | extractText(input, writer); 70 | 71 | return true; 72 | } finally { 73 | try { 74 | // from javadoc. If this file has an associated channel then the 75 | // channel is closed as well. 76 | input.close(); 77 | } catch (IOException e) { 78 | log.warn("exception while file.close", e); 79 | } 80 | } 81 | } 82 | 83 | private static void extractText(InputStream inputStream, Writer writer) 84 | throws IOException { 85 | // 시그니처를 위해서 30바이트 읽은 상태 86 | 87 | HwpStreamReader input = new HwpStreamReader(inputStream); 88 | 89 | // 문서 정보 p.72 90 | 91 | // 암호 걸린 파일 확인 92 | input.ensureSkip(96); 93 | int t = input.uint16(); 94 | if (t != 0) 95 | throw new IOException("암호화된 문서는 해석할 수 없습니다"); 96 | 97 | // 압축 확인 98 | input.ensureSkip(26); // 124 99 | boolean compressed = input.uint8() != 0; 100 | log.debug("압축 확인 : {}", compressed); 101 | 102 | // 정보 블럭 길이 103 | input.ensureSkip(1); 104 | int blockSize = input.uint16(); 105 | 106 | // 문서 요약 건너뛰기 107 | input.ensureSkip(1008); 108 | // 정보 블럭 건너뛰기 109 | input.ensureSkip(blockSize); 110 | 111 | // 압축 풀기 112 | if (compressed) { 113 | log.info("본문 압축 해제"); 114 | input = new HwpStreamReader(new InflaterInputStream(inputStream, 115 | new Inflater(true))); 116 | } 117 | 118 | // p.73 글꼴이름 건너뛰기 119 | for (int ii = 0; ii < 7; ii++) 120 | input.ensureSkip(input.uint16() * 40); 121 | 122 | // p.74 스타일 건너뛰기 123 | input.ensureSkip(input.uint16() * (20 + 31 + 187)); 124 | 125 | // <문단 리스트> ::= <문단>+ <빈문단> 126 | // int paraCount = 0; 127 | while (input.available()) { 128 | // paraCount++; 129 | // log.debug("문단 {}", paraCount); 130 | if (!writeParaText(input, writer)) 131 | break; 132 | } 133 | } 134 | 135 | private static boolean writeParaText(HwpStreamReader input, Writer writer) 136 | throws IOException { 137 | // # 문단 정보 138 | short prev_paragraph_shape = input.uint8(); 139 | int n_chars = input.uint16(); 140 | int n_lines = input.uint16(); 141 | short char_shape_included = input.uint8(); 142 | 143 | StringBuilder buf = new StringBuilder(); 144 | 145 | // p.77 기타 플래그부터.. 146 | input.ensureSkip(1 + 4 + 1 + 31); 147 | // # 여기까지 43 bytes 148 | if (prev_paragraph_shape == 0 && n_chars > 0) 149 | input.ensureSkip(187); 150 | 151 | // # 빈문단이면 false 반환 152 | if (n_chars == 0) { 153 | // log.debug("빈문단"); 154 | return false; 155 | } 156 | 157 | // # 줄 정보 158 | input.ensureSkip(n_lines * 14); 159 | 160 | // # 글자 모양 정보 p.78 161 | if (char_shape_included != 0) { 162 | for (int ii = 0; ii < n_chars; ii++) { 163 | short flag = input.uint8(); 164 | if (flag != 1) 165 | input.ensureSkip(31); 166 | } 167 | } 168 | 169 | log.trace("n_chars = {}", n_chars); 170 | 171 | // # 글자들 172 | int n_chars_read = 0; 173 | 174 | while (n_chars_read < n_chars) { 175 | int c = input.uint16(); // # 2바이트씩 읽는다. 176 | // log.debug("구분 : {}", Integer.toHexString(c)); 177 | n_chars_read++; 178 | 179 | switch (c) { 180 | case 5: // 필드코드(덧말, 계산식, 환경정보, 누름틀) 181 | { 182 | long len = input.uint32(); // 정보 길이 183 | input.uint16(); // 5 184 | n_chars_read += 3; 185 | input.ensureSkip(len); 186 | } 187 | break; 188 | case 6: // 책갈피 189 | n_chars_read += 3; 190 | input.ensureSkip(6 + 34); 191 | break; 192 | case 9: // tab 193 | n_chars_read += 3; 194 | input.ensureSkip(6); 195 | writer.write('\t'); 196 | break; 197 | case 10: // 표 198 | n_chars_read += 3; 199 | input.ensureSkip(6); 200 | 201 | // # 테이블 식별 정보 84 바이트 202 | input.ensureSkip(80); 203 | int n_cells = input.uint16(); 204 | input.ensureSkip(2); 205 | input.ensureSkip(27 * n_cells); 206 | 207 | // # <셀 문단 리스트>+ 208 | for (int ii = 0; ii < n_cells; ii++) { 209 | // # <셀 문단 리스트> ::= <셀 문단>+ <빈문단> 210 | // log.debug("셀 {}/{}", ii, n_cells); 211 | while (writeParaText(input, writer)) 212 | ; 213 | } 214 | // # <캡션 문단 리스트> ::= <캡션 문단>+ <빈문단> 215 | while (writeParaText(input, writer)) 216 | ; 217 | break; 218 | 219 | case 11: // 그림 220 | { 221 | n_chars_read += 3; 222 | input.ensureSkip(6); 223 | long len = input.uint32(); 224 | input.ensureSkip(344); 225 | input.ensureSkip(len); 226 | // # <캡션 문단 리스트> ::= <캡션 문단>+ <빈문단> 227 | while (writeParaText(input, writer)) 228 | ; 229 | } 230 | break; 231 | case 13: // # 글자들 끝 232 | writer.write('\n'); 233 | break; 234 | case 16: // # 머리말/꼬리말 235 | n_chars_read += 3; 236 | input.ensureSkip(6); 237 | input.ensureSkip(10); 238 | 239 | // # <문단 리스트> ::= <문단>+ <빈문단> 240 | while (writeParaText(input, writer)) 241 | ; 242 | break; 243 | 244 | case 17: // # 각주/미주 245 | n_chars_read += 3; 246 | input.ensureSkip(6); 247 | // # 각주/미주 정보 건너 뛰기 248 | input.ensureSkip(14); 249 | while (writeParaText(input, writer)) 250 | ; 251 | break; 252 | case 18: 253 | case 19: 254 | case 20: 255 | case 21: 256 | n_chars_read += 3; 257 | input.ensureSkip(6); 258 | break; 259 | case 23: // # 글자 겹침 260 | n_chars_read += 4; 261 | input.ensureSkip(8); 262 | break; 263 | case 24: 264 | case 25: 265 | n_chars_read += 2; 266 | input.ensureSkip(4); 267 | break; 268 | case 28: // # 개요 모양/번호 269 | n_chars_read += 31; 270 | input.ensureSkip(62); 271 | break; 272 | case 30: 273 | case 31: 274 | n_chars_read += 1; 275 | input.ensureSkip(2); 276 | break; 277 | default: 278 | if (c >= 0x0020 && c <= 0xffff) {// # hnc code range 279 | String s = Hnc2String.convert(c); 280 | if (s == null) { 281 | log.warn("매핑 문자 없음 {}", Integer.toHexString(c)); 282 | writer.write(unknown(c)); 283 | } else { 284 | buf.append(s); 285 | writer.write(s); 286 | } 287 | } else { 288 | log.error("특수 문자 ? : {}", Integer.toHexString(c)); 289 | // throw new NotImplementedException(); 290 | } 291 | } 292 | } 293 | 294 | log.trace(">>> {}", buf.toString()); 295 | 296 | return true; 297 | } 298 | 299 | private static String unknown(int c) { 300 | return String.format("?+0x%1$04x", c); 301 | } 302 | } 303 | -------------------------------------------------------------------------------- /LICENSE-2.0.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /src/main/java/com/argo/hwp/v5/HwpTextExtractorV5.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright [2015] argonet.co.kr 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | /* 17 | * This software has been developed with reference to 18 | * the HWP file format open specification by Hancom, Inc. 19 | * http://www.hancom.co.kr/userofficedata.userofficedataList.do?menuFlag=3 20 | * 한글과컴퓨터의 한/글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다. 21 | * 22 | * 본 제품은 다음의 소스를 참조하였습니다. 23 | * https://github.com/cogniti/ruby-hwp/ 24 | */ 25 | package com.argo.hwp.v5; 26 | 27 | import java.io.File; 28 | import java.io.FileNotFoundException; 29 | import java.io.IOException; 30 | import java.io.InputStream; 31 | import java.io.Writer; 32 | import java.security.InvalidKeyException; 33 | import java.security.Key; 34 | import java.security.NoSuchAlgorithmException; 35 | import java.util.Arrays; 36 | import java.util.Iterator; 37 | import java.util.zip.Inflater; 38 | import java.util.zip.InflaterInputStream; 39 | 40 | import javax.crypto.Cipher; 41 | import javax.crypto.CipherInputStream; 42 | import javax.crypto.NoSuchPaddingException; 43 | import javax.crypto.spec.SecretKeySpec; 44 | 45 | import org.apache.poi.poifs.filesystem.DirectoryEntry; 46 | import org.apache.poi.poifs.filesystem.DirectoryNode; 47 | import org.apache.poi.poifs.filesystem.DocumentEntry; 48 | import org.apache.poi.poifs.filesystem.DocumentInputStream; 49 | import org.apache.poi.poifs.filesystem.Entry; 50 | import org.apache.poi.poifs.filesystem.NDocumentInputStream; 51 | import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; 52 | import org.apache.poi.util.LittleEndian; 53 | import org.slf4j.Logger; 54 | import org.slf4j.LoggerFactory; 55 | 56 | import com.argo.hwp.utils.HwpStreamReader; 57 | 58 | public abstract class HwpTextExtractorV5 { 59 | protected static Logger log = LoggerFactory 60 | .getLogger(HwpTextExtractorV5.class); 61 | 62 | private static final byte[] HWP_V5_SIGNATURE = "HWP Document File" 63 | .getBytes(); 64 | 65 | private static final int[] HWP_CONTROL_CHARS = new int[] { 0, 10, 13, 24, 66 | 25, 26, 27, 28, 29, 30, 31 }; 67 | private static final int[] HWP_INLINE_CHARS = new int[] { 4, 5, 6, 7, 8, 9, 68 | 19, 20 }; 69 | private static final int[] HWP_EXTENDED_CHARS = new int[] { 1, 2, 3, 11, 70 | 12, 14, 15, 16, 17, 18, 21, 22, 23 }; 71 | 72 | private static final int HWPTAG_BEGIN = 0x010; 73 | 74 | /** 75 | * HWP 파일에서 텍스트 추출 76 | * 77 | * @param source 78 | * @param writer 79 | * @return 80 | * @throws FileNotFoundException 81 | * @throws IOException 82 | */ 83 | public static boolean extractText(File source, Writer writer) 84 | throws FileNotFoundException, IOException { 85 | if (source == null) 86 | throw new IllegalArgumentException(); 87 | if (!source.exists()) 88 | throw new FileNotFoundException(); 89 | 90 | NPOIFSFileSystem fs = null; 91 | try { 92 | FileHeader header; 93 | 94 | // HWP Document가 맞는지 확인한다 95 | try { 96 | // 우선은 Compound File 97 | fs = new NPOIFSFileSystem(source); 98 | header = getHeader(fs); 99 | } catch (IOException e) { 100 | log.warn("파일정보 확인 중 오류. HWP 포맷이 아닌 것으로 간주함", e); 101 | return false; 102 | } 103 | 104 | if (header == null) 105 | return false; 106 | 107 | // 여기까지 왔다면 HWP 문서가 맞다고 본다 108 | // 이제부터의 IOException 은 HWP 읽는 중 오류이다. 109 | 110 | // 배포용 문서.. BodyText 가 아닌 ViewText에 Section 이 존재 111 | // https://groups.google.com/forum/#!msg/hwp-foss/d2KL2ypR89Q/lCTkebPcIYYJ 112 | if (header.viewtext) { 113 | extractViewText(header, fs, writer); 114 | } else { 115 | extractBodyText(header, fs, writer); 116 | } 117 | 118 | return true; 119 | } finally { 120 | if (fs != null) { 121 | try { 122 | fs.close(); 123 | } catch (IOException e) { 124 | log.warn("Exception", e); 125 | } 126 | } 127 | } 128 | } 129 | 130 | /** 131 | * HWP의 FileHeader 추출 132 | * 133 | * @param fs 134 | * @return 135 | * @throws IOException 136 | */ 137 | private static FileHeader getHeader(NPOIFSFileSystem fs) throws IOException { 138 | DirectoryNode root = fs.getRoot(); 139 | 140 | // 파일인식정보 p.18 141 | 142 | // FileHeader 존재 여부 143 | Entry headerEntry = root.getEntry("FileHeader"); 144 | if (!headerEntry.isDocumentEntry()) 145 | return null; 146 | 147 | // 시그니처 확인 148 | byte[] header = new byte[256]; // FileHeader 길이는 256 149 | DocumentInputStream headerStream = new DocumentInputStream( 150 | (DocumentEntry) headerEntry); 151 | try { 152 | int read = headerStream.read(header); 153 | if (read != 256 154 | || !Arrays.equals(HWP_V5_SIGNATURE, Arrays.copyOfRange( 155 | header, 0, HWP_V5_SIGNATURE.length))) 156 | return null; 157 | } finally { 158 | headerStream.close(); 159 | } 160 | 161 | FileHeader fileHeader = new FileHeader(); 162 | 163 | // 버전. debug 164 | fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt( 165 | header, 32)); 166 | long flags = LittleEndian.getUInt(header, 36); 167 | log.debug("Flags={}", Long.toBinaryString(flags).replace(' ', '0')); 168 | 169 | fileHeader.compressed = (flags & 0x01) == 0x01; 170 | fileHeader.encrypted = (flags & 0x02) == 0x02; 171 | fileHeader.viewtext = (flags & 0x04) == 0x04; 172 | 173 | return fileHeader; 174 | } 175 | 176 | /** 177 | * 텍스트 추출 178 | * 179 | * @param writer 180 | * @param source 181 | * 182 | * @return 183 | * @throws IOException 184 | */ 185 | private static void extractBodyText(FileHeader header, NPOIFSFileSystem fs, 186 | Writer writer) throws IOException { 187 | DirectoryNode root = fs.getRoot(); 188 | 189 | // BodyText 읽기 190 | Entry bodyText = root.getEntry("BodyText"); 191 | if (bodyText == null || !bodyText.isDirectoryEntry()) 192 | throw new IOException("Invalid BodyText"); 193 | 194 | Iterator iterator = ((DirectoryEntry) bodyText).getEntries(); 195 | while (iterator.hasNext()) { 196 | Entry entry = iterator.next(); 197 | if (entry.getName().startsWith("Section") 198 | && entry instanceof DocumentEntry) { 199 | log.debug("extract {}", entry.getName()); 200 | 201 | InputStream input = new NDocumentInputStream( 202 | (DocumentEntry) entry); 203 | try { 204 | if (header.compressed) 205 | input = new InflaterInputStream(input, new Inflater( 206 | true)); 207 | 208 | HwpStreamReader sectionStream = new HwpStreamReader(input); 209 | 210 | extractText(sectionStream, writer); 211 | } finally { 212 | // 닫을 필요는 없을 것이다 213 | try { 214 | input.close(); 215 | } catch (IOException e) { 216 | log.error("있을 수 없는 일?", e); 217 | } 218 | } 219 | } else { 220 | log.warn("알수없는 Entry '{}'({})", entry.getName(), entry); 221 | } 222 | } 223 | } 224 | 225 | /** 226 | * 텍스트 추출 227 | * 228 | * @param writer 229 | * @param source 230 | * 231 | * @return 232 | * @throws IOException 233 | */ 234 | private static void extractViewText(FileHeader header, NPOIFSFileSystem fs, 235 | Writer writer) throws IOException { 236 | DirectoryNode root = fs.getRoot(); 237 | 238 | // BodyText 읽기 239 | Entry bodyText = root.getEntry("ViewText"); 240 | if (bodyText == null || !bodyText.isDirectoryEntry()) 241 | throw new IOException("Invalid ViewText"); 242 | 243 | Iterator iterator = ((DirectoryEntry) bodyText).getEntries(); 244 | while (iterator.hasNext()) { 245 | Entry entry = iterator.next(); 246 | if (entry.getName().startsWith("Section") 247 | && entry instanceof DocumentEntry) { 248 | log.debug("extract {}", entry.getName()); 249 | 250 | InputStream input = new NDocumentInputStream( 251 | (DocumentEntry) entry); 252 | 253 | // FIXME 섹션마다 키가 있는가? 254 | Key key = readKey(input); 255 | try { 256 | input = createDecryptStream(input, key); 257 | if (header.compressed) 258 | input = new InflaterInputStream(input, new Inflater( 259 | true)); 260 | 261 | HwpStreamReader sectionStream = new HwpStreamReader(input); 262 | extractText(sectionStream, writer); 263 | } catch (InvalidKeyException e) { 264 | throw new IOException(e); 265 | } catch (NoSuchAlgorithmException e) { 266 | throw new IOException(e); 267 | } catch (NoSuchPaddingException e) { 268 | throw new IOException(e); 269 | } finally { 270 | // 닫을 필요는 없을 것이다 271 | try { 272 | input.close(); 273 | } catch (IOException e) { 274 | log.error("있을 수 없는 일?", e); 275 | } 276 | } 277 | } else { 278 | log.warn("알수없는 Entry '{}'({})", entry.getName(), entry); 279 | } 280 | } 281 | } 282 | 283 | // https://groups.google.com/forum/#!msg/hwp-foss/d2KL2ypR89Q/lCTkebPcIYYJ 284 | private static class SRand { 285 | private int random_seed; 286 | 287 | private SRand(int seed) { 288 | random_seed = seed; 289 | } 290 | 291 | private int rand() { 292 | random_seed = (random_seed * 214013 + 2531011) & 0xFFFFFFFF; 293 | return (random_seed >> 16) & 0x7FFF; 294 | } 295 | } 296 | 297 | private static Key readKey(InputStream input) throws IOException { 298 | byte[] data = new byte[260]; 299 | 300 | input.read(data, 0, 4); // TAG, 301 | // HWPTAG_DISTRIBUTE_DOC_DATA 확인 302 | // long recordHeader = LittleEndian.getUInt(data); 303 | // log.debug("TAG: {}", recordHeader & 0x3FF); 304 | // log.debug("LEVEL: {}", (recordHeader >> 10) & 0x3FF); 305 | // log.debug("SIZE: {}", (recordHeader >> 20) & 0xFFF); 306 | 307 | // https://groups.google.com/forum/#!msg/hwp-foss/d2KL2ypR89Q/lCTkebPcIYYJ 308 | 309 | input.read(data, 0, 256); 310 | 311 | SRand srand = new SRand(LittleEndian.getInt(data)); 312 | byte xor = 0; 313 | for (int i = 0, n = 0; i < 256; i++, n--) { 314 | if (n == 0) { 315 | xor = (byte) (srand.rand() & 0xFF); 316 | n = (int) ((srand.rand() & 0xF) + 1); 317 | } 318 | if (i >= 4) { 319 | data[i] = (byte) ((data[i]) ^ (xor)); 320 | } 321 | } 322 | 323 | int offset = 4 + (data[0] & 0xF); 324 | byte[] key = Arrays.copyOfRange(data, offset, offset + 16); 325 | 326 | SecretKeySpec secretKey = new SecretKeySpec(key, "AES"); 327 | return secretKey; 328 | } 329 | 330 | public static InputStream createDecryptStream(InputStream input, Key key) 331 | throws IOException, NoSuchAlgorithmException, 332 | NoSuchPaddingException, InvalidKeyException { 333 | Cipher cipher = null; 334 | 335 | cipher = Cipher.getInstance("AES/ECB/NoPadding"); 336 | cipher.init(Cipher.DECRYPT_MODE, key); 337 | 338 | return new CipherInputStream(input, cipher); 339 | } 340 | 341 | /** 342 | * Section 스트림에서 문자를 추출 343 | * 344 | * @param sectionStream 345 | * @param writer 346 | * @throws IOException 347 | */ 348 | private static void extractText(HwpStreamReader sectionStream, Writer writer) 349 | throws IOException { 350 | StringBuffer buf = new StringBuffer(1024); 351 | TagInfo tag = new TagInfo(); 352 | 353 | while (true) { 354 | if (!readTag(sectionStream, tag)) 355 | break; 356 | 357 | buf.setLength(0); 358 | if (HWPTAG_BEGIN + 50 == tag.id) { 359 | writeParaHeader(sectionStream, tag.length, buf); 360 | } else if (HWPTAG_BEGIN + 51 == tag.id) { 361 | if (tag.length % 2 != 0) 362 | throw new IOException("Invalid block size"); 363 | 364 | writeParaText(sectionStream, tag.length, buf); 365 | 366 | if (buf.length() > 0) // 줄바꿈 추가? 367 | writer.append(buf.toString()).append('\n'); 368 | } else { 369 | sectionStream.ensureSkip(tag.length); 370 | } 371 | 372 | if (buf.length() > 0) { 373 | log.debug("TAG[{}]({}):{} [{}]", new Object[] { tag.id, 374 | tag.level, tag.length, buf }); 375 | } 376 | } 377 | } 378 | 379 | private static void writeParaHeader(HwpStreamReader sectionStream, 380 | long length, StringBuffer buf) throws IOException { 381 | // log.debug("text={}", sectionStream.uint32()); 382 | // log.debug("control mask={}", sectionStream.uint32()); 383 | // log.debug("문단모양아이디참조값={}", sectionStream.uint16()); 384 | // log.debug("문단스타일아이디참조값={}", sectionStream.uint8()); 385 | // log.debug("단나누기종류={}", sectionStream.uint8()); 386 | // log.debug("글자모양정보수={}", sectionStream.uint16()); 387 | // log.debug("range tag정보수={}", sectionStream.uint16()); 388 | // log.debug("각줄에 대한 align정보수={}", sectionStream.uint16()); 389 | // log.debug("문단 Instance ID={}", sectionStream.uint32()); 390 | // sectionStream.ensureSkip(2); 391 | 392 | sectionStream.ensureSkip(length); 393 | } 394 | 395 | /** 396 | * HWPTAG_PARA_TEXT 의 문자스트림을 문자열로 변환 397 | * 398 | * @param sectionStream 399 | * @param datasize 400 | * @param buf 401 | * @throws IOException 402 | */ 403 | private static void writeParaText(HwpStreamReader sectionStream, 404 | long datasize, StringBuffer buf) throws IOException { 405 | int[] chars = sectionStream.uint16((int) (datasize / 2)); 406 | 407 | for (int index = 0; index < chars.length; index++) { 408 | int ch = chars[index]; 409 | if (Arrays.binarySearch(HWP_INLINE_CHARS, ch) >= 0) { 410 | if (ch == 9) { 411 | buf.append('\t'); 412 | } 413 | index += 7; 414 | } else if (Arrays.binarySearch(HWP_EXTENDED_CHARS, ch) >= 0) { 415 | index += 7; 416 | } else if (Arrays.binarySearch(HWP_CONTROL_CHARS, ch) >= 0) { 417 | buf.append(' '); 418 | } else { 419 | buf.append((char) ch); 420 | } 421 | } 422 | } 423 | 424 | private static boolean readTag(HwpStreamReader sectionStream, TagInfo tag) 425 | throws IOException { 426 | // p.24 427 | 428 | long recordHeader = sectionStream.uint32(); 429 | if (recordHeader == -1) 430 | return false; 431 | 432 | // log.debug("Record Header={} [{}]", recordHeader, 433 | // Long.toHexString(recordHeader)); 434 | 435 | tag.id = recordHeader & 0x3FF; 436 | tag.level = (recordHeader >> 10) & 0x3FF; 437 | tag.length = (recordHeader >> 20) & 0xFFF; 438 | 439 | // 확장 데이터 레코드 p.24 440 | if (tag.length == 0xFFF) 441 | tag.length = sectionStream.uint32(); 442 | 443 | return true; 444 | } 445 | 446 | static class FileHeader { 447 | HwpVersion version; 448 | boolean compressed; // bit 0 449 | boolean encrypted; // bit 1 450 | boolean viewtext; // bit 2 451 | } 452 | 453 | static class TagInfo { 454 | long id; 455 | long level; 456 | long length; 457 | } 458 | 459 | static class HwpVersion { 460 | int m; 461 | int n; 462 | int p; 463 | int r; 464 | 465 | public String toString() { 466 | return String.format("%d.%d.%d.%d", m, n, p, r); 467 | } 468 | 469 | public static HwpVersion parseVersion(long longVersion) { 470 | HwpVersion version = new HwpVersion(); 471 | version.m = (int) ((longVersion & 0xFF000000L) >> 24); 472 | version.n = (int) ((longVersion & 0x00FF0000L) >> 16); 473 | version.p = (int) ((longVersion & 0x0000FF00L) >> 8); 474 | version.r = (int) ((longVersion & 0x000000FFL)); 475 | return version; 476 | } 477 | } 478 | 479 | } 480 | --------------------------------------------------------------------------------