├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt └── textbox ├── docx.py ├── hwp.py ├── pptx.py └── xlsx.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 charsyam 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # textbox 2 | 1. Get text from documents(HWP, DOCX, PPTX, XLSX) 3 | 2. not perfect 4 | 3. python hwp.py filename 5 | 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | olefile==0.44 2 | -------------------------------------------------------------------------------- /textbox/docx.py: -------------------------------------------------------------------------------- 1 | try: 2 | from xml.etree.cElementTree import XML 3 | except ImportError: 4 | from xml.etree.ElementTree import XML 5 | import zipfile 6 | 7 | 8 | class DOCXExtractor(object): 9 | NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}' 10 | PARA = NAMESPACE + 'p' 11 | TEXT = NAMESPACE + 't' 12 | 13 | def __init__(self, filename): 14 | self.text = self._get_text(filename) 15 | 16 | def get_text(self): 17 | return self.text 18 | 19 | def _get_text(self, filename): 20 | document = zipfile.ZipFile(filename) 21 | xml_content = document.read('word/document.xml') 22 | document.close() 23 | tree = XML(xml_content) 24 | 25 | paragraphs = [] 26 | for paragraph in tree.getiterator(self.PARA): 27 | texts = [node.text 28 | for node in paragraph.getiterator(self.TEXT) 29 | if node.text] 30 | if texts: 31 | paragraphs.append(''.join(texts)) 32 | 33 | return '\n\n'.join(paragraphs) 34 | 35 | 36 | def get_text(filename): 37 | docx = DOCXExtractor(filename) 38 | print(docx.get_text()) 39 | 40 | 41 | if __name__ == '__main__': 42 | import sys 43 | get_text(sys.argv[1]) 44 | -------------------------------------------------------------------------------- /textbox/hwp.py: -------------------------------------------------------------------------------- 1 | import olefile 2 | import zlib 3 | import struct 4 | import binascii 5 | 6 | 7 | class HWPExtractor(object): 8 | FILE_HEADER_SECTION = "FileHeader" 9 | HWP_SUMMARY_SECTION = "\x05HwpSummaryInformation" 10 | SECTION_NAME_LENGTH = len("Section") 11 | BODYTEXT_SECTION = "BodyText" 12 | HWP_TEXT_TAGS = [67] 13 | 14 | def __init__(self, filename): 15 | self._ole = self.load(filename) 16 | self._dirs = self._ole.listdir() 17 | 18 | self._valid = self.is_valid(self._dirs) 19 | if (self._valid == False): 20 | raise Exception("Not Valid HwpFile") 21 | 22 | self._compressed = self.is_compressed(self._ole) 23 | self.text = self._get_text() 24 | 25 | def load(self, filename): 26 | return olefile.OleFileIO(filename) 27 | 28 | def is_valid(self, dirs): 29 | if [self.FILE_HEADER_SECTION] not in dirs: 30 | return False 31 | 32 | return [self.HWP_SUMMARY_SECTION] in dirs 33 | 34 | def is_compressed(self, ole): 35 | header = self._ole.openstream("FileHeader") 36 | header_data = header.read() 37 | return (header_data[36] & 1) == 1 38 | 39 | def get_body_sections(self, dirs): 40 | m = [] 41 | for d in dirs: 42 | if d[0] == self.BODYTEXT_SECTION: 43 | m.append(int(d[1][self.SECTION_NAME_LENGTH:])) 44 | 45 | return ["BodyText/Section"+str(x) for x in sorted(m)] 46 | 47 | def get_text(self): 48 | return self.text 49 | 50 | def _get_text(self): 51 | sections = self.get_body_sections(self._dirs) 52 | text = "" 53 | for section in sections: 54 | text += self.get_text_from_section(section) 55 | text += "\n" 56 | 57 | self.text = text 58 | return self.text 59 | 60 | def get_text_from_section(self, section): 61 | bodytext = self._ole.openstream(section) 62 | data = bodytext.read() 63 | 64 | unpacked_data = zlib.decompress(data, -15) if self.is_compressed else data 65 | size = len(unpacked_data) 66 | 67 | i = 0 68 | 69 | text = "" 70 | while i < size: 71 | header = struct.unpack_from("> 10) & 0x3ff 74 | rec_len = (header >> 20) & 0xfff 75 | 76 | if rec_type in self.HWP_TEXT_TAGS: 77 | rec_data = unpacked_data[i+4:i+4+rec_len] 78 | text += rec_data.decode('utf-16') 79 | text += "\n" 80 | 81 | i += 4 + rec_len 82 | 83 | return text 84 | 85 | 86 | def get_text(filename): 87 | hwp = HWPExtractor(filename) 88 | print(hwp.get_text()) 89 | 90 | 91 | if __name__ == "__main__": 92 | import sys 93 | get_text(sys.argv[1]) 94 | -------------------------------------------------------------------------------- /textbox/pptx.py: -------------------------------------------------------------------------------- 1 | try: 2 | from xml.etree.cElementTree import XML 3 | except ImportError: 4 | from xml.etree.ElementTree import XML 5 | import zipfile 6 | 7 | 8 | class PPTXExtractor(object): 9 | NAMESPACE = '{http://schemas.openxmlformats.org/drawingml/2006/main}' 10 | TEXT = NAMESPACE + 't' 11 | 12 | SLIDE_PREFIX = "ppt/slides/slide" 13 | SLIDE_PREFIX_LENGTH = len(SLIDE_PREFIX) 14 | 15 | def __init__(self, filename): 16 | self.text = self._get_text(filename) 17 | 18 | def get_text(self): 19 | return self.text 20 | 21 | def _get_text(self, filename): 22 | document = zipfile.ZipFile(filename) 23 | texts = [self._get_text_from_slide(document, slide)\ 24 | for slide in self._get_slide_names(document.namelist())] 25 | document.close() 26 | return '\n'.join(texts) 27 | 28 | def _get_slide_names(self, dirs): 29 | m = [] 30 | for d in dirs: 31 | if d.startswith(self.SLIDE_PREFIX): 32 | m.append(int(d[self.SLIDE_PREFIX_LENGTH:-4])) 33 | 34 | return [self.SLIDE_PREFIX+str(x)+".xml" for x in sorted(m)] 35 | 36 | def _get_text_from_slide(self, document, slide): 37 | xml_content = document.read(slide) 38 | tree = XML(xml_content) 39 | 40 | texts = [node.text 41 | for node in tree.getiterator(self.TEXT) 42 | if node.text] 43 | 44 | return '\n'.join(texts) 45 | 46 | 47 | def get_text(filename): 48 | pptx = PPTXExtractor(filename) 49 | print(pptx.get_text()) 50 | 51 | 52 | if __name__ == '__main__': 53 | import sys 54 | get_text(sys.argv[1]) 55 | -------------------------------------------------------------------------------- /textbox/xlsx.py: -------------------------------------------------------------------------------- 1 | try: 2 | from xml.etree.cElementTree import XML 3 | except ImportError: 4 | from xml.etree.ElementTree import XML 5 | import zipfile 6 | 7 | 8 | class XLSXExtractor(object): 9 | NAMESPACE = "{http://schemas.openxmlformats.org/spreadsheetml/2006/main}" 10 | TEXT = NAMESPACE + 't' 11 | 12 | def __init__(self, filename): 13 | self.text = self._get_text(filename) 14 | 15 | def get_text(self): 16 | return self.text 17 | 18 | def _get_text(self, filename): 19 | document = zipfile.ZipFile(filename) 20 | xml_content = document.read('xl/sharedStrings.xml') 21 | document.close() 22 | tree = XML(xml_content) 23 | 24 | texts = [node.text 25 | for node in tree.getiterator(self.TEXT) 26 | if node.text] 27 | 28 | return '\n\n'.join(texts) 29 | 30 | 31 | def get_text(filename): 32 | xlsx = XLSXExtractor(filename) 33 | print(xlsx.get_text()) 34 | 35 | 36 | if __name__ == '__main__': 37 | import sys 38 | get_text(sys.argv[1]) 39 | --------------------------------------------------------------------------------