├── .gitignore ├── .idea ├── dictionaries │ └── svolkov.xml └── vagrant.xml ├── LICENSE.md ├── README.rst ├── decoder.rar ├── lingvoreader ├── __init__.py ├── articleheading.py ├── bitstream.py ├── decoder.py ├── lentable.py ├── lsdfile.py ├── lsdreader.py └── tools.py ├── setup.py ├── test ├── __init__.py ├── test_arg_parser.py ├── test_bitStream.py └── test_tools.py └── testdata └── .gitignore /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | CPP/ 3 | build/ 4 | dist/ 5 | lingvoreader.egg-info/ 6 | *.swp 7 | *.pyc 8 | .DS_Store 9 | venv/ 10 | venv2/ 11 | .python-version 12 | -------------------------------------------------------------------------------- /.idea/dictionaries/svolkov.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.idea/vagrant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 sv99@inbox.ru 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | lsdreader 2 | ========= 3 | 4 | python analog ABBYY DSL Compiler - decompile LSD dictionary to DSL. 5 | 6 | Support Lingvo 11, 12, X3, X5 and X6 lsd files format. 7 | 8 | Based on C++ source decoder.zip from ru-board user `tr7 `_ 9 | `Source on github `_ 10 | 11 | Russian decompiling team on `ru-board.ru `_ 12 | 13 | Project Page: https://github.com/sv99/lsdreader 14 | 15 | Install 16 | ======= 17 | 18 | Install from pip:: 19 | 20 | pip install setuptools -U 21 | pip install lingvoreader 22 | 23 | Install development version:: 24 | 25 | git clone 26 | pip install -e . 
27 | 28 | make tar.gz for PyPi:: 29 | 30 | pip install twine 31 | python setup.py sdist 32 | twine upload dist/lingvoreader-x.x.x.tar.gz 33 | 34 | Usage 35 | ----- 36 | :: 37 | 38 | lsdreader [-h] [--header] (-i INPUT | -a) [-o OUTDIR] [-c] [-v] [--version] 39 | 40 | Decode Lingvo 11, 12, X3, X5 and X6 lsd dictionary to dsl 41 | 42 | optional arguments: 43 | -h, --help show this help message and exit 44 | --header show header info and exit 45 | -i INPUT, --input INPUT 46 | Dictionary to decode 47 | -a, --all All dictionary in current directory 48 | -o OUTDIR, --outdir OUTDIR 49 | Output directory 50 | -c, --codecs print supported languages and their codes 51 | -v, --verbose 52 | --version show program's version number and exit 53 | 54 | Lingvo versions 55 | =============== 56 | 57 | :: 58 | 59 | 11 2005 supported 60 | 12 2006 supported 61 | x3 2008 supported 62 | x5 2011 supported 63 | x6 2014 (current) supported 64 | 65 | -------------------------------------------------------------------------------- /decoder.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sv99/lsdreader/45f610c7a011a6e54d7cb521ab61eb6277ae65d5/decoder.rar -------------------------------------------------------------------------------- /lingvoreader/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'sv99' 4 | # The version as used in the setup.py 5 | __version__ = "0.2.15" 6 | 7 | 8 | class LsdError(Exception): 9 | pass 10 | -------------------------------------------------------------------------------- /lingvoreader/articleheading.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals, print_function, division, absolute_import 3 | from lingvoreader.tools import int2unichr 4 | 5 | __author__ = 'svolkov' 6 | 7 | 8 | # 
info from ArticleHeading.cpp
# Not implemented yet!!
#
# Unsorted parts
#
# Each character in an LSD heading has a flag indicating if it's 'sorted' or 'unsorted'.
# The unsorted characters aren't used in sorting and searching. Basically they become
# transparent for the indexing mechanism and only visible while presented to the user
# together with an article. Let's annotate each character with an 's' or 'u' to indicate
# this distinction.
#
# In the DSL format curly brackets are used to denote the sortedness of a heading's part.
#
# The heading 'Some{thing}' would be encoded as
#
# Something
# ssssuuuuu
#
# And its name would be just 'Some', while its extended name would be 'Something'
# There can be any number of unsorted characters in a heading. At any position.
#
# To correctly decompile an LSD heading (where each character is either sorted or unsorted)
# to a DSL heading, it is required to group adjacent characters in groups by their
# sortedness, and then enclose each such group in parentheses. This process is rather
# straightforward. The special case to look for is the slash (/), which is encoded
# as an unsorted character, but is used to escape the subsequent character (either sorted
# or unsorted) and thus requires special handling.
#
# Optional parts (variants)
#
# The DSL format has a mechanism for generating several headings with optional parts
# out of a single heading. By enclosing a part of a heading in parentheses, it is possible
# to generate all possible combinations of the heading.
#
# The heading 'aa(bb)cc' would be expanded into two different headings
#
# aa{(bb)}cc and aa{(}bb{)}cc
# ss uuuu ss ss u ss u ss
#
# For more such parts, more headings would be generated. It is 2 headings for 1 part,
# 4 headings for 2 parts, 9 for 3 and so on.
49 | # 50 | # There exists a pattern which helps to combine two headings produced with the variant 51 | # encoding. The pattern looks like this 'u(' denotes an unsorted parenthesis): 52 | # 53 | # (A) ??? u( u*n u) ??? -> ??? s( s|u*n s) ??? 54 | # (B) ??? u( s|u*n u) ??? 55 | # 56 | # An optional part can contain an unsorted part as well, so this pattern allow for a 57 | # combination of sorted/unsorted parts in an optional part (denoted as 's|u*n'). 58 | # 59 | # This pattern becomes insufficient in the presence of spaces, adjacent to the 60 | # parentheses. Here are two examples of such headings (the upper case is used 61 | # to denote spaces): 62 | # 63 | # abc (123) 64 | # 65 | # abc{ (123)} and abc {(}123{)} 66 | # sss Uuuuuu sssS u sss u 67 | # 68 | # bbb (123) z 69 | # 70 | # bbb {(123) }z and bbb {(}123{)} z 71 | # sssS uuuuuU s sssS u sss u Ss 72 | # 73 | # Another two patterns account for these special cases. 74 | # 75 | # (C) ??? Uu( u*n u) -> ??? Ss( s|u*n s) 76 | # (D) ??? Su( s|u*n u) 77 | # 78 | # (E) ??? Su( u*n u)U ??? -> ??? Ss( s|u*n s)S ??? 79 | # (F) ??? Su( s|u*n u)S ??? 80 | # 81 | # The headings that can't be collapsed using one of these three rules are left as is. 
#
#
class CharInfo:
    """One heading character together with its sorted/escaped flags."""

    def __init__(self):
        self.sorted = False   # char participates in sorting/searching
        self.escaped = False  # char was escaped with '/'
        self.chr = ""         # the character itself

    # add equal test


class Heading:
    """A decoded heading: visible (sorted) text plus unsorted extensions."""

    def __init__(self):
        self.text = ""        # sorted, indexable part of the heading
        self.extensions = []  # (insert position, unsorted text) pairs

    @property
    def ext_text(self):
        """Extended DSL text with unsorted parts re-inserted in {braces}.

        A single backslash extension is inserted verbatim (no braces),
        mirroring the DSL escape convention.
        """
        if not self.extensions:
            return self.text
        result = self.text
        shift = 0  # grows by 2 for every pair of braces already inserted
        for pos, part in self.extensions:
            braced = part != u"\\"
            if braced:
                part = u"{" + part + u"}"
            result = result[:pos + shift] + part + result[pos + shift:]
            if braced:
                shift += 2
        return result


class ArticleHeading:
    """Heading(s) of a single article — an article may carry several."""

    def __init__(self):
        # an article may have several headings
        self.headings = []
        self.reference = 0
        # reference of the next article; used by the x6 format for decoding
        self.next_reference = 0

    def read(self, lsd_decoder, bstr, known_prefix):
        """Decode one heading from the stream.

        Returns the decoded text, which becomes the known prefix for the
        next heading on the same cache page.
        """
        h = Heading()
        prefix_len = lsd_decoder.decode_prefix_len()
        postfix_len = lsd_decoder.decode_postfix_len()
        h.text = known_prefix[:prefix_len] + lsd_decoder.decode_heading(postfix_len)
        self.reference = lsd_decoder.read_reference2()
        if bstr.read_bit():
            # additional non-visible formatting items in the header;
            # items with consecutive indices are joined into one chunk
            ext_count = bstr.read_bits(8)
            if ext_count != 0:
                chunk = ""
                first_idx = prev_idx = 0
                for _ in range(ext_count):
                    idx = bstr.read_bits(8)
                    ch = int2unichr(bstr.read_bits(16))
                    if chunk == "":
                        chunk = ch
                        first_idx = prev_idx = idx
                    elif prev_idx + 1 == idx:
                        # continue the current run of consecutive indices
                        chunk += ch
                        prev_idx = idx
                    else:
                        # flush the finished run and start a new one
                        h.extensions.append((first_idx, chunk))
                        chunk = ch
                        first_idx = prev_idx = idx
                # flush the last run
                h.extensions.append((first_idx, chunk))
        self.headings.append(h)
        return h.text

| 158 | def merge(self, heading): 159 | for h in heading.headings: 160 | # print("merge: %s" % h.ext_text) 161 | self.headings.append(h) 162 | 163 | def get_first(self): 164 | return self.headings[0] 165 | 166 | def get_first_ext_text(self): 167 | return self.headings[0].ext_text 168 | 169 | @property 170 | def simple(self): 171 | return len(self.headings) == 1 172 | 173 | def dump(self): 174 | print("%s: %d - %d" % (self.get_first().text, self.reference, self.next_reference)) 175 | 176 | 177 | class ArticleHeadingList(list): 178 | def __init__(self): 179 | super(ArticleHeadingList, self).__init__() 180 | self.appended = 0 181 | self.references = {} 182 | 183 | def append(self, item): 184 | self.appended += 1 185 | # append if item.reference not exists 186 | ref = item.reference 187 | if ref in self.references: 188 | self.references[ref].merge(item) 189 | else: 190 | self.references[ref] = item 191 | # if not first then update next_reference in the previous item 192 | if len(self) > 0: 193 | self[-1].next_reference = ref 194 | super(ArticleHeadingList, self).append(item) 195 | -------------------------------------------------------------------------------- /lingvoreader/bitstream.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import struct 4 | 5 | from lingvoreader.tools import int2unichr 6 | from lingvoreader import LsdError 7 | 8 | __author__ = 'sv99' 9 | 10 | 11 | def reverse32(int_value): 12 | res, = struct.unpack('>L', struct.pack('H', struct.pack('H', self.record, self.pos) 85 | self.pos += 2 86 | self.in_byte_pos = 0 87 | return res 88 | 89 | def read_int(self): 90 | res, = struct.unpack_from('>L', self.record, self.pos) 91 | self.pos += 4 92 | self.in_byte_pos = 0 93 | return res 94 | 95 | def read_symbols(self): 96 | size = self.read_bits(32) 97 | bits_per_symbol = self.read_bits(8) 98 | res = [] 99 | for i in range(size): 100 | 
res.append(self.read_bits(bits_per_symbol)) 101 | return res 102 | 103 | def read_bit(self): 104 | byte, = struct.unpack_from('B', self.record, self.pos) 105 | byte >>= (7 - self.in_byte_pos) 106 | if self.in_byte_pos == 7: 107 | self.pos += 1 108 | self.in_byte_pos = 0 109 | else: 110 | self.in_byte_pos += 1 111 | return byte & 1 112 | 113 | def read_bits(self, count): 114 | return self.read_bits_o(count) 115 | 116 | # stupid direct implementation 117 | def read_bits_s(self, count): 118 | if count > 32: 119 | raise LsdError("Many bits for read: %d" % count) 120 | res = 0 121 | for i in range(count): 122 | res <<= 1 123 | res += self.read_bit() 124 | return res 125 | 126 | def read_bits_o(self, count): 127 | if count > 32: 128 | raise LsdError("Many bits for read: %d" % count) 129 | masks = (1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF) 130 | count_bytes = (count + self.in_byte_pos) // 8 131 | if count + self.in_byte_pos - 8 * count_bytes > 0: 132 | count_bytes += 1 133 | # if in single raw byt 134 | if count_bytes == 1: 135 | if (self.in_byte_pos + count) < 8: 136 | byte = self.record[self.pos] 137 | byte >>= 7 - self.in_byte_pos - count + 1 138 | byte &= masks[count - 1] 139 | self.in_byte_pos += count 140 | return byte 141 | # many raw bytes 142 | # inBitPos 143 | # | count = 13 | 144 | # 01234567 | 01234567 | 0123456 145 | # 146 | # inBitPos = 5 count_first = 3 count_las = 2 147 | # 148 | p = self.pos 149 | count_last = (count + self.in_byte_pos) % 8 150 | count_first = 8 - self.in_byte_pos 151 | byte_first = self.record[p] 152 | p += 1 153 | byte_first &= masks[count_first - 1] 154 | res = byte_first 155 | # full bytes 156 | full_bytes = (count - count_first) // 8 157 | if full_bytes > 0: 158 | for i in range(full_bytes): 159 | res <<= 8 160 | res += self.record[p] 161 | p += 1 162 | # last byte 163 | if count_last > 0: 164 | byte = self.record[p] 165 | byte >>= 8 - count_last 166 | res <<= count_last 167 | res += byte 168 | self.in_byte_pos = count_last 169 | 
self.pos = p 170 | return res 171 | 172 | def read_some(self, length): 173 | if length == 1: 174 | return self.read_byte() 175 | elif length == 2: 176 | return self.read_word() 177 | elif length == 4: 178 | return self.read_int() 179 | else: 180 | raise LsdError('Allow read byte, word and int length: %i' % length) 181 | 182 | def read_unicode(self, size, big_endian=True): 183 | res = "" 184 | for i in range(size): 185 | ch = self.read_some(2) 186 | if not big_endian: 187 | ch = reverse16(ch) 188 | res += int2unichr(ch) 189 | # res += unichr(self.read_some(2), big_endian)) 190 | return res 191 | 192 | 193 | -------------------------------------------------------------------------------- /lingvoreader/decoder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import (division, absolute_import, print_function) 4 | from lingvoreader import tools 5 | from lingvoreader.lentable import LenTable 6 | from lingvoreader.tools import int2unichr 7 | 8 | __author__ = 'sv99' 9 | 10 | 11 | class Decoder: 12 | def __init__(self, bstr): 13 | self.bstr = bstr 14 | self.prefix = "" 15 | self._article_symbols = None 16 | self._heading_symbols = None 17 | self._ltArticles = None 18 | self._ltHeadings = None 19 | self._ltPrefixLengths = None 20 | self._ltPostfixLengths = None 21 | self._huffman1Number = 0 22 | self._huffman2Number = 0 23 | self._readed = False 24 | 25 | @property 26 | def readed(self): 27 | return self._readed 28 | 29 | def decode_prefix_len(self): 30 | return self._ltPrefixLengths.decode() 31 | 32 | def decode_postfix_len(self): 33 | return self._ltPostfixLengths.decode() 34 | 35 | def read_reference1(self): 36 | return self.read_reference(self._huffman1Number) 37 | 38 | def read_reference2(self): 39 | return self.read_reference(self._huffman2Number) 40 | 41 | def read_reference(self, huffman_number): 42 | reference = "" 43 | code = self.bstr.read_bits(2) 44 | if 
code == 3: 45 | self.bstr.read_bits(32) 46 | return reference 47 | 48 | size = tools.bit_length(huffman_number) 49 | assert(size >= 2) 50 | return (code << (size - 2)) | self.bstr.read_bits(size - 2) 51 | 52 | def decode_heading(self, size): 53 | res = "" 54 | for i in range(size): 55 | sym_idx = self._ltHeadings.decode() 56 | sym = self._heading_symbols[sym_idx] 57 | assert(sym <= 0xffff) # LingvoEngine:2EAB84E8 58 | res += int2unichr(sym) 59 | return res 60 | 61 | def decode_article(self, size): 62 | """ 63 | decode User and Abrv dict 64 | """ 65 | res = "" 66 | while len(res) < size: 67 | sym_idx = self._ltArticles.decode() 68 | sym = self._article_symbols[sym_idx] 69 | if sym >= 0x10000: 70 | if sym >= 0x10040: 71 | start_idx = self.bstr.read_bits(tools.bit_length(size)) 72 | s = sym - 0x1003d 73 | res += res[start_idx:start_idx + s] 74 | else: 75 | prefix_idx = self.bstr.read_bits(tools.bit_length(len(self.prefix))) 76 | s = sym - 0xfffd 77 | res += self.prefix[prefix_idx:prefix_idx + s] 78 | else: 79 | res += int2unichr(sym) 80 | return res 81 | 82 | # need seek(bstr.header.dictionary_encoder_offset) befor call! 
83 | def read(self): 84 | return 85 | 86 | def dump(self): 87 | print("Decoder: %s" % self.__class__.__name__) 88 | if self.readed: 89 | print(" ArticleSymbols: %d" % len(self._article_symbols)) 90 | print(" HeadingSymbols: %d" % len(self._heading_symbols)) 91 | self._ltArticles.dump("Articles") 92 | self._ltHeadings.dump("Headings") 93 | self._ltPrefixLengths.dump("PrefixLengths") 94 | self._ltPostfixLengths.dump("PostfixLengths") 95 | 96 | 97 | class UserDictionaryDecoder(Decoder): 98 | def __init__(self, bstr): 99 | Decoder.__init__(self, bstr) 100 | return 101 | 102 | def read(self): 103 | self.prefix = self.bstr.read_unicode(self.bstr.read_int()) 104 | self._article_symbols = self.bstr.read_symbols() 105 | self._heading_symbols = self.bstr.read_symbols() 106 | self._ltArticles = LenTable(self.bstr) 107 | self._ltHeadings = LenTable(self.bstr) 108 | 109 | self._ltPrefixLengths = LenTable(self.bstr) 110 | self._ltPostfixLengths = LenTable(self.bstr) 111 | 112 | self._huffman1Number = self.bstr.read_bits(32) 113 | self._huffman2Number = self.bstr.read_bits(32) 114 | self._readed = True 115 | return 116 | 117 | 118 | class SystemDictionaryDecoder13(Decoder): 119 | def __init__(self, bstr): 120 | Decoder.__init__(self, bstr) 121 | return 122 | 123 | def read(self): 124 | self.prefix = self.bstr.read_unicode(self.bstr.read_int()) 125 | self._article_symbols = self.bstr.read_symbols() 126 | self._heading_symbols = self.bstr.read_symbols() 127 | self._ltArticles = LenTable(self.bstr) 128 | self._ltHeadings = LenTable(self.bstr) 129 | 130 | self._ltPrefixLengths = LenTable(self.bstr) 131 | self._ltPostfixLengths = LenTable(self.bstr) 132 | 133 | self._huffman1Number = self.bstr.read_bits(32) 134 | self._huffman2Number = self.bstr.read_bits(32) 135 | self._readed = True 136 | return 137 | 138 | def decode_article(self, size): 139 | res = "" 140 | while len(res) < size: 141 | sym_idx = self._ltArticles.decode() 142 | sym = self._article_symbols[sym_idx] 143 | if sym <= 
0x80: 144 | if sym <= 0x3F: 145 | start_pref_idx = self.bstr.read_bits(tools.bit_length(len(self.prefix))) 146 | s = sym + 3 147 | res += self.prefix[start_pref_idx:start_pref_idx + s] 148 | else: 149 | start_idx = self.bstr.read_bits(tools.bit_length(size)) 150 | s = sym - 0x3d 151 | res += res[start_idx:start_idx + s] 152 | else: 153 | res += int2unichr(sym - 0x80) 154 | return res 155 | 156 | 157 | class SystemDictionaryDecoder14(SystemDictionaryDecoder13): 158 | def __init__(self, bstr): 159 | SystemDictionaryDecoder13.__init__(self, bstr) 160 | return 161 | 162 | def read(self): 163 | prefix_len = self.bstr.read_int() 164 | self.prefix = self.bstr.read_unicode(prefix_len) 165 | self._article_symbols = self.bstr.read_symbols() 166 | self._heading_symbols = self.bstr.read_symbols() 167 | self._ltArticles = LenTable(self.bstr) 168 | self._ltHeadings = LenTable(self.bstr) 169 | 170 | self._ltPostfixLengths = LenTable(self.bstr) 171 | self._dummy = self.bstr.read_bits(32) 172 | self._ltPrefixLengths = LenTable(self.bstr) 173 | 174 | self._huffman1Number = self.bstr.read_bits(32) 175 | self._huffman2Number = self.bstr.read_bits(32) 176 | self._readed = True 177 | return 178 | 179 | # def decode_article(self, size): 180 | # res = "" 181 | # while len(res) < size: 182 | # sym_idx = self._ltArticles.decode() 183 | # sym = self._article_symbols[sym_idx] 184 | # if sym <= 0x80: 185 | # if sym <= 0x3F: 186 | # start_pref_idx = self.bstr.read_bits(tools.bit_length(len(self.prefix))) 187 | # s = sym + 3 188 | # res += self.prefix[start_pref_idx:start_pref_idx + s] 189 | # else: 190 | # start_idx = self.bstr.read_bits(tools.bit_length(size)) 191 | # s = sym - 0x3d 192 | # res += res[start_idx:start_idx + s] 193 | # else: 194 | # res += unichr(sym - 0x80) 195 | # return res 196 | 197 | 198 | class AbbreviationDictionaryDecoder(Decoder): 199 | def __init__(self, bstr): 200 | Decoder.__init__(self, bstr) 201 | return 202 | 203 | def read(self): 204 | self.prefix = 
self.read_xored_prefix(self.bstr.read_int()) 205 | self._article_symbols = self.read_xored_symbols() 206 | self._heading_symbols = self.read_xored_symbols() 207 | self._ltArticles = LenTable(self.bstr) 208 | self._ltHeadings = LenTable(self.bstr) 209 | 210 | self._ltPrefixLengths = LenTable(self.bstr) 211 | self._ltPostfixLengths = LenTable(self.bstr) 212 | 213 | self._huffman1Number = self.bstr.read_bits(32) 214 | self._huffman2Number = self.bstr.read_bits(32) 215 | self._readed = True 216 | return 217 | 218 | def read_xored_symbols(self): 219 | size = self.bstr.read_bits(32) 220 | bits_per_symbol = self.bstr.read_bits(8) 221 | res = [] 222 | for i in range(size): 223 | res.append(self.bstr.read_bits(bits_per_symbol) ^ 0x1325) 224 | return res 225 | 226 | def read_xored_prefix(self, size): 227 | res = "" 228 | for i in range(size): 229 | res += int2unichr(self.bstr.read_bits(16) ^ 0x879A) 230 | return res 231 | 232 | 233 | class SystemDictionaryDecoder15(Decoder): 234 | def __init__(self, bstr): 235 | Decoder.__init__(self, bstr) 236 | return 237 | 238 | def read(self): 239 | # self.bstr = XoredBitStream(self.bstr) 240 | # self.decode() 241 | 242 | prefix_len = self.bstr.read_some(4) 243 | self.prefix = self.bstr.read_unicode(prefix_len) 244 | self._article_symbols = self.bstr.read_symbols() 245 | self._heading_symbols = self.bstr.read_symbols() 246 | self._ltArticles = LenTable(self.bstr) 247 | self._ltHeadings = LenTable(self.bstr) 248 | 249 | self._ltPostfixLengths = LenTable(self.bstr) 250 | self._dummy = self.bstr.read_bits(32) 251 | self._ltPrefixLengths = LenTable(self.bstr) 252 | 253 | self._huffman1Number = self.bstr.read_bits(32) 254 | self._huffman2Number = self.bstr.read_bits(32) 255 | self._readed = True 256 | return 257 | 258 | def decode_article(self, size): 259 | res = "" 260 | while len(res) < size: 261 | sym_idx = self._ltArticles.decode() 262 | sym = self._article_symbols[sym_idx] 263 | if sym <= 0x80: 264 | if sym <= 0x3F: 265 | start_pref_idx 
= self.bstr.read_bits(tools.bit_length(len(self.prefix))) 266 | s = sym + 3 267 | res += self.prefix[start_pref_idx:start_pref_idx + s] 268 | else: 269 | start_idx = self.bstr.read_bits(tools.bit_length(size)) 270 | s = sym - 0x3d 271 | res += res[start_idx:start_idx + s] 272 | else: 273 | res += int2unichr(sym - 0x80) 274 | return res 275 | -------------------------------------------------------------------------------- /lingvoreader/lentable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import (print_function) 4 | from . import tools 5 | 6 | __author__ = 'sv99' 7 | 8 | 9 | class SymInfo: 10 | def __init__(self, sym_idx, size, code): 11 | self.sym_idx = sym_idx 12 | self.size = size 13 | self.code = code 14 | 15 | 16 | class HuffmanNode: 17 | def __init__(self, left, right, parent, weight): 18 | self.left = left 19 | self.right = right 20 | self.parent = parent 21 | self.weight = weight 22 | 23 | 24 | class LenTable: 25 | def __init__(self, bstr): 26 | self.bstr = bstr 27 | self._count = self.bstr.read_bits(32) 28 | self._bits_per_len = self.bstr.read_bits(8) 29 | self._idx_bit_size = tools.bit_length(self._count) 30 | 31 | self.symidx2nodeidx = [-1 for _ in range(self._count)] 32 | self.nodes = [HuffmanNode(0, 0, -1, -1) for _ in range(self._count - 1)] 33 | root_idx = len(self.nodes) - 1 34 | self.next_node_position = 0 35 | for i in range(self._count): 36 | symidx = self.bstr.read_bits(self._idx_bit_size) 37 | length = self.bstr.read_bits(self._bits_per_len) 38 | self.place_sym_idx(symidx, root_idx, length) 39 | 40 | def place_sym_idx(self, sym_idx, node_idx, size): 41 | assert size > 0 42 | if size == 1: # time to place 43 | if self.nodes[node_idx].left == 0: 44 | self.nodes[node_idx].left = -1 - sym_idx 45 | self.symidx2nodeidx[sym_idx] = node_idx 46 | return True 47 | 48 | if self.nodes[node_idx].right == 0: 49 | self.nodes[node_idx].right = -1 - 
sym_idx 50 | self.symidx2nodeidx[sym_idx] = node_idx 51 | return True 52 | 53 | return False 54 | 55 | if self.nodes[node_idx].left == 0: 56 | self.nodes[self.next_node_position] = HuffmanNode(0, 0, node_idx, -1) 57 | self.next_node_position += 1 58 | self.nodes[node_idx].left = self.next_node_position 59 | 60 | if self.nodes[node_idx].left > 0: 61 | if self.place_sym_idx(sym_idx, self.nodes[node_idx].left - 1, size - 1): 62 | return True 63 | 64 | if self.nodes[node_idx].right == 0: 65 | self.nodes[self.next_node_position] = HuffmanNode(0, 0, node_idx, -1) 66 | self.next_node_position += 1 67 | self.nodes[node_idx].right = self.next_node_position 68 | 69 | if self.nodes[node_idx].right > 0: 70 | if self.place_sym_idx(sym_idx, self.nodes[node_idx].right - 1, size - 1): 71 | return True 72 | 73 | return False 74 | 75 | def decode(self): 76 | node = self.nodes[-1] 77 | length = 0 78 | while True: 79 | length += 1 80 | bit = self.bstr.read_bit() 81 | if bit: # right 82 | if node.right < 0: # leaf 83 | sym_idx = -1 - node.right 84 | return sym_idx 85 | node = self.nodes[node.right - 1] 86 | else: # left 87 | if node.left < 0: # leaf 88 | sym_idx = -1 - node.left 89 | return sym_idx 90 | 91 | node = self.nodes[node.left - 1] 92 | 93 | def dump(self, name): 94 | print("LenTable: %s" % name) 95 | print(" Count: %s" % self._count) 96 | print(" bitsPerLen: %s" % self._bits_per_len) 97 | print(" IdxBitSize: %s" % self._idx_bit_size) 98 | -------------------------------------------------------------------------------- /lingvoreader/lsdfile.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals, print_function, division, absolute_import 3 | 4 | import codecs 5 | import os 6 | 7 | from lingvoreader import LsdError 8 | from lingvoreader import tools, decoder 9 | from lingvoreader.articleheading import ArticleHeading, ArticleHeadingList 10 | from lingvoreader.bitstream import reverse32, 
reverse16, BitStream 11 | 12 | __author__ = 'sv99' 13 | 14 | 15 | class OverlayReader: 16 | def __init__(self, bstr, offset): 17 | self.bstr = bstr 18 | 19 | if self.bstr.seek(offset): 20 | self._entriesCount = self.bstr.read_bits(4) 21 | else: 22 | self._entriesCount = 0 23 | 24 | def dump(self): 25 | print("Overlay:") 26 | print(" EntriesCount: %s" % self._entriesCount) 27 | 28 | 29 | class Header: 30 | def __init__(self, bstr): 31 | self.bstr = bstr 32 | self.magic = self.bstr.read(8).decode().replace('\x00', '') 33 | self.version = reverse32(self.bstr.read_int()) 34 | self.unk = reverse32(self.bstr.read_int()) 35 | self.checksum = reverse32(self.bstr.read_int()) 36 | self.entries_count = reverse32(self.bstr.read_int()) 37 | self.annotation_offset = reverse32(self.bstr.read_int()) 38 | self.dictionary_encoder_offset = reverse32(self.bstr.read_int()) 39 | self.articles_offset = reverse32(self.bstr.read_int()) 40 | self.pages_offset = reverse32(self.bstr.read_int()) 41 | self.unk1 = reverse32(self.bstr.read_int()) 42 | self.unk2 = reverse16(self.bstr.read_word()) 43 | self.unk3 = reverse16(self.bstr.read_word()) 44 | self.source_language = reverse16(self.bstr.read_word()) 45 | self.target_language = reverse16(self.bstr.read_word()) 46 | return 47 | 48 | @property 49 | def hi_version(self): 50 | return self.version >> 16 51 | 52 | def dump(self): 53 | print("Header:") 54 | print(" Magic: %s" % self.magic) 55 | print(" Checksume: %s" % hex(self.checksum)) 56 | print(" Version: %s (%s)" % (hex(self.hi_version), hex(self.version))) 57 | print(" Entries: %d" % self.entries_count) 58 | print(" AnnotationOffset: %s" % hex(self.annotation_offset)) 59 | print(" DictionaryEncoderOffset: %s" % hex(self.dictionary_encoder_offset)) 60 | print(" ArticlesOffset: %s" % hex(self.articles_offset)) 61 | print(" Pages start: %s" % hex(self.pages_offset)) 62 | print(" Source language: %d %s" % (self.source_language, tools.lang_map[self.source_language])) 63 | print(" Target language: 
# lsdfile.py (continued)
class CachePage:
    """One 512-byte B-tree page of the headings cache."""

    def __init__(self, bstr):
        self.bstr = bstr
        self.is_leaf = bstr.read_bit()
        self.number = bstr.read_bits(16)
        self.prev = bstr.read_bits(16)
        self.parent = bstr.read_bits(16)
        self.next = bstr.read_bits(16)
        self.headings_count = bstr.read_bits(16)
        # page fields are bit-packed; realign to a byte boundary
        self.bstr.to_nearest_byte()


# substitution table for the x6 block XOR scheme (key chaining)
xor_pad = (
    0x9C, 0xDF, 0x9B, 0xF3, 0xBE, 0x3A, 0x83, 0xD8,
    0xC9, 0xF5, 0x50, 0x98, 0x35, 0x4E, 0x7F, 0xBB,
    0x89, 0xC7, 0xE9, 0x6B, 0xC4, 0xC8, 0x4F, 0x85,
    0x1A, 0x10, 0x43, 0x66, 0x65, 0x57, 0x55, 0x54,
    0xB4, 0xFF, 0xD7, 0x17, 0x06, 0x31, 0xAC, 0x4B,
    0x42, 0x53, 0x5A, 0x46, 0xC5, 0xF8, 0xCA, 0x5E,
    0x18, 0x38, 0x5D, 0x91, 0xAA, 0xA5, 0x58, 0x23,
    0x67, 0xBF, 0x30, 0x3C, 0x8C, 0xCF, 0xD5, 0xA8,
    0x20, 0xEE, 0x0B, 0x8E, 0xA6, 0x5B, 0x49, 0x3F,
    0xC0, 0xF4, 0x13, 0x80, 0xCB, 0x7B, 0xA7, 0x1D,
    0x81, 0x8B, 0x01, 0xDD, 0xE3, 0x4C, 0x9A, 0xCE,
    0x40, 0x72, 0xDE, 0x0F, 0x26, 0xBD, 0x3B, 0xA3,
    0x05, 0x37, 0xE1, 0x5F, 0x9D, 0x1E, 0xCD, 0x69,
    0x6E, 0xAB, 0x6D, 0x6C, 0xC3, 0x71, 0x1F, 0xA9,
    0x84, 0x63, 0x45, 0x76, 0x25, 0x70, 0xD6, 0x8F,
    0xFD, 0x04, 0x2E, 0x2A, 0x22, 0xF0, 0xB8, 0xF2,
    0xB6, 0xD0, 0xDA, 0x62, 0x75, 0xB7, 0x77, 0x34,
    0xA2, 0x41, 0xB9, 0xB1, 0x74, 0xE4, 0x95, 0x1B,
    0x3E, 0xE7, 0x00, 0xBC, 0x93, 0x7A, 0xE8, 0x86,
    0x59, 0xA0, 0x92, 0x11, 0xF7, 0xFE, 0x03, 0x2F,
    0x28, 0xFA, 0x27, 0x02, 0xE5, 0x39, 0x21, 0x96,
    0x33, 0xD1, 0xB2, 0x7C, 0xB3, 0x73, 0xC6, 0xE6,
    0xA1, 0x52, 0xFB, 0xD4, 0x9E, 0xB0, 0xE2, 0x16,
    0x97, 0x08, 0xF6, 0x4A, 0x78, 0x29, 0x14, 0x12,
    0x4D, 0xC1, 0x99, 0xBA, 0x0D, 0x3D, 0xEF, 0x19,
    0xAF, 0xF9, 0x6F, 0x0A, 0x6A, 0x47, 0x36, 0x82,
    0x07, 0x9F, 0x7D, 0xA4, 0xEA, 0x44, 0x09, 0x5C,
    0x8D, 0xCC, 0x87, 0x88, 0x2D, 0x8A, 0xEB, 0x2C,
    0xB5, 0xE0, 0x32, 0xAD, 0xD3, 0x61, 0xAE, 0x15,
    0x60, 0xF1, 0x48, 0x0E, 0x7E, 0x94, 0x51, 0x0C,
    0xEC, 0xDB, 0xD2, 0x64, 0xDC, 0xFC, 0xC2, 0x56,
    0x24, 0xED, 0x2B, 0xD9, 0x1C, 0x68, 0x90, 0x79
)


class LsdFile:
    """Top-level LSD container: reads the header and selects a decoder."""

    def __init__(self, dict_file, verbose=False):
        self.filename = dict_file
        self._readed = False
        self._parsed = False
        self.verbose = verbose
        with open(dict_file, 'rb') as fp:
            self.bstr = BitStream(bytearray(fp.read()))

        self.overlay = None
        self.headings = ArticleHeadingList()
        self.dict = []
        self.header = Header(self.bstr)
        # check magic
        if self.header.magic != u'LingVo':
            raise LsdError('Allow only Lsd "LingVo" ident: %s' % repr(self.header.magic))

        # pick a decoder implementation by format version
        self.decoder = None
        hi_version = self.header.hi_version
        version = self.header.version
        if hi_version == 0x11:    # lingvo 11 dictionary: 0x11001
            self.decoder = decoder.UserDictionaryDecoder(self.bstr)
        elif hi_version == 0x12:  # lingvo 12 dictionary: 0x12001
            self.decoder = decoder.UserDictionaryDecoder(self.bstr)
        elif hi_version == 0x13:  # x3 dictionary: 0x131001 and 0x132001 if pages count > 1000
            self.decoder = decoder.SystemDictionaryDecoder13(self.bstr)
        elif hi_version == 0x14:  # x5 dictionary
            if version == 0x142001:    # user dictionaries
                self.decoder = decoder.UserDictionaryDecoder(self.bstr)
            elif version == 0x141004:  # system dictionaries
                self.decoder = decoder.SystemDictionaryDecoder14(self.bstr)
            elif version == 0x145001:  # abbreviation dictionaries
                self.decoder = decoder.AbbreviationDictionaryDecoder(self.bstr)
        elif hi_version == 0x15:  # x6 dictionary
            if version == 0x152001:    # user dictionaries
                self.decoder = decoder.UserDictionaryDecoder(self.bstr)
            elif version == 0x151005:  # system dictionaries
                # system x6 data is additionally XORed block-wise
                self.xor_block_x6(self.header.dictionary_encoder_offset,
                                  self.header.articles_offset)
                self.decoder = decoder.SystemDictionaryDecoder14(self.bstr)
            elif version == 0x155001:  # abbreviation dictionaries
                self.decoder = decoder.AbbreviationDictionaryDecoder(self.bstr)

        if self.decoder is None:
            self.dump()
            print("Not supported dictionary version: %s" % hex(self.header.version))
            # NOTE(review): terminates the process instead of raising; the
            # author deliberately left the raise commented out below.
            exit(1)
            # raise LsdError("Not supported dict version %s" % hex(self.header.version))

        name_len = self.bstr.read_some(1)
        self.name = self.bstr.read_unicode(name_len, False)
        self.first_heading = self.bstr.read_unicode(self.bstr.read_byte(), False)
        self.last_heading = self.bstr.read_unicode(self.bstr.read_byte(), False)
        capitals_len = reverse32(self.bstr.read_int())
        self.capitals = self.bstr.read_unicode(capitals_len, False)

        # icon v12+
        if self.header.version > 0x120000:
            self.icon_size = reverse16(self.bstr.read_word())
            self.icon = self.bstr.read(self.icon_size)
        else:
            self.icon_size = 0
            self.icon = None

        if self.header.version > 0x140000:
            self.header_checksum = reverse32(self.bstr.read_int())
        else:
            self.header_checksum = 0

        if self.header.version > 0x120000:
            self.pages_end = reverse32(self.bstr.read_int())
            self.overlay_data = reverse32(self.bstr.read_int())
        else:
            self.pages_end = self.bstr.length
            self.overlay_data = self.bstr.length  # no overlay

        if self.header.version > 0x140000:
            self.dummy1 = reverse32(self.bstr.read_int())
            self.dummy2 = reverse32(self.bstr.read_int())
        else:
            self.dummy1 = 0
            self.dummy2 = 0

        # position the stream for decoder.read()
        self.bstr.seek(self.header.dictionary_encoder_offset)
dictionary_encoder_offset -> article_offset 204 | # must by decoded befor decoder.read() 205 | # 2. annotation_offset -> dictionary_encoder_offset 206 | # annotation decoded in the read_annotation 207 | # 3. each article encoded individully 208 | # articles_offset + heading.reference -> articles_offset + heading.next-reference 209 | # article decoded in the 210 | def xor_block_x6(self, start, end, key=0x7f): 211 | for i in range(start, end): 212 | byte = self.bstr.record[i] 213 | self.bstr.record[i] = byte ^ key 214 | key = xor_pad[byte] 215 | return key 216 | 217 | @property 218 | def pages_count(self): 219 | return (self.pages_end - self.header.pages_offset) // 512 220 | 221 | def get_page_offset(self, page_number): 222 | return self.header.pages_offset + 512 * page_number 223 | 224 | def read_headings(self): 225 | for i in range(self.pages_count): 226 | self.read_heading_from_page(i) 227 | # set last next_reference 228 | self.headings[-1].next_reference = self.header.pages_offset - self.header.articles_offset 229 | 230 | def merge_headings(self): 231 | res = [] 232 | # fill next_reference in the headings 233 | prev = self.headings[0] 234 | res.append(prev) 235 | for i in range(1, len(self.headings)): 236 | h = self.headings[i] 237 | if prev.reference == h.reference: 238 | # multititle article 239 | prev.merge(h) 240 | else: 241 | res[-1].next_reference = h.reference 242 | res.append(h) 243 | prev = h 244 | # headings[i].next_reference = headings[i+1].reference 245 | # set next_reference for last item to the pages_offset 246 | res[-1].next_reference = self.header.pages_offset - self.header.articles_offset 247 | return res 248 | 249 | def read_heading_from_page(self, page_number): 250 | self.bstr.seek(self.get_page_offset(page_number)) 251 | page = CachePage(self.bstr) 252 | if page.is_leaf: 253 | prefix = "" 254 | for idx in range(page.headings_count): 255 | heading = ArticleHeading() 256 | prefix = heading.read(self.decoder, self.bstr, prefix) 257 | 
self.headings.append(heading) 258 | 259 | def read_article(self, heading): 260 | self.bstr.seek(self.header.articles_offset + heading.reference) 261 | if self.header.version == 0x151005: 262 | # xor article 263 | self.xor_block_x6(self.header.articles_offset + heading.reference, 264 | self.header.articles_offset + heading.next_reference) 265 | size = self.bstr.read_bits(16) 266 | if size == 0xFFFF: 267 | size = self.bstr.read_bits(32) 268 | 269 | res = self.decoder.decode_article(size) 270 | # assert(res) 271 | return res 272 | 273 | def read_annotation(self): 274 | if self.header.version == 0x151005: 275 | # xor annotation 276 | self.xor_block_x6(self.header.annotation_offset, 277 | self.header.dictionary_encoder_offset) 278 | res = "" 279 | if self.bstr.seek(self.header.annotation_offset): 280 | size = self.bstr.read_bits(16) 281 | res = self.decoder.decode_article(size) 282 | return res 283 | 284 | @property 285 | def readed(self): 286 | return self._readed 287 | 288 | def read(self): 289 | if self.verbose: 290 | print("reading dictionary..") 291 | self.decoder.read() 292 | self._readed = True 293 | 294 | @property 295 | def parsed(self): 296 | return self._parsed 297 | 298 | def parse(self): 299 | if not self.readed: 300 | self.read() 301 | if self.verbose: 302 | print("decoding overlay..") 303 | self.overlay = OverlayReader(self.bstr, self.overlay_data) 304 | 305 | if self.verbose: 306 | print("decoding headings: %d" % self.header.entries_count) 307 | self.read_headings() 308 | if self.headings.appended != self.header.entries_count: 309 | raise LsdError("Decoded not all entries %d != %d" % (self.headings.appended, self.header.entries_count)) 310 | # merge multititle headings 311 | # self.headings = self.merge_headings() 312 | 313 | if self.verbose: 314 | print("decoding articles: %d" % len(self.headings)) 315 | for h in self.headings: 316 | # h.dump() 317 | self.dict.append((h, self.read_article(h))) 318 | self._parsed = True 319 | if self.verbose: 320 | 
print("OK") 321 | 322 | def write(self, path=""): 323 | """ save decoded dictionary """ 324 | if not self.parsed: 325 | self.parse() 326 | self.write_icon(path) 327 | self.write_annotation(path) 328 | self.write_overlay(path) 329 | self.write_dsl(path) 330 | if self.verbose: 331 | self.write_prefix(path) 332 | 333 | def make_filename(self, path, ext): 334 | base, orig_ext = os.path.splitext(self.filename) 335 | if path != "": 336 | base = os.path.join(path, os.path.basename(base)) 337 | return base + '.' + ext 338 | 339 | def write_icon(self, path=""): 340 | if self.icon_size == 0: 341 | return 342 | ico_file = self.make_filename(path, "bmp") 343 | with open(ico_file, 'wb') as ico: 344 | ico.write(self.icon) 345 | if self.verbose: 346 | print('Write icon: %s' % ico_file) 347 | 348 | def write_annotation(self, path=""): 349 | annotation = self.read_annotation() 350 | if annotation == "": 351 | return 352 | ann_file = self.make_filename(path, "ann") 353 | with codecs.open(ann_file, 'w', encoding='utf-16', errors='surrogatepass') as ann: 354 | ann.write(annotation) 355 | if self.verbose: 356 | print('Write annotation: %s' % ann_file) 357 | 358 | def write_prefix(self, path=""): 359 | if self.decoder.prefix == "": 360 | return 361 | pref_file = self.make_filename(path, "pref") 362 | with codecs.open(pref_file, 'w', encoding='utf-8') as pref: 363 | pref.write(self.decoder.prefix) 364 | if self.verbose: 365 | print('Write prefix: %s' % pref_file) 366 | 367 | def write_overlay(self, path=""): 368 | pass 369 | 370 | @staticmethod 371 | def normalize_article(article): 372 | res = article.replace(u'\n', u'\n\t') 373 | return res 374 | 375 | def write_dsl(self, path=""): 376 | if len(self.dict) == 0: 377 | print("Nothing writing to dsl!") 378 | return 379 | dsl_file = self.make_filename(path, "dsl") 380 | with codecs.open(dsl_file, 'w', encoding='utf-16', errors='surrogatepass') as dsl: 381 | dsl.write(u"#NAME\t\"" + self.name + u"\"\n") 382 | dsl.write(u"#INDEX_LANGUAGE\t\"" 
+ tools.lang_map[self.header.source_language] + u"\"\n") 383 | dsl.write(u"#CONTENTS_LANGUAGE\t\"" + tools.lang_map[self.header.target_language] + u"\"\n") 384 | if self.icon_size > 0: 385 | base, orig_ext = os.path.splitext(os.path.basename(self.filename)) 386 | dsl.write(u"#ICON_FILE\t\"" + base + '.' + "bmp" + u"\"\n") 387 | dsl.write(u"\n") 388 | for h, r in self.dict: 389 | if h.simple: 390 | dsl.write(h.get_first_ext_text()) 391 | dsl.write(u"\n\t") 392 | else: 393 | for item in h.headings: 394 | dsl.write(item.ext_text) 395 | dsl.write(u"\n") 396 | dsl.write(u"\t") 397 | dsl.write(self.normalize_article(r)) 398 | dsl.write(u"\n") 399 | if self.verbose: 400 | print('Write dsl: %s' % dsl_file) 401 | 402 | def dump(self): 403 | self.header.dump() 404 | # dump header for not supported versions 405 | if self.decoder is not None: 406 | print("Name: %s" % self.name) 407 | print("First heading: %s" % self.first_heading) 408 | print("Last heading: %s" % self.last_heading) 409 | print("Capitals: %s" % self.capitals) 410 | print("Pages end: %s" % hex(self.pages_end)) 411 | print("Overlay data: %s" % hex(self.overlay_data)) 412 | print("Pages count: %d" % ((self.pages_end - self.header.pages_offset) // 512)) 413 | if self.header.version > 0x140000: 414 | print("dummy1: %s" % hex(self.dummy1)) 415 | print("dummy2: %s" % hex(self.dummy2)) 416 | print("Icon enable: %s" % (self.icon_size > 0)) 417 | if self.readed: 418 | self.decoder.dump() 419 | self.overlay.dump() 420 | -------------------------------------------------------------------------------- /lingvoreader/lsdreader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import (division, absolute_import, print_function, with_statement) 4 | 5 | import argparse 6 | import codecs 7 | import os 8 | import sys 9 | from timeit import default_timer as timer 10 | 11 | from lingvoreader import __version__ 12 | from 
lingvoreader import LsdError 13 | from lingvoreader import tools 14 | from lingvoreader.lsdfile import LsdFile 15 | 16 | __author__ = 'sv99' 17 | 18 | # 19 | # lsd decoder - based on source from tr7 user from ru-board 20 | # http://forum.ru-board.com/profile.cgi?action=show&member=tr7 21 | # ru-board forum - "Lingvo dictionary" 22 | # http://forum.ru-board.com/topic.cgi?forum=93&topic=3774&glp#lt 23 | # current version on the github https://github.com/nongeneric/lsd2dsl 24 | # 25 | # Worked with Lingvo x5 dictionary, other version not supported 26 | # v 0.1 - 16.09.2013 27 | # 28 | # v 0.2.0 - 22.11.2015 29 | # add support lingvo 10, 12, x3 and x6 dictionary format 30 | # 31 | # v 0.2.9 - 28.12.2018 32 | # add python3 support 33 | # 34 | if sys.platform.startswith("win") and (sys.version_info < (3, 0)): 35 | sys.stdout = codecs.getwriter('utf-8')(sys.stdout) 36 | 37 | 38 | def unpack(dicts, dest_dir, verbose): 39 | # dict_ext = os.path.splitext(dict_file)[1].upper() 40 | # if dict_ext != '.LSD': 41 | # raise LsdError("Need Lingvo lsd dictionary.") 42 | 43 | count = len(dicts) 44 | if count == 1: 45 | print("Unpacking dict: %s" % dicts[0]) 46 | for i in range(count): 47 | dict_file = dicts[i] 48 | start = timer() 49 | try: 50 | if count > 1: 51 | print("Unpacking dict (%d from %d): %s" % (i + 1, count, dict_file)) 52 | m = LsdFile(dict_file, verbose) 53 | m.parse() 54 | m.dump() 55 | m.write(dest_dir) 56 | except ValueError as e: 57 | print("Error: %s" % e) 58 | return 1 59 | end = timer() 60 | print("Unpack OK (%s)" % tools.display_time(end - start)) 61 | 62 | return 0 63 | 64 | 65 | def header(dicts): 66 | # dict_ext = os.path.splitext(dict_file)[1].upper() 67 | # if dict_ext != '.LSD': 68 | # raise LsdError("Need Lingvo lsd dictionary.") 69 | 70 | count = len(dicts) 71 | if count == 1: 72 | print("Unpacking dict: %s" % dicts[0]) 73 | for i in range(count): 74 | dict_file = dicts[i] 75 | try: 76 | if count > 1: 77 | print("Unpacking dict (%d from %d): %s" % (i + 
1, count, dict_file)) 78 | m = LsdFile(dict_file, True) 79 | m.dump() 80 | # print("Header %s OK" % dict_file) 81 | except ValueError as e: 82 | print("Error: %s" % e) 83 | return 1 84 | 85 | return 0 86 | 87 | 88 | def get_dicts(): 89 | current = os.getcwd() 90 | res = [] 91 | for f in os.listdir(current): 92 | if f.endswith(".lsd"): 93 | res.append(f) 94 | return res 95 | 96 | 97 | class CodecsAction(argparse.Action): 98 | def __init__(self, 99 | option_strings, 100 | dest=argparse.SUPPRESS, 101 | default=argparse.SUPPRESS, 102 | help="print supported languages and their codes"): 103 | super(CodecsAction, self).__init__( 104 | option_strings=option_strings, 105 | dest=dest, 106 | default=default, 107 | nargs=0, 108 | help=help) 109 | 110 | def __call__(self, parser, namespace, values, option_string=None): 111 | parser.exit(message=tools.print_codecs()) 112 | 113 | 114 | def get_arg_parser(): 115 | p = argparse.ArgumentParser(description='Decode Lingvo lsd dictionary to dsl') 116 | g = p.add_mutually_exclusive_group(required=True) 117 | g.add_argument("-i", "--input", help='Dictionary to decode') 118 | g.add_argument("-a", "--all", action="store_true", help='All dictionary in current directory') 119 | p.add_argument("--header", action="store_true", default=False, help='Print dictionary header and exit') 120 | p.add_argument("-o", "--outdir", default="", help="Output directory") 121 | p.add_argument("-c", "--codecs", action=CodecsAction) 122 | p.add_argument("-v", "--verbose", action="store_true", default=False) 123 | p.add_argument('--version', action='version', version='%(prog)s ' + __version__) 124 | return p 125 | 126 | 127 | def main(): 128 | args = get_arg_parser().parse_args() 129 | dicts = [] 130 | if args.all: 131 | # all lsd in directory 132 | print("Decode all lsd in current directory..") 133 | dicts = get_dicts() 134 | print(dicts) 135 | else: 136 | dicts.append(args.input) 137 | 138 | if args.header: 139 | header(dicts) 140 | else: 141 | if args.outdir 
!= "": 142 | # check path 143 | if not os.path.exists(args.outdir): 144 | os.mkdir(args.outdir) 145 | 146 | start = timer() 147 | unpack(dicts, args.outdir, args.verbose) 148 | end = timer() 149 | if len(dicts) > 1: 150 | # print("Files count: %i" % c) 151 | print("Elapsed: %s" % tools.display_time(end - start)) 152 | 153 | return 0 154 | 155 | 156 | if __name__ == '__main__': 157 | sys.exit(main()) 158 | -------------------------------------------------------------------------------- /lingvoreader/tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import (print_function) 4 | from builtins import chr as text 5 | 6 | __author__ = 'sv99' 7 | 8 | 9 | lang_map = { 10 | 1555: u"Abazin", 11 | 1556: u"Abkhaz", 12 | 1557: u"Adyghe", 13 | 1078: u"Afrikaans", 14 | 1559: u"Agul", 15 | 1052: u"Albanian", 16 | 1545: u"Altaic", 17 | 1025: u"Arabic", # x5 tested support 18 | # 1025: u"ArabicSaudiArabia", 19 | 5121: u"ArabicAlgeria", 20 | 15361: u"ArabicBahrain", 21 | 3073: u"ArabicEgypt", 22 | 2049: u"ArabicIraq", 23 | 11265: u"ArabicJordan", 24 | 13313: u"ArabicKuwait", 25 | 12289: u"ArabicLebanon", 26 | 4097: u"ArabicLibya", 27 | 6145: u"ArabicMorocco", 28 | 8193: u"ArabicOman", 29 | 16385: u"ArabicQatar", 30 | 10241: u"ArabicSyria", 31 | 7169: u"ArabicTunisia", 32 | 14337: u"ArabicUAE", 33 | 9217: u"ArabicYemen", 34 | 1067: u"Armenian", # x5 tested support 35 | # 1067: u"ArmenianEastern", 36 | 33835: u"ArmenianGrabar", 37 | 32811: u"ArmenianWestern", 38 | 1101: u"Assamese", 39 | 1558: u"Awar", 40 | 1560: u"Aymara", 41 | 2092: u"AzeriCyrillic", 42 | 1068: u"AzeriLatin", 43 | 1561: u"Bashkir", 44 | 1069: u"Basque", 45 | 1059: u"Belarusian", 46 | # 1059: u"Byelorussian", 47 | 1562: u"Bemba", 48 | 1093: u"Bengali", 49 | 1563: u"Blackfoot", 50 | 1536: u"Breton", 51 | 1564: u"Bugotu", 52 | 1026: u"Bulgarian", 53 | 1109: u"Burmese", 54 | 1565: u"Buryat", 55 | 1027: 
u"Catalan", 56 | 1566: u"Chamorro", 57 | 1544: u"Chechen", 58 | 1028: u"Chinese", # x5 tested support 59 | # 1028: u"ChineseTaiwan", 60 | 3076: u"ChineseHongKong", 61 | 5124: u"ChineseMacau", 62 | 2052: u"ChinesePRC", 63 | 4100: u"ChineseSingapore", 64 | 1567: u"Chukcha", 65 | 1568: u"Chuvash", 66 | 1569: u"Corsican", 67 | 1546: u"CrimeanTatar", 68 | 1050: u"Croatian", 69 | 1570: u"Crow", 70 | 1029: u"Czech", 71 | 1030: u"Danish", 72 | 1572: u"Dungan", 73 | 1043: u"Dutch", # x5 tested 74 | 2067: u"DutchBelgian", # not supported 75 | 1033: u"English", # 1033: u"EnglishUnitedStates", 76 | 3081: u"EnglishAustralian", 77 | 10249: u"EnglishBelize", 78 | 4105: u"EnglishCanadian", 79 | 9225: u"EnglishCaribbean", 80 | 6153: u"EnglishIreland", 81 | 8201: u"EnglishJamaica", 82 | 35849: u"EnglishLaw", 83 | 33801: u"EnglishMedical", 84 | 5129: u"EnglishNewZealand", 85 | 13321: u"EnglishPhilippines", 86 | 34825: u"EnglishProperNames", 87 | 7177: u"EnglishSouthAfrica", 88 | 11273: u"EnglishTrinidad", 89 | 2057: u"EnglishUnitedKingdom", 90 | 12297: u"EnglishZimbabwe", 91 | 1573: u"EskimoCyrillic", 92 | 1537: u"Esperanto", 93 | 1061: u"Estonian", 94 | 1574: u"Even", 95 | 1575: u"Evenki", 96 | 1065: u"Farsi", 97 | 1538: u"Fijian", 98 | 1035: u"Finnish", 99 | 1036: u"French", # x5 supported 100 | 2060: u"FrenchBelgian", 101 | 3084: u"FrenchCanadian", 102 | 5132: u"FrenchLuxembourg", 103 | 6156: u"FrenchMonaco", 104 | 33804: u"FrenchProperNames", 105 | 4108: u"FrenchSwiss", 106 | 1122: u"Frisian", 107 | 1576: u"Frisian_Legacy", # x6 108 | 1577: u"Friulian", 109 | 1084: u"GaelicScottish", 110 | 1578: u"Gagauz", 111 | 1110: u"Galician", 112 | 1579: u"Galician_Legacy", # x6 113 | 1580: u"Ganda", 114 | 1079: u"Georgian", 115 | 1031: u"German", 116 | 3079: u"GermanAustrian", 117 | 34823: u"GermanLaw", 118 | 5127: u"GermanLiechtenstein", 119 | 4103: u"GermanLuxembourg", 120 | 36871: u"GermanMedical", 121 | 32775: u"GermanNewSpelling", 122 | 35847: u"GermanNewSpellingLaw", 123 | 37895: 
u"GermanNewSpellingMedical", 124 | 39943: u"GermanNewSpellingProperNames", 125 | 38919: u"GermanProperNames", 126 | 2055: u"GermanSwiss", 127 | 1032: u"Greek", 128 | 32776: u"GreekKathareusa", 129 | 1140: u"Guarani", 130 | 1582: u"Guarani_Legacy", # x6 131 | 1095: u"Gujarati", 132 | 1583: u"Hani", 133 | 1128: u"Hausa", # x6 134 | 1652: u"Hausa_Legacy", 135 | 1141: u"Hawaiian", # x6 136 | 1539: u"Hawaiian_Legacy", 137 | 1037: u"Hebrew", 138 | 1081: u"Hindi", 139 | 1038: u"Hungarian", 140 | 1039: u"Icelandic", 141 | 1584: u"Ido", 142 | 1057: u"Indonesian", 143 | 1585: u"Ingush", 144 | 1586: u"Interlingua", 145 | 2108: u"Irish", # x6 146 | 1552: u"Irish_Legacy", # x6 147 | # 2108: u"Gaelic", # x6 148 | # 1552: u"Gaelic_Legacy", # x6 149 | 1040: u"Italian", # x5 tested 150 | 33808: u"ItalianProperNames", 151 | 2064: u"ItalianSwiss", 152 | 1041: u"Japanese", 153 | 1548: u"Kabardian", 154 | 1587: u"Kalmyk", 155 | 1099: u"Kannada", 156 | 1589: u"KarachayBalkar", 157 | 1588: u"Karakalpak", 158 | 1120: u"Kashmiri", 159 | 2144: u"KashmiriIndia", 160 | 1590: u"Kasub", 161 | 1591: u"Kawa", 162 | 1087: u"Kazakh", 163 | 1592: u"Khakas", 164 | 1593: u"Khanty", 165 | 1107: u"Khmer", 166 | 1594: u"Kikuyu", 167 | 1595: u"Kirgiz", 168 | 1597: u"KomiPermian", 169 | 1596: u"KomiZyryan", 170 | 1598: u"Kongo", 171 | 1111: u"Konkani", 172 | 1042: u"Korean", 173 | 2066: u"KoreanJohab", 174 | 1599: u"Koryak", 175 | 1600: u"Kpelle", 176 | 1601: u"Kumyk", 177 | 1602: u"Kurdish", 178 | 1603: u"KurdishCyrillic", 179 | 1604: u"Lak", 180 | 1108: u"Lao", 181 | 1142: u"Latin", # x6 182 | 1540: u"Latin_Legacy", 183 | 1062: u"Latvian", 184 | 1655: u"LatvianGothic", 185 | 1605: u"Lezgin", 186 | 1063: u"Lithuanian", 187 | 2087: u"LithuanianClassic", 188 | 1606: u"Luba", 189 | 1071: u"Macedonian", 190 | 1607: u"Malagasy", 191 | 1086: u"Malay", 192 | 2110: u"MalayBruneiDarussalam", 193 | 1100: u"Malayalam", 194 | 1608: u"Malinke", 195 | 1082: u"Maltese", 196 | 1112: u"Manipuri", 197 | 1609: u"Mansi", 198 
| 1153: u"Maori", # x6 199 | 1102: u"Marathi", 200 | 1610: u"Mari", 201 | 1611: u"Maya", 202 | 1612: u"Miao", 203 | 1613: u"Minankabaw", 204 | 1614: u"Mohawk", 205 | 1104: u"Mongol", 206 | 1615: u"Mordvin", 207 | 1616: u"Nahuatl", 208 | 1617: u"Nanai", 209 | 1618: u"Nenets", 210 | 1121: u"Nepali", 211 | 2145: u"NepaliIndia", 212 | 1619: u"Nivkh", 213 | 1620: u"Nogay", 214 | 1044: u"Norwegian", 215 | # 1044: u"NorwegianBokmal", 216 | 2068: u"NorwegianNynorsk", 217 | 1621: u"Nyanja", 218 | 1622: u"Occidental", 219 | 1623: u"Ojibway", 220 | 32777: u"OldEnglish", 221 | 32780: u"OldFrench", 222 | 33799: u"OldGerman", 223 | 32784: u"OldItalian", 224 | 1657: u"OldSlavonic", # x6 225 | 32778: u"OldSpanish", 226 | 1096: u"Oriya", 227 | 1547: u"Ossetic", 228 | 1145: u"Papiamento", # x6 229 | 1624: u"Papiamento_Legacy", 230 | 1625: u"PidginEnglish", 231 | 1654: u"Pinyin", 232 | 1045: u"Polish", 233 | 1046: u"Portuguese", # not supported 234 | # 1046: u"PortugueseBrazilian", 235 | 2070: u"PortugueseStandard", # x5 supported 236 | 1541: u"Provencal", 237 | 1094: u"Punjabi", 238 | 1131: u"Quechua", # x6 239 | # 1131: u"QuechuaBolivia", # x6 240 | 2155: u"QuechuaEcuador", # x6 241 | 3179: u"QuechuaPeru", # x6 242 | 1626: u"Quechua_Legacy", 243 | 1047: u"RhaetoRomanic", 244 | 1048: u"Romanian", 245 | 2072: u"RomanianMoldavia", 246 | 1627: u"Romany", 247 | 1628: u"Ruanda", 248 | 1629: u"Rundi", 249 | 1049: u"Russian", 250 | 2073: u"RussianMoldavia", 251 | 32793: u"RussianOldSpelling", 252 | 34841: u"RussianOldOrtho", # x6 253 | 33817: u"RussianProperNames", 254 | 1083: u"Saami", 255 | 1542: u"Samoan", 256 | 1103: u"Sanskrit", 257 | 1630: u"Selkup", 258 | 3098: u"SerbianCyrillic", 259 | 2074: u"SerbianLatin", 260 | 1631: u"Shona", 261 | 1113: u"Sindhi", 262 | 1051: u"Slovak", 263 | 1060: u"Slovenian", 264 | 1143: u"Somali", # x6 265 | 1633: u"Somali_Legacy", 266 | 1070: u"Sorbian", # not supported 267 | 1634: u"Sotho", 268 | # 1034: u"Spanish", # not supported 269 | 1034: 
u"SpanishTraditionalSort", # x5 tested 270 | 11274: u"SpanishArgentina", 271 | 16394: u"SpanishBolivia", 272 | 13322: u"SpanishChile", 273 | 9226: u"SpanishColombia", 274 | 5130: u"SpanishCostaRica", 275 | 7178: u"SpanishDominicanRepublic", 276 | 12298: u"SpanishEcuador", 277 | 17418: u"SpanishElSalvador", 278 | 4106: u"SpanishGuatemala", 279 | 18442: u"SpanishHonduras", 280 | 2058: u"SpanishMexican", 281 | 3082: u"SpanishModernSort", 282 | 19466: u"SpanishNicaragua", 283 | 6154: u"SpanishPanama", 284 | 15370: u"SpanishParaguay", 285 | 10250: u"SpanishPeru", 286 | 33802: u"SpanishProperNames", 287 | 20490: u"SpanishPuertoRico", 288 | 14346: u"SpanishUruguay", 289 | 8202: u"SpanishVenezuela", 290 | 1635: u"Sunda", 291 | 1072: u"Sutu", 292 | 1089: u"Swahili", 293 | 1636: u"Swazi", 294 | 1053: u"Swedish", 295 | 2077: u"SwedishFinland", 296 | 1637: u"Tabassaran", 297 | 1553: u"Tagalog", 298 | 1639: u"Tahitian", 299 | 1064: u"Tajik", # x6 300 | 1638: u"Tajik_Legacy", 301 | 1097: u"Tamil", 302 | 1092: u"Tatar", 303 | 1098: u"Telugu", 304 | 1054: u"Thai", 305 | 1105: u"Tibet", 306 | 1641: u"Tongan", 307 | 1073: u"Tsonga", 308 | 1074: u"Tswana", 309 | # 1074, u"Chuana", 310 | 1642: u"Tun", 311 | 1055: u"Turkish", 312 | 1090: u"Turkmen", # x6 313 | 1643: u"Turkmen_Legacy", 314 | 1644: u"Tuvin", 315 | 1645: u"Udmurt", 316 | # 1646: u"Uighur", # not supported 317 | 1646: u"UighurCyrillic", # not supported 318 | 1647: u"UighurLatin", 319 | 1058: u"Ukrainian", 320 | 1653: u"Universal", 321 | 2080: u"UrduIndia", 322 | 1056: u"UrduPakistan", 323 | 1554: u"User", 324 | 2115: u"UzbekCyrillic", 325 | 1091: u"UzbekLatin", 326 | 1075: u"Venda", 327 | 1066: u"Vietnamese", 328 | 1648: u"Visayan", 329 | 1106: u"Welsh", # x6 330 | 1543: u"Welsh_Legacy", 331 | 1160: u"Wolof", # x6 332 | 1649: u"Wolof_Legacy", 333 | 1076: u"Xhosa", 334 | 1157: u"Yakut", # x6 335 | 1650: u"Yakut_Legacy", 336 | 1085: u"Yiddish", 337 | 1651: u"Zapotec", 338 | 1077: u"Zulu", 339 | } 340 | 341 | 342 | def 
int2unichr(value): 343 | # import sys 344 | # if sys.version_info > (3, 0): 345 | # return chr(value) 346 | # else: 347 | # return unichr(value) 348 | return text(value) 349 | 350 | 351 | def print_codecs(): 352 | # sort by values 353 | for lcid in sorted(lang_map, key=lang_map.get): 354 | print("%6d:\t%s" % (lcid, lang_map[lcid])) 355 | 356 | 357 | def bit_length(num): 358 | res = 1 359 | num >>= 1 360 | while num != 0: 361 | res += 1 362 | num >>= 1 363 | return res 364 | 365 | 366 | def display_time(sec): 367 | result = "" 368 | hour = sec // 3600 369 | if hour: 370 | result += "{} hours".format(hour) 371 | if hour == 1: 372 | result = result.rstrip('s') 373 | sec -= 3600 * hour 374 | min = sec // 60 375 | if min: 376 | if result != "": 377 | result += " " 378 | result += "{} min".format(int(min)) 379 | sec -= 60 * min 380 | if result != "": 381 | result += " " 382 | result += "{0:0.2f} sec".format(sec) 383 | return result 384 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import codecs 4 | import os 5 | import re 6 | 7 | from setuptools import setup, find_packages 8 | 9 | 10 | here = os.path.abspath(os.path.dirname(__file__)) 11 | 12 | 13 | def read(*parts): 14 | # intentionally *not* adding an encoding option to open, See: 15 | # https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690 16 | return codecs.open(os.path.join(here, *parts), 'r').read() 17 | 18 | 19 | def find_version(*file_paths): 20 | version_file = read(*file_paths) 21 | version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", 22 | version_file, re.M) 23 | if version_match: 24 | return version_match.group(1) 25 | raise RuntimeError("Unable to find version string.") 26 | 27 | 28 | setup( 29 | name="lingvoreader", 30 | version=find_version("lingvoreader", "__init__.py"), 31 | author='sv99', 32 | 
author_email='sv99@inbox.ru', 33 | url='https://github.com/sv99/lsdreader', 34 | description='Linvo 11, 12, X3, X5 and X6 lsd reader utilities', 35 | long_description=read('README.rst'), 36 | classifiers=[ 37 | 'Development Status :: 4 - Beta', 38 | 'Topic :: Education', 39 | 'Programming Language :: Python :: 2.7', 40 | 'Programming Language :: Python :: 3', 41 | ], 42 | packages=find_packages(), 43 | platforms='any', 44 | install_requires=[ 45 | 'future; python_version == "2.7"', 46 | ], 47 | zip_safe=True, 48 | entry_points={ 49 | 'console_scripts': [ 50 | 'lsdreader = lingvoreader.lsdreader:main', 51 | ] 52 | } 53 | ) 54 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'sv99' 4 | -------------------------------------------------------------------------------- /test/test_arg_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from unittest import TestCase 4 | from lingvoreader import lsdreader 5 | import sys 6 | from io import BytesIO 7 | 8 | __author__ = 'sv99' 9 | 10 | 11 | class ArgumentParserError(Exception): 12 | 13 | def __init__(self, message, stdout=None, stderr=None, error_code=None): 14 | Exception.__init__(self, message, stdout, stderr) 15 | self.message = message 16 | self.stdout = stdout 17 | self.stderr = stderr 18 | self.error_code = error_code 19 | 20 | 21 | class StdIOBuffer(BytesIO): 22 | pass 23 | 24 | 25 | def stderr_to_parser_error(parse_args, *args, **kwargs): 26 | # if this is being called recursively and stderr or stdout is already being 27 | # redirected, simply call the function and let the enclosing function 28 | # catch the exception 29 | if isinstance(sys.stderr, StdIOBuffer) or isinstance(sys.stdout, StdIOBuffer): 30 | return 
parse_args(*args, **kwargs) 31 | 32 | # if this is not being called recursively, redirect stderr and 33 | # use it as the ArgumentParserError message 34 | old_stdout = sys.stdout 35 | old_stderr = sys.stderr 36 | sys.stdout = StdIOBuffer() 37 | sys.stderr = StdIOBuffer() 38 | try: 39 | try: 40 | result = parse_args(*args, **kwargs) 41 | for key in list(vars(result)): 42 | if getattr(result, key) is sys.stdout: 43 | setattr(result, key, old_stdout) 44 | if getattr(result, key) is sys.stderr: 45 | setattr(result, key, old_stderr) 46 | return result 47 | except SystemExit: 48 | code = sys.exc_info()[1].code 49 | stdout = sys.stdout.getvalue() 50 | stderr = sys.stderr.getvalue() 51 | raise ArgumentParserError("SystemExit", stdout, stderr, code) 52 | finally: 53 | sys.stdout = old_stdout 54 | sys.stderr = old_stderr 55 | 56 | 57 | class TestArgParser(TestCase): 58 | def setUp(self): 59 | self.parser = lsdreader.get_arg_parser() 60 | 61 | def test_empty(self): 62 | self.assertRaisesRegexp(ArgumentParserError, 63 | 'one of the arguments -i/--input -a/--all is required', 64 | stderr_to_parser_error, self.parser.parse_args, []) 65 | 66 | def test_codecs(self): 67 | self.assertRaisesRegexp(ArgumentParserError, 68 | "SystemExit", 69 | stderr_to_parser_error, self.parser.parse_args, ['-c', ]) 70 | 71 | def test_input(self): 72 | args = self.parser.parse_args('-i test'.split()) 73 | self.assertEqual(args.input, 'test') 74 | self.assertFalse(args.verbose) 75 | 76 | def test_input_verbose(self): 77 | args = self.parser.parse_args('-i test -v'.split()) 78 | self.assertEqual(args.input, 'test') 79 | self.assertTrue(args.verbose) 80 | self.assertEqual(args.outdir, '') 81 | 82 | def test_all(self): 83 | args = self.parser.parse_args('-a'.split()) 84 | self.assertTrue(args.all) 85 | self.assertFalse(args.verbose) 86 | self.assertEqual(args.outdir, '') 87 | 88 | def test_all_verbose(self): 89 | args = self.parser.parse_args('-a -v'.split()) 90 | self.assertTrue(args.all) 91 | 
self.assertTrue(args.verbose) 92 | self.assertEqual(args.outdir, '') 93 | 94 | def test_outdir(self): 95 | args = self.parser.parse_args('-a -o test'.split()) 96 | self.assertTrue(args.all) 97 | self.assertFalse(args.verbose) 98 | self.assertEqual(args.outdir, 'test') 99 | -------------------------------------------------------------------------------- /test/test_bitStream.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import print_function 4 | from unittest import TestCase 5 | from lingvoreader import bitstream 6 | 7 | __author__ = 'sv99' 8 | 9 | 10 | class TestBitStream(TestCase): 11 | def setUp(self): 12 | print("Creating a new BitStream...") 13 | # record - bytearray, not string! 14 | self.record = bytearray('\x00\x01\x02\x03\x04\x05\x06\x07\x08') 15 | self.bst = bitstream.BitStream(self.record) 16 | 17 | def tearDown(self): 18 | print("Destroying the BitStream...") 19 | self.bst = None 20 | 21 | def test_reverse32(self): 22 | self.assertEqual(bitstream.reverse32(0x01020304), 0x04030201) 23 | 24 | def test_reverse16(self): 25 | self.assertEqual(bitstream.reverse16(0x0102), 0x0201) 26 | 27 | def test_length(self): 28 | self.assertEqual(len(self.record), self.bst.length) 29 | 30 | def test_to_nearest_byte(self): 31 | self.bst.seek(0) 32 | self.bst.to_nearest_byte() 33 | self.assertEqual(self.bst.pos, 0) 34 | self.bst.read_bit() 35 | self.bst.to_nearest_byte() 36 | self.assertEqual(self.bst.pos, 1) 37 | 38 | def test_read_byte(self): 39 | self.bst.seek(0) 40 | self.assertEqual(self.bst.read_byte(), 0) 41 | self.bst.seek(1) 42 | self.assertEqual(self.bst.read_byte(), 1) 43 | 44 | def test_read_word(self): 45 | self.bst.seek(1) 46 | self.assertEqual(self.bst.read_word(), 0x0102) 47 | 48 | def test_read_int(self): 49 | self.bst.seek(1) 50 | self.assertEqual(self.bst.read_int(), 0x01020304) 51 | 52 | #def test_read_symbols(self): 53 | # self.fail() 54 | 55 | 
def test_read_bit(self): 56 | self.bst.seek(1) 57 | self.bst.read_bits(7) 58 | self.assertEqual(self.bst.read_bit(), 1) 59 | self.assertEqual(self.bst.read_bit(), 0) 60 | 61 | def test_read_bits(self): 62 | self.bst.seek(1) 63 | self.assertEqual(self.bst.read_bits(4), 0) 64 | self.assertEqual(self.bst.read_bits(4), 1) 65 | self.assertEqual(self.bst.read_bits(4), 0) 66 | self.assertEqual(self.bst.read_bits(8), 0x20) -------------------------------------------------------------------------------- /test/test_tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from unittest import TestCase 4 | from lingvoreader import tools 5 | 6 | __author__ = 'sv99' 7 | 8 | 9 | class TestTools(TestCase): 10 | def test_bit_length_0(self): 11 | self.assertEqual(tools.bit_length(0), 1) 12 | 13 | def test_bit_length_1(self): 14 | self.assertEqual(tools.bit_length(1), 1) 15 | 16 | def test_bit_length_2(self): 17 | self.assertEqual(tools.bit_length(2), 2) 18 | 19 | def test_bit_length_4(self): 20 | self.assertEqual(tools.bit_length(4), 3) 21 | 22 | def test_display_time(self): 23 | self.assertEquals(tools.display_time(1), 24 | "1.00 sec") 25 | self.assertEquals(tools.display_time(61.2), 26 | "1 min 1.20 sec") 27 | self.assertEquals(tools.display_time(60), 28 | "1 min 0.00 sec") 29 | self.assertEquals(tools.display_time(3600), 30 | "1 hour 0.00 sec") 31 | self.assertEquals(tools.display_time(3661), 32 | "1 hour 1 min 1.00 sec") 33 | -------------------------------------------------------------------------------- /testdata/.gitignore: -------------------------------------------------------------------------------- 1 | *.* 2 | 3 | --------------------------------------------------------------------------------