├── .gitignore
├── .idea
├── dictionaries
│ └── svolkov.xml
└── vagrant.xml
├── LICENSE.md
├── README.rst
├── decoder.rar
├── lingvoreader
├── __init__.py
├── articleheading.py
├── bitstream.py
├── decoder.py
├── lentable.py
├── lsdfile.py
├── lsdreader.py
└── tools.py
├── setup.py
├── test
├── __init__.py
├── test_arg_parser.py
├── test_bitStream.py
└── test_tools.py
└── testdata
└── .gitignore
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | CPP/
3 | build/
4 | dist/
5 | lingvoreader.egg-info/
6 | *.swp
7 | *.pyc
8 | .DS_Store
9 | venv/
10 | venv2/
11 | .python-version
12 |
--------------------------------------------------------------------------------
/.idea/dictionaries/svolkov.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/.idea/vagrant.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 sv99@inbox.ru
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | lsdreader
2 | =========
3 |
4 | Python analog of the ABBYY DSL Compiler — decompiles an LSD dictionary to DSL.
5 |
6 | Support Lingvo 11, 12, X3, X5 and X6 lsd files format.
7 |
8 | Based on C++ source decoder.zip from ru-board user `tr7 `_
9 | `Source on github `_
10 |
11 | Russian decompiling team on `ru-board.ru `_
12 |
13 | Project Page: https://github.com/sv99/lsdreader
14 |
15 | Install
16 | =======
17 |
18 | Install from pip::
19 |
20 | pip install setuptools -U
21 | pip install lingvoreader
22 |
23 | Install development version::
24 |
25 | git clone
26 | pip install -e .
27 |
28 | make tar.gz for PyPi::
29 |
30 | pip install twine
31 | python setup.py sdist
32 | twine upload dist/lingvoreader-x.x.x.tar.gz
33 |
34 | Usage
35 | -----
36 | ::
37 |
38 | lsdreader [-h] [--header] (-i INPUT | -a) [-o OUTDIR] [-c] [-v] [--version]
39 |
40 | Decode Lingvo 11, 12, X3, X5 and X6 lsd dictionary to dsl
41 |
42 | optional arguments:
43 | -h, --help show this help message and exit
44 | --header show header info and exit
45 | -i INPUT, --input INPUT
46 | Dictionary to decode
47 | -a, --all All dictionary in current directory
48 | -o OUTDIR, --outdir OUTDIR
49 | Output directory
50 | -c, --codecs print supported languages and their codes
51 | -v, --verbose
52 | --version show program's version number and exit
53 |
54 | Lingvo versions
55 | ===============
56 |
57 | ::
58 |
59 | 11 2005 supported
60 | 12 2006 supported
61 | x3 2008 supported
62 | x5 2011 supported
63 | x6 2014 (current) supported
64 |
65 |
--------------------------------------------------------------------------------
/decoder.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sv99/lsdreader/45f610c7a011a6e54d7cb521ab61eb6277ae65d5/decoder.rar
--------------------------------------------------------------------------------
/lingvoreader/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | __author__ = 'sv99'
4 | # The version as used in the setup.py
5 | __version__ = "0.2.15"
6 |
7 |
class LsdError(Exception):
    """Error raised for malformed or unsupported LSD dictionary data."""
    pass
10 |
--------------------------------------------------------------------------------
/lingvoreader/articleheading.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import unicode_literals, print_function, division, absolute_import
3 | from lingvoreader.tools import int2unichr
4 |
5 | __author__ = 'svolkov'
6 |
7 |
8 | # info from ArticleHeading.cpp
9 | # Not implemented yet!!
10 | #
11 | # Unsorted parts
12 | #
13 | # Each character in an LSD heading has a flag indicating if it's 'sorted' or 'unsorted'.
14 | # The unsorted characters aren't used in sortings and searching. Basically they become
15 | # transparent for the indexing mechanism and only visible while presented to the user
16 | # together with an article. Let's annotate each character with an 's' or 'u' to indicate
17 | # this distinction.
18 | #
19 | # In the DSL format curly brackets are used to denote the sortedness of a heading's part.
20 | #
21 | # The heading 'Some{thing}' would be encoded as
22 | #
23 | # Something
24 | # ssssuuuuu
25 | #
26 | # And it's name would be just 'Some', while its extended name would be 'Something'
27 | # There can be any number of unsorted characters in a heading. At any position.
28 | #
29 | # To correctly decompile an LSD heading (where each character is either sorted or unsorted)
30 | # to a DSL heading, it is required to group adjacent characters in groups by their
31 | # sortedness, and then enclose each such group in parentheses. This process is rather
32 | # straightforward. The special case to look for is the slash (/), which is encoded
33 | # as an unsorted character, but is used to escape the subsequent character (either sorted
34 | # or unsorted) and thus requires a special handling.
35 | #
36 | # Optional parts (variants)
37 | #
38 | # The DSL format has a mechanism for generating several headings with optional parts
39 | # out of a single heading. By enclosing a part of a heading in parentheses, it is possible
40 | # to generate all possible combinations of the heading.
41 | #
42 | # The heading 'aa(bb)cc' would be expanded into two different headings
43 | #
44 | # aa{(bb)}cc and aa{(}bb{)}cc
45 | # ss uuuu ss ss u ss u ss
46 | #
47 | # For more such parts, more headings would be generated. It is 2 headings for 1 part,
48 | # 4 headings for 2 parts, 9 for 3 and so on.
49 | #
50 | # There exists a pattern which helps to combine two headings produced with the variant
51 | # encoding. The pattern looks like this 'u(' denotes an unsorted parenthesis):
52 | #
53 | # (A) ??? u( u*n u) ??? -> ??? s( s|u*n s) ???
54 | # (B) ??? u( s|u*n u) ???
55 | #
56 | # An optional part can contain an unsorted part as well, so this pattern allow for a
57 | # combination of sorted/unsorted parts in an optional part (denoted as 's|u*n').
58 | #
59 | # This pattern becomes insufficient in the presence of spaces, adjacent to the
60 | # parentheses. Here are two examples of such headings (the upper case is used
61 | # to denote spaces):
62 | #
63 | # abc (123)
64 | #
65 | # abc{ (123)} and abc {(}123{)}
66 | # sss Uuuuuu sssS u sss u
67 | #
68 | # bbb (123) z
69 | #
70 | # bbb {(123) }z and bbb {(}123{)} z
71 | # sssS uuuuuU s sssS u sss u Ss
72 | #
73 | # Another two patterns account for these special cases.
74 | #
75 | # (C) ??? Uu( u*n u) -> ??? Ss( s|u*n s)
76 | # (D) ??? Su( s|u*n u)
77 | #
78 | # (E) ??? Su( u*n u)U ??? -> ??? Ss( s|u*n s)S ???
79 | # (F) ??? Su( s|u*n u)S ???
80 | #
81 | # The headings that can't be collapsed using one of these three rules are left as is.
82 | #
83 | #
class CharInfo:
    """Per-character annotation of an LSD heading.

    Tracks whether a character participates in sorting/search
    ("sorted") and whether it was escaped in the source heading.
    """

    def __init__(self):
        # neutral defaults: no character, unescaped, unsorted
        self.chr = ""
        self.escaped = False
        self.sorted = False
89 |
90 | # add equal test
91 |
92 |
class Heading:
    """One dictionary heading plus its "unsorted" extensions.

    ``extensions`` is a list of ``(insert_index, text)`` pairs recording
    fragments that are invisible to sorting/search and must be spliced
    back into ``text`` for display.
    """

    def __init__(self):
        self.text = ""
        self.extensions = []

    @property
    def ext_text(self):
        """Return the heading with extensions spliced back in.

        Every extension except a lone backslash is wrapped in DSL-style
        curly braces; each inserted brace pair shifts all later insert
        positions right by two characters.
        """
        if not self.extensions:
            return self.text
        result = self.text
        shift = 0
        for pos, fragment in self.extensions:
            braced = fragment != u"\\"
            if braced:
                fragment = u"{" + fragment + u"}"
            at = pos + shift
            result = result[:at] + fragment + result[at:]
            if braced:
                shift += 2
        return result
113 |
114 |
class ArticleHeading:
    """All headings that point at one article in the LSD file."""

    def __init__(self):
        # there may be many headings for a single article
        self.headings = []
        self.reference = 0
        # reference of the next article, used in the x6 format for encoding
        self.next_reference = 0

    def read(self, lsd_decoder, bstr, known_prefix):
        """Read one heading from the stream and append it to ``headings``.

        The heading text is the first ``prefix_len`` characters of
        *known_prefix* (shared with the previous heading) plus a decoded
        postfix.  Returns the full heading text so the caller can pass it
        as *known_prefix* for the next heading.
        """
        h = Heading()
        prefix_len = lsd_decoder.decode_prefix_len()
        postfix_len = lsd_decoder.decode_postfix_len()
        h.text = known_prefix[:prefix_len]
        h.text += lsd_decoder.decode_heading(postfix_len)
        self.reference = lsd_decoder.read_reference2()
        if bstr.read_bit():
            # additional invisible formatting items in the header;
            # runs of consecutive indices are joined into one extension
            ext_length = bstr.read_bits(8)
            if ext_length != 0:
                ext = ""
                first_idx = prev_idx = 0
                for i in range(ext_length):
                    idx = bstr.read_bits(8)
                    char = int2unichr(bstr.read_bits(16))
                    if ext == "":
                        # start the first run
                        ext += char
                        first_idx = prev_idx = idx
                    else:
                        if prev_idx + 1 == idx:
                            # index follows the previous one: same run
                            ext += char
                            prev_idx = idx
                        else:
                            # gap in indices: flush the run, start a new one
                            h.extensions.append((first_idx, ext))
                            ext = char
                            first_idx = prev_idx = idx
                # flush the last run
                h.extensions.append((first_idx, ext))
        self.headings.append(h)
        return h.text

    def merge(self, heading):
        """Absorb all headings of *heading* (same article reference)."""
        for h in heading.headings:
            # print("merge: %s" % h.ext_text)
            self.headings.append(h)

    def get_first(self):
        return self.headings[0]

    def get_first_ext_text(self):
        return self.headings[0].ext_text

    @property
    def simple(self):
        # True when the article has exactly one heading
        return len(self.headings) == 1

    def dump(self):
        print("%s: %d - %d" % (self.get_first().text, self.reference, self.next_reference))
175 |
176 |
class ArticleHeadingList(list):
    """List of ArticleHeading objects, de-duplicated by article reference.

    Headings pointing at an already-seen article are merged into the
    stored ArticleHeading instead of being appended again, so the list
    holds one entry per distinct reference.
    """

    def __init__(self):
        super(ArticleHeadingList, self).__init__()
        self.appended = 0       # total append() calls, including merged ones
        self.references = {}    # article reference -> ArticleHeading

    def append(self, item):
        """Add *item*, merging it into an existing entry with the same reference."""
        self.appended += 1
        # append if item.reference not exists
        ref = item.reference
        if ref in self.references:
            self.references[ref].merge(item)
        else:
            self.references[ref] = item
            # if not first then update next_reference in the previous item
            if len(self) > 0:
                self[-1].next_reference = ref
            super(ArticleHeadingList, self).append(item)
195 |
--------------------------------------------------------------------------------
/lingvoreader/bitstream.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import struct
4 |
5 | from lingvoreader.tools import int2unichr
6 | from lingvoreader import LsdError
7 |
8 | __author__ = 'sv99'
9 |
10 |
11 | def reverse32(int_value):
12 | res, = struct.unpack('>L', struct.pack('H', struct.pack('H', self.record, self.pos)
85 | self.pos += 2
86 | self.in_byte_pos = 0
87 | return res
88 |
89 | def read_int(self):
90 | res, = struct.unpack_from('>L', self.record, self.pos)
91 | self.pos += 4
92 | self.in_byte_pos = 0
93 | return res
94 |
95 | def read_symbols(self):
96 | size = self.read_bits(32)
97 | bits_per_symbol = self.read_bits(8)
98 | res = []
99 | for i in range(size):
100 | res.append(self.read_bits(bits_per_symbol))
101 | return res
102 |
103 | def read_bit(self):
104 | byte, = struct.unpack_from('B', self.record, self.pos)
105 | byte >>= (7 - self.in_byte_pos)
106 | if self.in_byte_pos == 7:
107 | self.pos += 1
108 | self.in_byte_pos = 0
109 | else:
110 | self.in_byte_pos += 1
111 | return byte & 1
112 |
    def read_bits(self, count):
        """Read *count* bits (MSB first).

        Delegates to the optimized byte-wise implementation; read_bits_s
        is the bit-by-bit reference version kept for cross-checking.
        """
        return self.read_bits_o(count)
115 |
116 | # stupid direct implementation
117 | def read_bits_s(self, count):
118 | if count > 32:
119 | raise LsdError("Many bits for read: %d" % count)
120 | res = 0
121 | for i in range(count):
122 | res <<= 1
123 | res += self.read_bit()
124 | return res
125 |
    def read_bits_o(self, count):
        """Read *count* bits (MSB first) starting at the current bit cursor.

        Optimized implementation that consumes the stream byte-at-a-time
        instead of bit-at-a-time (compare read_bits_s).
        """
        if count > 32:
            raise LsdError("Many bits for read: %d" % count)
        masks = (1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF)
        # number of raw bytes the requested bit span touches
        count_bytes = (count + self.in_byte_pos) // 8
        if count + self.in_byte_pos - 8 * count_bytes > 0:
            count_bytes += 1
        # fast path: the span lies strictly inside a single raw byte
        # (a span ending exactly on a byte boundary falls through to the
        # general path below, which handles it with count_last == 0)
        if count_bytes == 1:
            if (self.in_byte_pos + count) < 8:
                byte = self.record[self.pos]
                byte >>= 7 - self.in_byte_pos - count + 1
                byte &= masks[count - 1]
                self.in_byte_pos += count
                return byte
        # general path: span covers several raw bytes, e.g.
        # inBitPos
        #      | count = 13 |
        # 01234567 | 01234567 | 0123456
        #
        # inBitPos = 5  count_first = 3  count_last = 2
        #
        p = self.pos
        count_last = (count + self.in_byte_pos) % 8   # bits taken from the final byte
        count_first = 8 - self.in_byte_pos            # bits taken from the first byte
        byte_first = self.record[p]
        p += 1
        byte_first &= masks[count_first - 1]
        res = byte_first
        # whole bytes in the middle
        full_bytes = (count - count_first) // 8
        if full_bytes > 0:
            for i in range(full_bytes):
                res <<= 8
                res += self.record[p]
                p += 1
        # partial final byte
        if count_last > 0:
            byte = self.record[p]
            byte >>= 8 - count_last
            res <<= count_last
            res += byte
        self.in_byte_pos = count_last
        self.pos = p
        return res
171 |
172 | def read_some(self, length):
173 | if length == 1:
174 | return self.read_byte()
175 | elif length == 2:
176 | return self.read_word()
177 | elif length == 4:
178 | return self.read_int()
179 | else:
180 | raise LsdError('Allow read byte, word and int length: %i' % length)
181 |
182 | def read_unicode(self, size, big_endian=True):
183 | res = ""
184 | for i in range(size):
185 | ch = self.read_some(2)
186 | if not big_endian:
187 | ch = reverse16(ch)
188 | res += int2unichr(ch)
189 | # res += unichr(self.read_some(2), big_endian))
190 | return res
191 |
192 |
193 |
--------------------------------------------------------------------------------
/lingvoreader/decoder.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from __future__ import (division, absolute_import, print_function)
4 | from lingvoreader import tools
5 | from lingvoreader.lentable import LenTable
6 | from lingvoreader.tools import int2unichr
7 |
8 | __author__ = 'sv99'
9 |
10 |
class Decoder:
    """Base class for LSD dictionary decoders.

    Subclasses populate the symbol alphabets and Huffman length tables in
    read(); this class provides the shared heading/article decoding
    primitives on top of the underlying bit stream.
    """

    def __init__(self, bstr):
        self.bstr = bstr
        self.prefix = ""                 # shared text prefix used by copy operations
        self._article_symbols = None     # article alphabet (list of symbol codes)
        self._heading_symbols = None     # heading alphabet (list of symbol codes)
        self._ltArticles = None          # Huffman table for article symbols
        self._ltHeadings = None          # Huffman table for heading symbols
        self._ltPrefixLengths = None     # Huffman table for shared-prefix lengths
        self._ltPostfixLengths = None    # Huffman table for postfix lengths
        self._huffman1Number = 0         # sizes the bit width of read_reference1
        self._huffman2Number = 0         # sizes the bit width of read_reference2
        self._readed = False

    @property
    def readed(self):
        # True once read() has populated the tables above
        return self._readed

    def decode_prefix_len(self):
        """Decode the length of the heading part shared with the previous heading."""
        return self._ltPrefixLengths.decode()

    def decode_postfix_len(self):
        """Decode the length of the heading part following the shared prefix."""
        return self._ltPostfixLengths.decode()

    def read_reference1(self):
        return self.read_reference(self._huffman1Number)

    def read_reference2(self):
        return self.read_reference(self._huffman2Number)

    def read_reference(self, huffman_number):
        """Read an article reference sized by *huffman_number*.

        NOTE(review): when the leading 2-bit marker equals 3, the next
        32 bits are skipped and an empty string is returned instead of an
        int — callers apparently treat that as "no reference"; confirm
        before relying on the return type.
        """
        reference = ""
        code = self.bstr.read_bits(2)
        if code == 3:
            self.bstr.read_bits(32)
            return reference

        size = tools.bit_length(huffman_number)
        assert(size >= 2)
        # the two bits already read form the high bits of the reference
        return (code << (size - 2)) | self.bstr.read_bits(size - 2)

    def decode_heading(self, size):
        """Decode *size* heading characters via the headings Huffman table."""
        res = ""
        for i in range(size):
            sym_idx = self._ltHeadings.decode()
            sym = self._heading_symbols[sym_idx]
            assert(sym <= 0xffff)  # LingvoEngine:2EAB84E8
            res += int2unichr(sym)
        return res

    def decode_article(self, size):
        """Decode an article body of *size* characters (user/abbreviation dicts).

        Symbols below 0x10000 are literal characters.  Larger values are
        copy operations: sym >= 0x10040 copies ``sym - 0x1003d`` chars
        from the already-decoded text, otherwise ``sym - 0xfffd`` chars
        are copied from the shared prefix.
        """
        res = ""
        while len(res) < size:
            sym_idx = self._ltArticles.decode()
            sym = self._article_symbols[sym_idx]
            if sym >= 0x10000:
                if sym >= 0x10040:
                    start_idx = self.bstr.read_bits(tools.bit_length(size))
                    s = sym - 0x1003d
                    res += res[start_idx:start_idx + s]
                else:
                    prefix_idx = self.bstr.read_bits(tools.bit_length(len(self.prefix)))
                    s = sym - 0xfffd
                    res += self.prefix[prefix_idx:prefix_idx + s]
            else:
                res += int2unichr(sym)
        return res

    # need seek(bstr.header.dictionary_encoder_offset) before calling!
    def read(self):
        # stub: subclasses read the encoder tables from the stream
        return

    def dump(self):
        """Print a summary of the decoder state (after read())."""
        print("Decoder: %s" % self.__class__.__name__)
        if self.readed:
            print("  ArticleSymbols: %d" % len(self._article_symbols))
            print("  HeadingSymbols: %d" % len(self._heading_symbols))
            self._ltArticles.dump("Articles")
            self._ltHeadings.dump("Headings")
            self._ltPrefixLengths.dump("PrefixLengths")
            self._ltPostfixLengths.dump("PostfixLengths")
95 |
96 |
class UserDictionaryDecoder(Decoder):
    """Decoder for user dictionaries (Lingvo 11/12 and x5/x6 user formats)."""

    def __init__(self, bstr):
        Decoder.__init__(self, bstr)
        return

    def read(self):
        """Read the encoder block: prefix, alphabets, Huffman tables,
        reference size fields.  Field order mirrors the on-disk layout
        and must not change.
        """
        self.prefix = self.bstr.read_unicode(self.bstr.read_int())
        self._article_symbols = self.bstr.read_symbols()
        self._heading_symbols = self.bstr.read_symbols()
        self._ltArticles = LenTable(self.bstr)
        self._ltHeadings = LenTable(self.bstr)

        self._ltPrefixLengths = LenTable(self.bstr)
        self._ltPostfixLengths = LenTable(self.bstr)

        self._huffman1Number = self.bstr.read_bits(32)
        self._huffman2Number = self.bstr.read_bits(32)
        self._readed = True
        return
116 |
117 |
class SystemDictionaryDecoder13(Decoder):
    """Decoder for x3 system dictionaries (0x131001 / 0x132001)."""

    def __init__(self, bstr):
        Decoder.__init__(self, bstr)
        return

    def read(self):
        """Read the encoder block; field order mirrors the on-disk layout."""
        self.prefix = self.bstr.read_unicode(self.bstr.read_int())
        self._article_symbols = self.bstr.read_symbols()
        self._heading_symbols = self.bstr.read_symbols()
        self._ltArticles = LenTable(self.bstr)
        self._ltHeadings = LenTable(self.bstr)

        self._ltPrefixLengths = LenTable(self.bstr)
        self._ltPostfixLengths = LenTable(self.bstr)

        self._huffman1Number = self.bstr.read_bits(32)
        self._huffman2Number = self.bstr.read_bits(32)
        self._readed = True
        return

    def decode_article(self, size):
        """Decode an article body of *size* characters.

        Symbol meaning in this format:
          * sym <= 0x3F            -- copy ``sym + 3`` chars from the prefix;
          * 0x3F < sym <= 0x80     -- copy ``sym - 0x3d`` chars from the
            already decoded text (back-reference);
          * sym > 0x80             -- literal character ``sym - 0x80``.
        """
        res = ""
        while len(res) < size:
            sym_idx = self._ltArticles.decode()
            sym = self._article_symbols[sym_idx]
            if sym <= 0x80:
                if sym <= 0x3F:
                    start_pref_idx = self.bstr.read_bits(tools.bit_length(len(self.prefix)))
                    s = sym + 3
                    res += self.prefix[start_pref_idx:start_pref_idx + s]
                else:
                    start_idx = self.bstr.read_bits(tools.bit_length(size))
                    s = sym - 0x3d
                    res += res[start_idx:start_idx + s]
            else:
                res += int2unichr(sym - 0x80)
        return res
155 |
156 |
class SystemDictionaryDecoder14(SystemDictionaryDecoder13):
    """Decoder for x5 system dictionaries (version 0x141004).

    Differs from the x3 layout only in the table order: the postfix
    length table comes first, followed by a 32-bit field of unknown
    purpose, then the prefix length table.  Article decoding is inherited
    unchanged from SystemDictionaryDecoder13 (a commented-out verbatim
    copy of it was removed from this class).
    """

    def __init__(self, bstr):
        SystemDictionaryDecoder13.__init__(self, bstr)

    def read(self):
        """Read the encoder block; field order mirrors the on-disk layout."""
        prefix_len = self.bstr.read_int()
        self.prefix = self.bstr.read_unicode(prefix_len)
        self._article_symbols = self.bstr.read_symbols()
        self._heading_symbols = self.bstr.read_symbols()
        self._ltArticles = LenTable(self.bstr)
        self._ltHeadings = LenTable(self.bstr)

        self._ltPostfixLengths = LenTable(self.bstr)
        # 32 bits of unknown purpose between the two length tables
        self._dummy = self.bstr.read_bits(32)
        self._ltPrefixLengths = LenTable(self.bstr)

        self._huffman1Number = self.bstr.read_bits(32)
        self._huffman2Number = self.bstr.read_bits(32)
        self._readed = True
196 |
197 |
class AbbreviationDictionaryDecoder(Decoder):
    """Decoder for abbreviation dictionaries (XOR-obfuscated payload)."""

    def __init__(self, bstr):
        Decoder.__init__(self, bstr)

    def read(self):
        """Read the encoder block; symbol/prefix data are de-XORed on the fly.

        Field order mirrors the on-disk layout and must not change.
        """
        self.prefix = self.read_xored_prefix(self.bstr.read_int())
        self._article_symbols = self.read_xored_symbols()
        self._heading_symbols = self.read_xored_symbols()
        self._ltArticles = LenTable(self.bstr)
        self._ltHeadings = LenTable(self.bstr)

        self._ltPrefixLengths = LenTable(self.bstr)
        self._ltPostfixLengths = LenTable(self.bstr)

        self._huffman1Number = self.bstr.read_bits(32)
        self._huffman2Number = self.bstr.read_bits(32)
        self._readed = True

    def read_xored_symbols(self):
        """Read a symbol alphabet whose entries are XOR-masked with 0x1325."""
        count = self.bstr.read_bits(32)
        width = self.bstr.read_bits(8)
        return [self.bstr.read_bits(width) ^ 0x1325 for _ in range(count)]

    def read_xored_prefix(self, size):
        """Read *size* 16-bit units, each XOR-masked with 0x879A, as a string."""
        return "".join(int2unichr(self.bstr.read_bits(16) ^ 0x879A)
                       for _ in range(size))
231 |
232 |
class SystemDictionaryDecoder15(Decoder):
    """Decoder for x6 system dictionaries (version 0x151005).

    NOTE(review): read() duplicates SystemDictionaryDecoder14.read()
    except for using read_some(4) instead of read_int(), and
    decode_article() is a verbatim copy of SystemDictionaryDecoder13's —
    candidates for consolidation.
    """

    def __init__(self, bstr):
        Decoder.__init__(self, bstr)
        return

    def read(self):
        """Read the encoder block; field order mirrors the on-disk layout.

        The caller (LsdFile) de-XORs the encoder region before this runs.
        """
        # self.bstr = XoredBitStream(self.bstr)
        # self.decode()

        prefix_len = self.bstr.read_some(4)
        self.prefix = self.bstr.read_unicode(prefix_len)
        self._article_symbols = self.bstr.read_symbols()
        self._heading_symbols = self.bstr.read_symbols()
        self._ltArticles = LenTable(self.bstr)
        self._ltHeadings = LenTable(self.bstr)

        self._ltPostfixLengths = LenTable(self.bstr)
        # 32 bits of unknown purpose between the two length tables
        self._dummy = self.bstr.read_bits(32)
        self._ltPrefixLengths = LenTable(self.bstr)

        self._huffman1Number = self.bstr.read_bits(32)
        self._huffman2Number = self.bstr.read_bits(32)
        self._readed = True
        return

    def decode_article(self, size):
        """Decode an article body of *size* characters.

        Symbol meaning: sym <= 0x3F copies ``sym + 3`` chars from the
        prefix; 0x3F < sym <= 0x80 copies ``sym - 0x3d`` chars from the
        already decoded text; sym > 0x80 is literal char ``sym - 0x80``.
        """
        res = ""
        while len(res) < size:
            sym_idx = self._ltArticles.decode()
            sym = self._article_symbols[sym_idx]
            if sym <= 0x80:
                if sym <= 0x3F:
                    start_pref_idx = self.bstr.read_bits(tools.bit_length(len(self.prefix)))
                    s = sym + 3
                    res += self.prefix[start_pref_idx:start_pref_idx + s]
                else:
                    start_idx = self.bstr.read_bits(tools.bit_length(size))
                    s = sym - 0x3d
                    res += res[start_idx:start_idx + s]
            else:
                res += int2unichr(sym - 0x80)
        return res
275 |
--------------------------------------------------------------------------------
/lingvoreader/lentable.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from __future__ import (print_function)
4 | from . import tools
5 |
6 | __author__ = 'sv99'
7 |
8 |
class SymInfo:
    """Plain record describing one Huffman symbol: its alphabet index,
    code length and code value."""

    def __init__(self, sym_idx, size, code):
        self.code = code
        self.size = size
        self.sym_idx = sym_idx
14 |
15 |
class HuffmanNode:
    """Node of the Huffman decode tree built by LenTable.

    Child fields hold 0 for "free", a positive child-index-plus-one, or a
    negative leaf value encoding a symbol index.
    """

    def __init__(self, left, right, parent, weight):
        self.parent = parent
        self.weight = weight
        self.left = left
        self.right = right
22 |
23 |
class LenTable:
    """Huffman decode table reconstructed from (symbol index, code length)
    pairs stored in the bit stream.

    Node child encoding: 0 means "free slot", a positive value v points
    to child node ``v - 1``, and a negative value v is a leaf holding
    symbol index ``-1 - v``.
    """

    def __init__(self, bstr):
        self.bstr = bstr
        self._count = self.bstr.read_bits(32)          # number of symbols
        self._bits_per_len = self.bstr.read_bits(8)    # width of each stored code length
        self._idx_bit_size = tools.bit_length(self._count)

        self.symidx2nodeidx = [-1 for _ in range(self._count)]
        # a binary tree with _count leaves has _count - 1 internal nodes;
        # the root is the last slot, children are allocated from slot 0 up
        self.nodes = [HuffmanNode(0, 0, -1, -1) for _ in range(self._count - 1)]
        root_idx = len(self.nodes) - 1
        self.next_node_position = 0
        for i in range(self._count):
            symidx = self.bstr.read_bits(self._idx_bit_size)
            length = self.bstr.read_bits(self._bits_per_len)
            self.place_sym_idx(symidx, root_idx, length)

    def place_sym_idx(self, sym_idx, node_idx, size):
        """Insert *sym_idx* as a leaf *size* levels below *node_idx*.

        Depth-first search for the first free slot at the required depth;
        returns True on success, False if this subtree is full.
        """
        assert size > 0
        if size == 1:  # time to place
            if self.nodes[node_idx].left == 0:
                self.nodes[node_idx].left = -1 - sym_idx
                self.symidx2nodeidx[sym_idx] = node_idx
                return True

            if self.nodes[node_idx].right == 0:
                self.nodes[node_idx].right = -1 - sym_idx
                self.symidx2nodeidx[sym_idx] = node_idx
                return True

            return False

        # allocate a left child if the slot is still free
        if self.nodes[node_idx].left == 0:
            self.nodes[self.next_node_position] = HuffmanNode(0, 0, node_idx, -1)
            self.next_node_position += 1
            self.nodes[node_idx].left = self.next_node_position

        # recurse left if the left child is an internal node (not a leaf)
        if self.nodes[node_idx].left > 0:
            if self.place_sym_idx(sym_idx, self.nodes[node_idx].left - 1, size - 1):
                return True

        # allocate a right child if the slot is still free
        if self.nodes[node_idx].right == 0:
            self.nodes[self.next_node_position] = HuffmanNode(0, 0, node_idx, -1)
            self.next_node_position += 1
            self.nodes[node_idx].right = self.next_node_position

        # recurse right if the right child is an internal node
        if self.nodes[node_idx].right > 0:
            if self.place_sym_idx(sym_idx, self.nodes[node_idx].right - 1, size - 1):
                return True

        return False

    def decode(self):
        """Walk the tree bit-by-bit from the root and return the decoded
        symbol index."""
        node = self.nodes[-1]
        length = 0
        while True:
            length += 1
            bit = self.bstr.read_bit()
            if bit:  # right
                if node.right < 0:  # leaf
                    sym_idx = -1 - node.right
                    return sym_idx
                node = self.nodes[node.right - 1]
            else:  # left
                if node.left < 0:  # leaf
                    sym_idx = -1 - node.left
                    return sym_idx

                node = self.nodes[node.left - 1]

    def dump(self, name):
        """Print the table parameters under the label *name*."""
        print("LenTable: %s" % name)
        print("  Count: %s" % self._count)
        print("  bitsPerLen: %s" % self._bits_per_len)
        print("  IdxBitSize: %s" % self._idx_bit_size)
98 |
--------------------------------------------------------------------------------
/lingvoreader/lsdfile.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import unicode_literals, print_function, division, absolute_import
3 |
4 | import codecs
5 | import os
6 |
7 | from lingvoreader import LsdError
8 | from lingvoreader import tools, decoder
9 | from lingvoreader.articleheading import ArticleHeading, ArticleHeadingList
10 | from lingvoreader.bitstream import reverse32, reverse16, BitStream
11 |
12 | __author__ = 'sv99'
13 |
14 |
class OverlayReader:
    """Reader for the overlay (embedded files) section of an LSD file."""

    def __init__(self, bstr, offset):
        self.bstr = bstr
        # a failed seek (offset beyond the stream) means there is no
        # overlay section at all
        self._entriesCount = self.bstr.read_bits(4) if self.bstr.seek(offset) else 0

    def dump(self):
        """Print a short summary of the overlay section."""
        print("Overlay:")
        print("  EntriesCount: %s" % self._entriesCount)
27 |
28 |
class Header:
    """Fixed-layout header at the start of an LSD file.

    Fields are byte-swapped via reverse32/reverse16 because the stream
    reader is big-endian while the fields are stored little-endian.
    """

    def __init__(self, bstr):
        self.bstr = bstr
        # 8-byte NUL-padded magic, expected to be "LingVo"
        self.magic = self.bstr.read(8).decode().replace('\x00', '')
        self.version = reverse32(self.bstr.read_int())
        self.unk = reverse32(self.bstr.read_int())
        self.checksum = reverse32(self.bstr.read_int())
        self.entries_count = reverse32(self.bstr.read_int())
        self.annotation_offset = reverse32(self.bstr.read_int())
        self.dictionary_encoder_offset = reverse32(self.bstr.read_int())
        self.articles_offset = reverse32(self.bstr.read_int())
        self.pages_offset = reverse32(self.bstr.read_int())
        self.unk1 = reverse32(self.bstr.read_int())
        self.unk2 = reverse16(self.bstr.read_word())
        self.unk3 = reverse16(self.bstr.read_word())
        self.source_language = reverse16(self.bstr.read_word())
        self.target_language = reverse16(self.bstr.read_word())

    @property
    def hi_version(self):
        """Major format version: the high 16 bits of the version field."""
        return self.version >> 16

    def dump(self):
        """Print a human-readable summary of the header fields."""
        print("Header:")
        print("  Magic: %s" % self.magic)
        # fixed typo in the label: was "Checksume"
        print("  Checksum: %s" % hex(self.checksum))
        print("  Version: %s (%s)" % (hex(self.hi_version), hex(self.version)))
        print("  Entries: %d" % self.entries_count)
        print("  AnnotationOffset: %s" % hex(self.annotation_offset))
        print("  DictionaryEncoderOffset: %s" % hex(self.dictionary_encoder_offset))
        print("  ArticlesOffset: %s" % hex(self.articles_offset))
        print("  Pages start: %s" % hex(self.pages_offset))
        print("  Source language: %d %s" % (self.source_language, tools.lang_map[self.source_language]))
        print("  Target language: %d %s" % (self.target_language, tools.lang_map[self.target_language]))
64 |
65 |
class CachePage:
    """Header of one heading cache page (B-tree style node) in the LSD file."""

    def __init__(self, bstr):
        self.bstr = bstr
        # a single leading bit marks leaf pages
        self.is_leaf = bstr.read_bit()
        # five 16-bit links/counters follow, in on-disk order
        for attr in ("number", "prev", "parent", "next", "headings_count"):
            setattr(self, attr, bstr.read_bits(16))
        # page header is byte-aligned: skip the padding bits
        self.bstr.to_nearest_byte()
77 |
78 |
# 256-byte substitution pad used by LsdFile.xor_block_x6 to de-obfuscate
# the encoder region of x6 system dictionaries (version 0x151005).
xor_pad = (
    0x9C, 0xDF, 0x9B, 0xF3, 0xBE, 0x3A, 0x83, 0xD8,
    0xC9, 0xF5, 0x50, 0x98, 0x35, 0x4E, 0x7F, 0xBB,
    0x89, 0xC7, 0xE9, 0x6B, 0xC4, 0xC8, 0x4F, 0x85,
    0x1A, 0x10, 0x43, 0x66, 0x65, 0x57, 0x55, 0x54,
    0xB4, 0xFF, 0xD7, 0x17, 0x06, 0x31, 0xAC, 0x4B,
    0x42, 0x53, 0x5A, 0x46, 0xC5, 0xF8, 0xCA, 0x5E,
    0x18, 0x38, 0x5D, 0x91, 0xAA, 0xA5, 0x58, 0x23,
    0x67, 0xBF, 0x30, 0x3C, 0x8C, 0xCF, 0xD5, 0xA8,
    0x20, 0xEE, 0x0B, 0x8E, 0xA6, 0x5B, 0x49, 0x3F,
    0xC0, 0xF4, 0x13, 0x80, 0xCB, 0x7B, 0xA7, 0x1D,
    0x81, 0x8B, 0x01, 0xDD, 0xE3, 0x4C, 0x9A, 0xCE,
    0x40, 0x72, 0xDE, 0x0F, 0x26, 0xBD, 0x3B, 0xA3,
    0x05, 0x37, 0xE1, 0x5F, 0x9D, 0x1E, 0xCD, 0x69,
    0x6E, 0xAB, 0x6D, 0x6C, 0xC3, 0x71, 0x1F, 0xA9,
    0x84, 0x63, 0x45, 0x76, 0x25, 0x70, 0xD6, 0x8F,
    0xFD, 0x04, 0x2E, 0x2A, 0x22, 0xF0, 0xB8, 0xF2,
    0xB6, 0xD0, 0xDA, 0x62, 0x75, 0xB7, 0x77, 0x34,
    0xA2, 0x41, 0xB9, 0xB1, 0x74, 0xE4, 0x95, 0x1B,
    0x3E, 0xE7, 0x00, 0xBC, 0x93, 0x7A, 0xE8, 0x86,
    0x59, 0xA0, 0x92, 0x11, 0xF7, 0xFE, 0x03, 0x2F,
    0x28, 0xFA, 0x27, 0x02, 0xE5, 0x39, 0x21, 0x96,
    0x33, 0xD1, 0xB2, 0x7C, 0xB3, 0x73, 0xC6, 0xE6,
    0xA1, 0x52, 0xFB, 0xD4, 0x9E, 0xB0, 0xE2, 0x16,
    0x97, 0x08, 0xF6, 0x4A, 0x78, 0x29, 0x14, 0x12,
    0x4D, 0xC1, 0x99, 0xBA, 0x0D, 0x3D, 0xEF, 0x19,
    0xAF, 0xF9, 0x6F, 0x0A, 0x6A, 0x47, 0x36, 0x82,
    0x07, 0x9F, 0x7D, 0xA4, 0xEA, 0x44, 0x09, 0x5C,
    0x8D, 0xCC, 0x87, 0x88, 0x2D, 0x8A, 0xEB, 0x2C,
    0xB5, 0xE0, 0x32, 0xAD, 0xD3, 0x61, 0xAE, 0x15,
    0x60, 0xF1, 0x48, 0x0E, 0x7E, 0x94, 0x51, 0x0C,
    0xEC, 0xDB, 0xD2, 0x64, 0xDC, 0xFC, 0xC2, 0x56,
    0x24, 0xED, 0x2B, 0xD9, 0x1C, 0x68, 0x90, 0x79
)
113 |
114 |
class LsdFile:
    """ABBYY Lingvo .lsd dictionary file reader.

    Loads the whole file into a BitStream, parses the header, selects a
    version-specific decoder and, after parse(), holds the decoded
    (heading, article) pairs in self.dict ready to be written out as DSL.
    """

    def __init__(self, dict_file, verbose=False):
        # path of the source .lsd file; also used to derive output file names
        self.filename = dict_file
        self._readed = False
        self._parsed = False
        self.verbose = verbose
        # keep the whole file in memory as a mutable bit stream
        # (xor_block_x6 patches bytes in place for x6 system dictionaries)
        with open(dict_file, 'rb') as fp:
            self.bstr = BitStream(bytearray(fp.read()))

        self.overlay = None
        self.headings = ArticleHeadingList()
        self.dict = []
        self.header = Header(self.bstr)
        # check magic
        if self.header.magic != u'LingVo':
            raise LsdError('Allow only Lsd "LingVo" ident: %s' % repr(self.header.magic))

        # initialize decoder: the layout depends on the Lingvo version
        # encoded in the header (hi_version selects the generation,
        # the full version distinguishes user/system/abbreviation dicts)
        self.decoder = None
        hi_version = self.header.hi_version
        version = self.header.version
        if hi_version == 0x11:  # lingvo 11 dictionary: 0x11001
            self.decoder = decoder.UserDictionaryDecoder(self.bstr)
        elif hi_version == 0x12:  # lingvo 12 dictionary: 0x12001
            self.decoder = decoder.UserDictionaryDecoder(self.bstr)
        elif hi_version == 0x13:  # x3 dictionary: 0x131001 and 0x132001 if pages count > 1000
            self.decoder = decoder.SystemDictionaryDecoder13(self.bstr)
        elif hi_version == 0x14:  # x5 dictionary
            if version == 0x142001:  # user dictionaries
                self.decoder = decoder.UserDictionaryDecoder(self.bstr)
            elif version == 0x141004:  # system dictionaries
                self.decoder = decoder.SystemDictionaryDecoder14(self.bstr)
            elif version == 0x145001:  # abbreviation dictionaries
                self.decoder = decoder.AbbreviationDictionaryDecoder(self.bstr)
        elif hi_version == 0x15:  # x6 dictionary
            if version == 0x152001:  # user dictionaries
                self.decoder = decoder.UserDictionaryDecoder(self.bstr)
            elif version == 0x151005:  # system dictionaries
                # x6 system dictionaries are additionally xor-obfuscated;
                # the encoder block must be decoded before use
                self.xor_block_x6(self.header.dictionary_encoder_offset, self.header.articles_offset)
                self.decoder = decoder.SystemDictionaryDecoder14(self.bstr)
            elif version == 0x155001:  # abbreviation dictionaries
                self.decoder = decoder.AbbreviationDictionaryDecoder(self.bstr)

        if self.decoder is None:
            # dump whatever we managed to parse, then abort the process
            self.dump()
            print("Not supported dictionary version: %s" % hex(self.header.version))
            exit(1)
            # raise LsdError("Not supported dict version %s" % hex(self.header.version))

        # NOTE(review): read_some(1) -- sibling reads use read_byte();
        # presumably an equivalent single-byte read, confirm in bitstream
        name_len = self.bstr.read_some(1)
        self.name = self.bstr.read_unicode(name_len, False)
        self.first_heading = self.bstr.read_unicode(self.bstr.read_byte(), False)
        self.last_heading = self.bstr.read_unicode(self.bstr.read_byte(), False)
        # multi-byte header fields are stored big-endian, hence reverse32/16
        capitals_len = reverse32(self.bstr.read_int())
        self.capitals = self.bstr.read_unicode(capitals_len, False)
        # icon v12+
        if self.header.version > 0x120000:
            self.icon_size = reverse16(self.bstr.read_word())
            self.icon = self.bstr.read(self.icon_size)
        else:
            self.icon_size = 0
            self.icon = None

        if self.header.version > 0x140000:
            self.header_checksum = reverse32(self.bstr.read_int())
        else:
            self.header_checksum = 0

        if self.header.version > 0x120000:
            self.pages_end = reverse32(self.bstr.read_int())
            self.overlay_data = reverse32(self.bstr.read_int())
        else:
            self.pages_end = self.bstr.length
            self.overlay_data = self.bstr.length  # no overlay

        if self.header.version > 0x140000:
            self.dummy1 = reverse32(self.bstr.read_int())
            self.dummy2 = reverse32(self.bstr.read_int())
        else:
            self.dummy1 = 0
            self.dummy2 = 0

        # set bstr pos for decoding
        self.bstr.seek(self.header.dictionary_encoder_offset)

    # x6 system dictionary table-based xor decoding
    # each block is xored starting from key=0x7f
    # 1. dictionary_encoder_offset -> article_offset
    #    must be decoded before decoder.read()
    # 2. annotation_offset -> dictionary_encoder_offset
    #    annotation decoded in read_annotation
    # 3. each article is encoded individually:
    #    articles_offset + heading.reference -> articles_offset + heading.next_reference
    #    article decoded in read_article
    def xor_block_x6(self, start, end, key=0x7f):
        """De-obfuscate bytes [start, end) in place.

        Each byte is xored with the running key; the next key is taken
        from xor_pad indexed by the *original* (pre-xor) byte.
        Returns the final key value.
        """
        for i in range(start, end):
            byte = self.bstr.record[i]
            self.bstr.record[i] = byte ^ key
            key = xor_pad[byte]
        return key

    @property
    def pages_count(self):
        # cache pages are fixed 512-byte records
        return (self.pages_end - self.header.pages_offset) // 512

    def get_page_offset(self, page_number):
        """Return the absolute file offset of the given 512-byte cache page."""
        return self.header.pages_offset + 512 * page_number

    def read_headings(self):
        """Collect article headings from every leaf cache page."""
        for i in range(self.pages_count):
            self.read_heading_from_page(i)
        # set last next_reference
        self.headings[-1].next_reference = self.header.pages_offset - self.header.articles_offset

    def merge_headings(self):
        """Collapse consecutive headings that share one article reference.

        Currently unused (the call in parse() is commented out); kept for
        multi-title article support.
        """
        res = []
        # fill next_reference in the headings
        prev = self.headings[0]
        res.append(prev)
        for i in range(1, len(self.headings)):
            h = self.headings[i]
            if prev.reference == h.reference:
                # multititle article
                prev.merge(h)
            else:
                res[-1].next_reference = h.reference
                res.append(h)
                prev = h
        # headings[i].next_reference = headings[i+1].reference
        # set next_reference for last item to the pages_offset
        res[-1].next_reference = self.header.pages_offset - self.header.articles_offset
        return res

    def read_heading_from_page(self, page_number):
        """Append all headings stored on one cache page (leaf pages only)."""
        self.bstr.seek(self.get_page_offset(page_number))
        page = CachePage(self.bstr)
        if page.is_leaf:
            # headings on a page are prefix-compressed: each one returns
            # the prefix the next heading continues from
            prefix = ""
            for idx in range(page.headings_count):
                heading = ArticleHeading()
                prefix = heading.read(self.decoder, self.bstr, prefix)
                self.headings.append(heading)

    def read_article(self, heading):
        """Decode and return the article body for *heading*."""
        self.bstr.seek(self.header.articles_offset + heading.reference)
        if self.header.version == 0x151005:
            # xor article (x6 system dictionaries obfuscate each article
            # individually, see xor_block_x6)
            self.xor_block_x6(self.header.articles_offset + heading.reference,
                              self.header.articles_offset + heading.next_reference)
        # 16-bit size, with 0xFFFF escaping to a full 32-bit size
        size = self.bstr.read_bits(16)
        if size == 0xFFFF:
            size = self.bstr.read_bits(32)

        res = self.decoder.decode_article(size)
        # assert(res)
        return res

    def read_annotation(self):
        """Decode and return the dictionary annotation text ("" if none)."""
        if self.header.version == 0x151005:
            # xor annotation
            self.xor_block_x6(self.header.annotation_offset,
                              self.header.dictionary_encoder_offset)
        res = ""
        if self.bstr.seek(self.header.annotation_offset):
            size = self.bstr.read_bits(16)
            res = self.decoder.decode_article(size)
        return res

    @property
    def readed(self):
        # True once the decoder tables have been read (see read())
        return self._readed

    def read(self):
        """Read the decoder tables; required before decoding any article."""
        if self.verbose:
            print("reading dictionary..")
        self.decoder.read()
        self._readed = True

    @property
    def parsed(self):
        # True once all headings and articles have been decoded (see parse())
        return self._parsed

    def parse(self):
        """Decode overlay, headings and all articles into self.dict."""
        if not self.readed:
            self.read()
        if self.verbose:
            print("decoding overlay..")
        self.overlay = OverlayReader(self.bstr, self.overlay_data)

        if self.verbose:
            print("decoding headings: %d" % self.header.entries_count)
        self.read_headings()
        if self.headings.appended != self.header.entries_count:
            raise LsdError("Decoded not all entries %d != %d" % (self.headings.appended, self.header.entries_count))
        # merge multititle headings
        # self.headings = self.merge_headings()

        if self.verbose:
            print("decoding articles: %d" % len(self.headings))
        for h in self.headings:
            # h.dump()
            self.dict.append((h, self.read_article(h)))
        self._parsed = True
        if self.verbose:
            print("OK")

    def write(self, path=""):
        """ save decoded dictionary """
        if not self.parsed:
            self.parse()
        self.write_icon(path)
        self.write_annotation(path)
        self.write_overlay(path)
        self.write_dsl(path)
        if self.verbose:
            self.write_prefix(path)

    def make_filename(self, path, ext):
        """Build an output name next to the .lsd file (or under *path*)."""
        base, orig_ext = os.path.splitext(self.filename)
        if path != "":
            base = os.path.join(path, os.path.basename(base))
        return base + '.' + ext

    def write_icon(self, path=""):
        """Write the embedded icon as a .bmp file (no-op if absent)."""
        if self.icon_size == 0:
            return
        ico_file = self.make_filename(path, "bmp")
        with open(ico_file, 'wb') as ico:
            ico.write(self.icon)
        if self.verbose:
            print('Write icon: %s' % ico_file)

    def write_annotation(self, path=""):
        """Write the annotation as a UTF-16 .ann file (no-op if empty)."""
        annotation = self.read_annotation()
        if annotation == "":
            return
        ann_file = self.make_filename(path, "ann")
        with codecs.open(ann_file, 'w', encoding='utf-16', errors='surrogatepass') as ann:
            ann.write(annotation)
        if self.verbose:
            print('Write annotation: %s' % ann_file)

    def write_prefix(self, path=""):
        """Write the decoder prefix as a UTF-8 .pref file (debug aid)."""
        if self.decoder.prefix == "":
            return
        pref_file = self.make_filename(path, "pref")
        with codecs.open(pref_file, 'w', encoding='utf-8') as pref:
            pref.write(self.decoder.prefix)
        if self.verbose:
            print('Write prefix: %s' % pref_file)

    def write_overlay(self, path=""):
        # overlay extraction is not implemented yet
        pass

    @staticmethod
    def normalize_article(article):
        # DSL article bodies must have every line indented with a tab
        res = article.replace(u'\n', u'\n\t')
        return res

    def write_dsl(self, path=""):
        """Write the decoded dictionary as a UTF-16 .dsl file."""
        if len(self.dict) == 0:
            print("Nothing writing to dsl!")
            return
        dsl_file = self.make_filename(path, "dsl")
        with codecs.open(dsl_file, 'w', encoding='utf-16', errors='surrogatepass') as dsl:
            dsl.write(u"#NAME\t\"" + self.name + u"\"\n")
            dsl.write(u"#INDEX_LANGUAGE\t\"" + tools.lang_map[self.header.source_language] + u"\"\n")
            dsl.write(u"#CONTENTS_LANGUAGE\t\"" + tools.lang_map[self.header.target_language] + u"\"\n")
            if self.icon_size > 0:
                base, orig_ext = os.path.splitext(os.path.basename(self.filename))
                dsl.write(u"#ICON_FILE\t\"" + base + '.' + "bmp" + u"\"\n")
            dsl.write(u"\n")
            for h, r in self.dict:
                if h.simple:
                    dsl.write(h.get_first_ext_text())
                    dsl.write(u"\n\t")
                else:
                    # multi-title article: emit every heading, then the body
                    for item in h.headings:
                        dsl.write(item.ext_text)
                        dsl.write(u"\n")
                    dsl.write(u"\t")
                dsl.write(self.normalize_article(r))
                dsl.write(u"\n")
        if self.verbose:
            print('Write dsl: %s' % dsl_file)

    def dump(self):
        """Print parsed metadata; safe to call for unsupported versions."""
        self.header.dump()
        # dump header for not supported versions
        if self.decoder is not None:
            print("Name: %s" % self.name)
            print("First heading: %s" % self.first_heading)
            print("Last heading: %s" % self.last_heading)
            print("Capitals: %s" % self.capitals)
            print("Pages end: %s" % hex(self.pages_end))
            print("Overlay data: %s" % hex(self.overlay_data))
            print("Pages count: %d" % ((self.pages_end - self.header.pages_offset) // 512))
            if self.header.version > 0x140000:
                print("dummy1: %s" % hex(self.dummy1))
                print("dummy2: %s" % hex(self.dummy2))
            print("Icon enable: %s" % (self.icon_size > 0))
            if self.readed:
                self.decoder.dump()
                self.overlay.dump()
420 |
--------------------------------------------------------------------------------
/lingvoreader/lsdreader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from __future__ import (division, absolute_import, print_function, with_statement)
4 |
5 | import argparse
6 | import codecs
7 | import os
8 | import sys
9 | from timeit import default_timer as timer
10 |
11 | from lingvoreader import __version__
12 | from lingvoreader import LsdError
13 | from lingvoreader import tools
14 | from lingvoreader.lsdfile import LsdFile
15 |
16 | __author__ = 'sv99'
17 |
18 | #
19 | # lsd decoder - based on source from tr7 user from ru-board
20 | # http://forum.ru-board.com/profile.cgi?action=show&member=tr7
21 | # ru-board forum - "Lingvo dictionary"
22 | # http://forum.ru-board.com/topic.cgi?forum=93&topic=3774&glp#lt
23 | # current version on the github https://github.com/nongeneric/lsd2dsl
24 | #
25 | # Worked with Lingvo x5 dictionary, other version not supported
26 | # v 0.1 - 16.09.2013
27 | #
28 | # v 0.2.0 - 22.11.2015
29 | # add support lingvo 10, 12, x3 and x6 dictionary format
30 | #
31 | # v 0.2.9 - 28.12.2018
32 | # add python3 support
33 | #
# Python 2 on Windows cannot print non-ASCII dictionary names to the
# console; wrap stdout in a UTF-8 writer there (no-op on Python 3).
if sys.platform.startswith("win") and (sys.version_info < (3, 0)):
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
36 |
37 |
def unpack(dicts, dest_dir, verbose):
    """Decode every .lsd file in *dicts*, writing results into *dest_dir*.

    Prints per-file progress and timing. Returns 0 on success, 1 as soon
    as one dictionary fails to decode.
    """
    total = len(dicts)
    if total == 1:
        print("Unpacking dict: %s" % dicts[0])
    for index, dict_file in enumerate(dicts):
        start = timer()
        try:
            if total > 1:
                print("Unpacking dict (%d from %d): %s" % (index + 1, total, dict_file))
            lsd = LsdFile(dict_file, verbose)
            lsd.parse()
            lsd.dump()
            lsd.write(dest_dir)
        except ValueError as e:
            print("Error: %s" % e)
            return 1
        end = timer()
        print("Unpack OK (%s)" % tools.display_time(end - start))

    return 0
63 |
64 |
def header(dicts):
    """Print the parsed header of every dictionary in *dicts*.

    Returns 0 on success, 1 as soon as one file fails to parse.
    """
    total = len(dicts)
    if total == 1:
        print("Unpacking dict: %s" % dicts[0])
    for index, dict_file in enumerate(dicts):
        try:
            if total > 1:
                print("Unpacking dict (%d from %d): %s" % (index + 1, total, dict_file))
            lsd = LsdFile(dict_file, True)
            lsd.dump()
        except ValueError as e:
            print("Error: %s" % e)
            return 1

    return 0
86 |
87 |
def get_dicts(path=None):
    """Return the names of all ``.lsd`` files in a directory.

    :param path: directory to scan; defaults to the current working
        directory, preserving the original parameterless behaviour.
    :return: list of file names (not full paths), in ``os.listdir`` order.
    """
    directory = os.getcwd() if path is None else path
    # comprehension instead of the original append loop
    return [name for name in os.listdir(directory) if name.endswith(".lsd")]
95 |
96 |
class CodecsAction(argparse.Action):
    """argparse action for ``-c/--codecs``: print the language table and exit.

    Modelled on the built-in --help/--version actions: consumes no
    argument value (nargs=0) and terminates parsing as a side effect.
    """

    def __init__(self,
                 option_strings,
                 dest=argparse.SUPPRESS,
                 default=argparse.SUPPRESS,
                 help="print supported languages and their codes"):
        super(CodecsAction, self).__init__(
            option_strings=option_strings,
            dest=dest,
            default=default,
            nargs=0,
            help=help)

    def __call__(self, parser, namespace, values, option_string=None):
        # tools.print_codecs() prints the table itself and returns None,
        # so parser.exit() receives message=None and simply terminates
        parser.exit(message=tools.print_codecs())
112 |
113 |
def get_arg_parser():
    """Build the command line parser for lsdreader."""
    parser = argparse.ArgumentParser(description='Decode Lingvo lsd dictionary to dsl')
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("-i", "--input", help='Dictionary to decode')
    source.add_argument("-a", "--all", action="store_true", help='All dictionary in current directory')
    parser.add_argument("--header", action="store_true", default=False, help='Print dictionary header and exit')
    parser.add_argument("-o", "--outdir", default="", help="Output directory")
    parser.add_argument("-c", "--codecs", action=CodecsAction)
    parser.add_argument("-v", "--verbose", action="store_true", default=False)
    parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)
    return parser
125 |
126 |
def main():
    """Command line entry point: decode one or all .lsd dictionaries."""
    args = get_arg_parser().parse_args()
    if args.all:
        # all lsd in directory
        print("Decode all lsd in current directory..")
        dicts = get_dicts()
        print(dicts)
    else:
        dicts = [args.input]

    if args.header:
        header(dicts)
    else:
        if args.outdir != "":
            # create the output directory on demand
            if not os.path.exists(args.outdir):
                os.mkdir(args.outdir)

        start = timer()
        unpack(dicts, args.outdir, args.verbose)
        end = timer()
        if len(dicts) > 1:
            print("Elapsed: %s" % tools.display_time(end - start))

    return 0
154 |
155 |
# script entry point: propagate main()'s return code to the shell
if __name__ == '__main__':
    sys.exit(main())
158 |
--------------------------------------------------------------------------------
/lingvoreader/tools.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from __future__ import (print_function)
4 | from builtins import chr as text
5 |
6 | __author__ = 'sv99'
7 |
8 |
# Windows LCID -> DSL language name, as written into the #INDEX_LANGUAGE /
# #CONTENTS_LANGUAGE headers of the generated .dsl file. Entries marked
# "x6" appear only in newer Lingvo versions; "_Legacy" names keep the
# older ids resolvable alongside their replacements.
lang_map = {
    1555: u"Abazin",
    1556: u"Abkhaz",
    1557: u"Adyghe",
    1078: u"Afrikaans",
    1559: u"Agul",
    1052: u"Albanian",
    1545: u"Altaic",
    1025: u"Arabic",  # x5 tested support
    # 1025: u"ArabicSaudiArabia",
    5121: u"ArabicAlgeria",
    15361: u"ArabicBahrain",
    3073: u"ArabicEgypt",
    2049: u"ArabicIraq",
    11265: u"ArabicJordan",
    13313: u"ArabicKuwait",
    12289: u"ArabicLebanon",
    4097: u"ArabicLibya",
    6145: u"ArabicMorocco",
    8193: u"ArabicOman",
    16385: u"ArabicQatar",
    10241: u"ArabicSyria",
    7169: u"ArabicTunisia",
    14337: u"ArabicUAE",
    9217: u"ArabicYemen",
    1067: u"Armenian",  # x5 tested support
    # 1067: u"ArmenianEastern",
    33835: u"ArmenianGrabar",
    32811: u"ArmenianWestern",
    1101: u"Assamese",
    1558: u"Awar",
    1560: u"Aymara",
    2092: u"AzeriCyrillic",
    1068: u"AzeriLatin",
    1561: u"Bashkir",
    1069: u"Basque",
    1059: u"Belarusian",
    # 1059: u"Byelorussian",
    1562: u"Bemba",
    1093: u"Bengali",
    1563: u"Blackfoot",
    1536: u"Breton",
    1564: u"Bugotu",
    1026: u"Bulgarian",
    1109: u"Burmese",
    1565: u"Buryat",
    1027: u"Catalan",
    1566: u"Chamorro",
    1544: u"Chechen",
    1028: u"Chinese",  # x5 tested support
    # 1028: u"ChineseTaiwan",
    3076: u"ChineseHongKong",
    5124: u"ChineseMacau",
    2052: u"ChinesePRC",
    4100: u"ChineseSingapore",
    1567: u"Chukcha",
    1568: u"Chuvash",
    1569: u"Corsican",
    1546: u"CrimeanTatar",
    1050: u"Croatian",
    1570: u"Crow",
    1029: u"Czech",
    1030: u"Danish",
    1572: u"Dungan",
    1043: u"Dutch",  # x5 tested
    2067: u"DutchBelgian",  # not supported
    1033: u"English",  # 1033: u"EnglishUnitedStates",
    3081: u"EnglishAustralian",
    10249: u"EnglishBelize",
    4105: u"EnglishCanadian",
    9225: u"EnglishCaribbean",
    6153: u"EnglishIreland",
    8201: u"EnglishJamaica",
    35849: u"EnglishLaw",
    33801: u"EnglishMedical",
    5129: u"EnglishNewZealand",
    13321: u"EnglishPhilippines",
    34825: u"EnglishProperNames",
    7177: u"EnglishSouthAfrica",
    11273: u"EnglishTrinidad",
    2057: u"EnglishUnitedKingdom",
    12297: u"EnglishZimbabwe",
    1573: u"EskimoCyrillic",
    1537: u"Esperanto",
    1061: u"Estonian",
    1574: u"Even",
    1575: u"Evenki",
    1065: u"Farsi",
    1538: u"Fijian",
    1035: u"Finnish",
    1036: u"French",  # x5 supported
    2060: u"FrenchBelgian",
    3084: u"FrenchCanadian",
    5132: u"FrenchLuxembourg",
    6156: u"FrenchMonaco",
    33804: u"FrenchProperNames",
    4108: u"FrenchSwiss",
    1122: u"Frisian",
    1576: u"Frisian_Legacy",  # x6
    1577: u"Friulian",
    1084: u"GaelicScottish",
    1578: u"Gagauz",
    1110: u"Galician",
    1579: u"Galician_Legacy",  # x6
    1580: u"Ganda",
    1079: u"Georgian",
    1031: u"German",
    3079: u"GermanAustrian",
    34823: u"GermanLaw",
    5127: u"GermanLiechtenstein",
    4103: u"GermanLuxembourg",
    36871: u"GermanMedical",
    32775: u"GermanNewSpelling",
    35847: u"GermanNewSpellingLaw",
    37895: u"GermanNewSpellingMedical",
    39943: u"GermanNewSpellingProperNames",
    38919: u"GermanProperNames",
    2055: u"GermanSwiss",
    1032: u"Greek",
    32776: u"GreekKathareusa",
    1140: u"Guarani",
    1582: u"Guarani_Legacy",  # x6
    1095: u"Gujarati",
    1583: u"Hani",
    1128: u"Hausa",  # x6
    1652: u"Hausa_Legacy",
    1141: u"Hawaiian",  # x6
    1539: u"Hawaiian_Legacy",
    1037: u"Hebrew",
    1081: u"Hindi",
    1038: u"Hungarian",
    1039: u"Icelandic",
    1584: u"Ido",
    1057: u"Indonesian",
    1585: u"Ingush",
    1586: u"Interlingua",
    2108: u"Irish",  # x6
    1552: u"Irish_Legacy",  # x6
    # 2108: u"Gaelic", # x6
    # 1552: u"Gaelic_Legacy", # x6
    1040: u"Italian",  # x5 tested
    33808: u"ItalianProperNames",
    2064: u"ItalianSwiss",
    1041: u"Japanese",
    1548: u"Kabardian",
    1587: u"Kalmyk",
    1099: u"Kannada",
    1589: u"KarachayBalkar",
    1588: u"Karakalpak",
    1120: u"Kashmiri",
    2144: u"KashmiriIndia",
    1590: u"Kasub",
    1591: u"Kawa",
    1087: u"Kazakh",
    1592: u"Khakas",
    1593: u"Khanty",
    1107: u"Khmer",
    1594: u"Kikuyu",
    1595: u"Kirgiz",
    1597: u"KomiPermian",
    1596: u"KomiZyryan",
    1598: u"Kongo",
    1111: u"Konkani",
    1042: u"Korean",
    2066: u"KoreanJohab",
    1599: u"Koryak",
    1600: u"Kpelle",
    1601: u"Kumyk",
    1602: u"Kurdish",
    1603: u"KurdishCyrillic",
    1604: u"Lak",
    1108: u"Lao",
    1142: u"Latin",  # x6
    1540: u"Latin_Legacy",
    1062: u"Latvian",
    1655: u"LatvianGothic",
    1605: u"Lezgin",
    1063: u"Lithuanian",
    2087: u"LithuanianClassic",
    1606: u"Luba",
    1071: u"Macedonian",
    1607: u"Malagasy",
    1086: u"Malay",
    2110: u"MalayBruneiDarussalam",
    1100: u"Malayalam",
    1608: u"Malinke",
    1082: u"Maltese",
    1112: u"Manipuri",
    1609: u"Mansi",
    1153: u"Maori",  # x6
    1102: u"Marathi",
    1610: u"Mari",
    1611: u"Maya",
    1612: u"Miao",
    1613: u"Minankabaw",
    1614: u"Mohawk",
    1104: u"Mongol",
    1615: u"Mordvin",
    1616: u"Nahuatl",
    1617: u"Nanai",
    1618: u"Nenets",
    1121: u"Nepali",
    2145: u"NepaliIndia",
    1619: u"Nivkh",
    1620: u"Nogay",
    1044: u"Norwegian",
    # 1044: u"NorwegianBokmal",
    2068: u"NorwegianNynorsk",
    1621: u"Nyanja",
    1622: u"Occidental",
    1623: u"Ojibway",
    32777: u"OldEnglish",
    32780: u"OldFrench",
    33799: u"OldGerman",
    32784: u"OldItalian",
    1657: u"OldSlavonic",  # x6
    32778: u"OldSpanish",
    1096: u"Oriya",
    1547: u"Ossetic",
    1145: u"Papiamento",  # x6
    1624: u"Papiamento_Legacy",
    1625: u"PidginEnglish",
    1654: u"Pinyin",
    1045: u"Polish",
    1046: u"Portuguese",  # not supported
    # 1046: u"PortugueseBrazilian",
    2070: u"PortugueseStandard",  # x5 supported
    1541: u"Provencal",
    1094: u"Punjabi",
    1131: u"Quechua",  # x6
    # 1131: u"QuechuaBolivia", # x6
    2155: u"QuechuaEcuador",  # x6
    3179: u"QuechuaPeru",  # x6
    1626: u"Quechua_Legacy",
    1047: u"RhaetoRomanic",
    1048: u"Romanian",
    2072: u"RomanianMoldavia",
    1627: u"Romany",
    1628: u"Ruanda",
    1629: u"Rundi",
    1049: u"Russian",
    2073: u"RussianMoldavia",
    32793: u"RussianOldSpelling",
    34841: u"RussianOldOrtho",  # x6
    33817: u"RussianProperNames",
    1083: u"Saami",
    1542: u"Samoan",
    1103: u"Sanskrit",
    1630: u"Selkup",
    3098: u"SerbianCyrillic",
    2074: u"SerbianLatin",
    1631: u"Shona",
    1113: u"Sindhi",
    1051: u"Slovak",
    1060: u"Slovenian",
    1143: u"Somali",  # x6
    1633: u"Somali_Legacy",
    1070: u"Sorbian",  # not supported
    1634: u"Sotho",
    # 1034: u"Spanish", # not supported
    1034: u"SpanishTraditionalSort",  # x5 tested
    11274: u"SpanishArgentina",
    16394: u"SpanishBolivia",
    13322: u"SpanishChile",
    9226: u"SpanishColombia",
    5130: u"SpanishCostaRica",
    7178: u"SpanishDominicanRepublic",
    12298: u"SpanishEcuador",
    17418: u"SpanishElSalvador",
    4106: u"SpanishGuatemala",
    18442: u"SpanishHonduras",
    2058: u"SpanishMexican",
    3082: u"SpanishModernSort",
    19466: u"SpanishNicaragua",
    6154: u"SpanishPanama",
    15370: u"SpanishParaguay",
    10250: u"SpanishPeru",
    33802: u"SpanishProperNames",
    20490: u"SpanishPuertoRico",
    14346: u"SpanishUruguay",
    8202: u"SpanishVenezuela",
    1635: u"Sunda",
    1072: u"Sutu",
    1089: u"Swahili",
    1636: u"Swazi",
    1053: u"Swedish",
    2077: u"SwedishFinland",
    1637: u"Tabassaran",
    1553: u"Tagalog",
    1639: u"Tahitian",
    1064: u"Tajik",  # x6
    1638: u"Tajik_Legacy",
    1097: u"Tamil",
    1092: u"Tatar",
    1098: u"Telugu",
    1054: u"Thai",
    1105: u"Tibet",
    1641: u"Tongan",
    1073: u"Tsonga",
    1074: u"Tswana",
    # 1074, u"Chuana",
    1642: u"Tun",
    1055: u"Turkish",
    1090: u"Turkmen",  # x6
    1643: u"Turkmen_Legacy",
    1644: u"Tuvin",
    1645: u"Udmurt",
    # 1646: u"Uighur", # not supported
    1646: u"UighurCyrillic",  # not supported
    1647: u"UighurLatin",
    1058: u"Ukrainian",
    1653: u"Universal",
    2080: u"UrduIndia",
    1056: u"UrduPakistan",
    1554: u"User",
    2115: u"UzbekCyrillic",
    1091: u"UzbekLatin",
    1075: u"Venda",
    1066: u"Vietnamese",
    1648: u"Visayan",
    1106: u"Welsh",  # x6
    1543: u"Welsh_Legacy",
    1160: u"Wolof",  # x6
    1649: u"Wolof_Legacy",
    1076: u"Xhosa",
    1157: u"Yakut",  # x6
    1650: u"Yakut_Legacy",
    1085: u"Yiddish",
    1651: u"Zapotec",
    1077: u"Zulu",
}
340 |
341 |
def int2unichr(value):
    """Return the unicode character for code point *value*.

    ``builtins.chr`` (imported at module top as ``text``) already behaves
    like py2's ``unichr`` via the ``future`` package, so the old runtime
    version check (previously commented out here) is unnecessary.
    """
    return text(value)
349 |
350 |
def print_codecs():
    """Print the LCID -> language table, ordered by language name."""
    for lcid, name in sorted(lang_map.items(), key=lambda item: item[1]):
        print("%6d:\t%s" % (lcid, name))
355 |
356 |
def bit_length(num):
    """Return the number of bits needed to represent *num*.

    Keeps the original convention that ``bit_length(0) == 1`` (zero still
    occupies one bit), but delegates to ``int.bit_length()`` instead of the
    hand-rolled shift loop — which also fixes the original's infinite loop
    on negative input (``-1 >> 1 == -1``).
    """
    return max(1, int(num).bit_length())
364 |
365 |
def display_time(sec):
    """Format a duration in seconds as ``"H hour(s) M min S.SS sec"``.

    Hour and minute parts are omitted when zero; seconds always carry two
    decimals. Fixes the original printing a float hour count ("1.0 hours")
    when *sec* is a float, replaces the ``rstrip('s')`` singularisation
    hack with an explicit choice, and no longer shadows the builtin ``min``.
    """
    parts = []
    hours = int(sec // 3600)
    if hours:
        parts.append("{} {}".format(hours, "hour" if hours == 1 else "hours"))
        sec -= 3600 * hours
    minutes = int(sec // 60)
    if minutes:
        parts.append("{} min".format(minutes))
        sec -= 60 * minutes
    parts.append("{0:0.2f} sec".format(sec))
    return " ".join(parts)
384 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import codecs
4 | import os
5 | import re
6 |
7 | from setuptools import setup, find_packages
8 |
9 |
# absolute project root, so read() works regardless of the cwd at build time
here = os.path.abspath(os.path.dirname(__file__))
11 |
12 |
def read(*parts):
    """Return the contents of the file at ``here/<parts...>``.

    Intentionally *not* adding an encoding option to open, see:
    https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690

    The original leaked the file handle; the ``with`` block closes it.
    """
    with codecs.open(os.path.join(here, *parts), 'r') as fp:
        return fp.read()
17 |
18 |
def find_version(*file_paths):
    """Extract ``__version__`` from the file at *file_paths* (under ``here``)."""
    contents = read(*file_paths)
    match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                      contents, re.M)
    if match:
        return match.group(1)
    raise RuntimeError("Unable to find version string.")
26 |
27 |
# package metadata; the version is read out of lingvoreader/__init__.py
# so it is defined in exactly one place
setup(
    name="lingvoreader",
    version=find_version("lingvoreader", "__init__.py"),
    author='sv99',
    author_email='sv99@inbox.ru',
    url='https://github.com/sv99/lsdreader',
    description='Linvo 11, 12, X3, X5 and X6 lsd reader utilities',
    long_description=read('README.rst'),
    classifiers=[
        'Development Status :: 4 - Beta',
        'Topic :: Education',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
    ],
    packages=find_packages(),
    platforms='any',
    install_requires=[
        # the "future" backport supplies builtins.chr etc. on Python 2 only
        'future; python_version == "2.7"',
    ],
    zip_safe=True,
    entry_points={
        'console_scripts': [
            'lsdreader = lingvoreader.lsdreader:main',
        ]
    }
)
54 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | __author__ = 'sv99'
4 |
--------------------------------------------------------------------------------
/test/test_arg_parser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from unittest import TestCase
4 | from lingvoreader import lsdreader
5 | import sys
6 | from io import BytesIO
7 |
8 | __author__ = 'sv99'
9 |
10 |
class ArgumentParserError(Exception):
    """Raised in place of SystemExit when argparse aborts during a test.

    Carries the captured stdout/stderr text and the exit code so tests
    can assert on the error output.
    """

    def __init__(self, message, stdout=None, stderr=None, error_code=None):
        super(ArgumentParserError, self).__init__(message, stdout, stderr)
        self.message = message
        self.stdout = stdout
        self.stderr = stderr
        self.error_code = error_code
19 |
20 |
# In-memory stand-in for sys.stdout/stderr while parse_args runs; the
# distinct subclass lets stderr_to_parser_error detect re-entrancy via
# isinstance. NOTE(review): Python 3 argparse writes str, which BytesIO
# rejects — this helper presumably predates the py3 port; confirm and
# consider io.StringIO on Python 3.
class StdIOBuffer(BytesIO):
    pass
23 |
24 |
def stderr_to_parser_error(parse_args, *args, **kwargs):
    """Call *parse_args* with stdout/stderr captured.

    Converts a SystemExit raised by argparse into an ArgumentParserError
    carrying the captured stream contents and exit code, so tests can
    assert on the error text instead of the process exiting.
    """
    # if this is being called recursively and stderr or stdout is already being
    # redirected, simply call the function and let the enclosing function
    # catch the exception
    if isinstance(sys.stderr, StdIOBuffer) or isinstance(sys.stdout, StdIOBuffer):
        return parse_args(*args, **kwargs)

    # if this is not being called recursively, redirect stderr and
    # use it as the ArgumentParserError message
    old_stdout = sys.stdout
    old_stderr = sys.stderr
    sys.stdout = StdIOBuffer()
    sys.stderr = StdIOBuffer()
    try:
        try:
            result = parse_args(*args, **kwargs)
            # argparse may have stored the temporary buffers in the result
            # namespace; point such attributes back at the real streams
            # before the buffers are discarded
            for key in list(vars(result)):
                if getattr(result, key) is sys.stdout:
                    setattr(result, key, old_stdout)
                if getattr(result, key) is sys.stderr:
                    setattr(result, key, old_stderr)
            return result
        except SystemExit:
            # capture what argparse wrote before re-raising as our error
            code = sys.exc_info()[1].code
            stdout = sys.stdout.getvalue()
            stderr = sys.stderr.getvalue()
            raise ArgumentParserError("SystemExit", stdout, stderr, code)
    finally:
        # always restore the real streams, even on unexpected exceptions
        sys.stdout = old_stdout
        sys.stderr = old_stderr
55 |
56 |
class TestArgParser(TestCase):
    """Behavioural tests for lsdreader.get_arg_parser()."""

    def setUp(self):
        self.parser = lsdreader.get_arg_parser()

    def test_empty(self):
        # -i/-a form a required mutually exclusive group
        self.assertRaisesRegexp(ArgumentParserError,
                                'one of the arguments -i/--input -a/--all is required',
                                stderr_to_parser_error, self.parser.parse_args, [])

    def test_codecs(self):
        # -c prints the codec table and exits via parser.exit()
        self.assertRaisesRegexp(ArgumentParserError,
                                "SystemExit",
                                stderr_to_parser_error, self.parser.parse_args, ['-c', ])

    def test_input(self):
        args = self.parser.parse_args(['-i', 'test'])
        self.assertEqual(args.input, 'test')
        self.assertFalse(args.verbose)

    def test_input_verbose(self):
        args = self.parser.parse_args(['-i', 'test', '-v'])
        self.assertEqual(args.input, 'test')
        self.assertTrue(args.verbose)
        self.assertEqual(args.outdir, '')

    def test_all(self):
        args = self.parser.parse_args(['-a'])
        self.assertTrue(args.all)
        self.assertFalse(args.verbose)
        self.assertEqual(args.outdir, '')

    def test_all_verbose(self):
        args = self.parser.parse_args(['-a', '-v'])
        self.assertTrue(args.all)
        self.assertTrue(args.verbose)
        self.assertEqual(args.outdir, '')

    def test_outdir(self):
        args = self.parser.parse_args(['-a', '-o', 'test'])
        self.assertTrue(args.all)
        self.assertFalse(args.verbose)
        self.assertEqual(args.outdir, 'test')
99 |
--------------------------------------------------------------------------------
/test/test_bitStream.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from __future__ import print_function
4 | from unittest import TestCase
5 | from lingvoreader import bitstream
6 |
7 | __author__ = 'sv99'
8 |
9 |
class TestBitStream(TestCase):
    """Unit tests for lingvoreader.bitstream.BitStream and its helpers."""

    def setUp(self):
        print("Creating a new BitStream...")
        # record - bytearray, not string!
        # Use a bytes literal: bytearray('\x00...') raises TypeError on
        # Python 3 (no encoding given); b'...' builds the same bytes on
        # both Python 2 and 3.
        self.record = bytearray(b'\x00\x01\x02\x03\x04\x05\x06\x07\x08')
        self.bst = bitstream.BitStream(self.record)

    def tearDown(self):
        print("Destroying the BitStream...")
        self.bst = None

    def test_reverse32(self):
        self.assertEqual(bitstream.reverse32(0x01020304), 0x04030201)

    def test_reverse16(self):
        self.assertEqual(bitstream.reverse16(0x0102), 0x0201)

    def test_length(self):
        self.assertEqual(len(self.record), self.bst.length)

    def test_to_nearest_byte(self):
        self.bst.seek(0)
        self.bst.to_nearest_byte()
        self.assertEqual(self.bst.pos, 0)
        self.bst.read_bit()
        self.bst.to_nearest_byte()
        self.assertEqual(self.bst.pos, 1)

    def test_read_byte(self):
        self.bst.seek(0)
        self.assertEqual(self.bst.read_byte(), 0)
        self.bst.seek(1)
        self.assertEqual(self.bst.read_byte(), 1)

    def test_read_word(self):
        self.bst.seek(1)
        self.assertEqual(self.bst.read_word(), 0x0102)

    def test_read_int(self):
        self.bst.seek(1)
        self.assertEqual(self.bst.read_int(), 0x01020304)

    #def test_read_symbols(self):
    #    self.fail()

    def test_read_bit(self):
        self.bst.seek(1)
        self.bst.read_bits(7)
        self.assertEqual(self.bst.read_bit(), 1)
        self.assertEqual(self.bst.read_bit(), 0)

    def test_read_bits(self):
        self.bst.seek(1)
        self.assertEqual(self.bst.read_bits(4), 0)
        self.assertEqual(self.bst.read_bits(4), 1)
        self.assertEqual(self.bst.read_bits(4), 0)
        self.assertEqual(self.bst.read_bits(8), 0x20)
--------------------------------------------------------------------------------
/test/test_tools.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from unittest import TestCase
4 | from lingvoreader import tools
5 |
6 | __author__ = 'sv99'
7 |
8 |
class TestTools(TestCase):
    """Unit tests for lingvoreader.tools: bit_length and display_time."""

    def test_bit_length_0(self):
        # By this project's convention 0 still occupies one bit.
        self.assertEqual(tools.bit_length(0), 1)

    def test_bit_length_1(self):
        self.assertEqual(tools.bit_length(1), 1)

    def test_bit_length_2(self):
        self.assertEqual(tools.bit_length(2), 2)

    def test_bit_length_4(self):
        self.assertEqual(tools.bit_length(4), 3)

    def test_display_time(self):
        # NOTE: assertEqual replaces the assertEquals alias, which is
        # deprecated since Python 3.2 and removed in 3.12.
        self.assertEqual(tools.display_time(1),
                         "1.00 sec")
        self.assertEqual(tools.display_time(61.2),
                         "1 min 1.20 sec")
        self.assertEqual(tools.display_time(60),
                         "1 min 0.00 sec")
        # Whole hours report only the seconds remainder, not "0 min".
        self.assertEqual(tools.display_time(3600),
                         "1 hour 0.00 sec")
        self.assertEqual(tools.display_time(3661),
                         "1 hour 1 min 1.00 sec")
33 |
--------------------------------------------------------------------------------
/testdata/.gitignore:
--------------------------------------------------------------------------------
1 | *.*
2 |
3 |
--------------------------------------------------------------------------------