├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── bat
    ├── csv+msg2msg.bat
    ├── json+msg2msg.bat
    ├── msg2csv.bat
    ├── msg2json.bat
    ├── msg2txt.bat
    └── txt+msg2msg.bat
├── build.bat
├── build_with_portable_python.bat
├── download_portable_python.bat
├── requirements.txt
└── src
    ├── HexTool.py
    ├── REMSG.py
    ├── REMSGUtil.py
    ├── REWString.py
    └── main.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | *.zip
  2 | *.7z
  3 | *.json
  4 | 
  5 | # Byte-compiled / optimized / DLL files
  6 | __pycache__/
  7 | *.py[cod]
  8 | *$py.class
  9 | 
 10 | # C extensions
 11 | *.so
 12 | 
 13 | # Distribution / packaging
 14 | .Python
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | share/python-wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | MANIFEST
 32 | *.zip
 33 | python/
 34 | release/
 35 | 
 36 | # PyInstaller
 37 | #  Usually these files are written by a python script from a template
 38 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 39 | *.manifest
 40 | *.spec
 41 | 
 42 | # Installer logs
 43 | pip-log.txt
 44 | pip-delete-this-directory.txt
 45 | 
 46 | # Unit test / coverage reports
 47 | htmlcov/
 48 | .tox/
 49 | .nox/
 50 | .coverage
 51 | .coverage.*
 52 | .cache
 53 | nosetests.xml
 54 | coverage.xml
 55 | *.cover
 56 | *.py,cover
 57 | .hypothesis/
 58 | .pytest_cache/
 59 | cover/
 60 | 
 61 | # Translations
 62 | *.mo
 63 | *.pot
 64 | 
 65 | # Django stuff:
 66 | *.log
 67 | local_settings.py
 68 | db.sqlite3
 69 | db.sqlite3-journal
 70 | 
 71 | # Flask stuff:
 72 | instance/
 73 | .webassets-cache
 74 | 
 75 | # Scrapy stuff:
 76 | .scrapy
 77 | 
 78 | # Sphinx documentation
 79 | docs/_build/
 80 | 
 81 | # PyBuilder
 82 | .pybuilder/
 83 | target/
 84 | 
 85 | # Jupyter Notebook
 86 | .ipynb_checkpoints
 87 | 
 88 | # IPython
 89 | profile_default/
 90 | ipython_config.py
 91 | 
 92 | # pyenv
 93 | #   For a library or package, you might want to ignore these files since the code is
 94 | #   intended to run in multiple environments; otherwise, check them in:
 95 | # .python-version
 96 | 
 97 | # pipenv
 98 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 99 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
100 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
101 | #   install all needed dependencies.
102 | #Pipfile.lock
103 | 
104 | # poetry
105 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
107 | #   commonly ignored for libraries.
108 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109 | #poetry.lock
110 | 
111 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
112 | __pypackages__/
113 | 
114 | # Celery stuff
115 | celerybeat-schedule
116 | celerybeat.pid
117 | 
118 | # SageMath parsed files
119 | *.sage.py
120 | 
121 | # Environments
122 | .env
123 | .venv
124 | env/
125 | venv/
126 | ENV/
127 | env.bak/
128 | venv.bak/
129 | 
130 | # Spyder project settings
131 | .spyderproject
132 | .spyproject
133 | 
134 | # Rope project settings
135 | .ropeproject
136 | 
137 | # mkdocs documentation
138 | /site
139 | 
140 | # mypy
141 | .mypy_cache/
142 | .dmypy.json
143 | dmypy.json
144 | 
145 | # Pyre type checker
146 | .pyre/
147 | 
148 | # pytype static type analyzer
149 | .pytype/
150 | 
151 | # Cython debug symbols
152 | cython_debug/
153 | 
154 | # PyCharm
155 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
156 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
157 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
158 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
159 | #.idea/
160 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 dtlnor
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # REMSG_Converter
 2 |  
 3 |  Python library for converting RE Engine msg text files to json/csv/txt and back.
 4 |  
 5 | # Description
 6 |  For txt, I kept the format similar to the msg tool's: one txt file per language.
 7 |  
 8 |  For csv, I put all the languages into one file, with the msg entry name, its guid, and attributes.  
 9 |  I think this helps for research purposes.
10 |  
11 |  For json, I made it similar to mhrice's, but also added some file format information.  
12 |  Thus the json file can be converted back to msg on its own (but you still need to pass a dummy .msg file together with the json file).  
13 |  That means you can modify the attributes, guids, and the number of entries by editing the json file.  
14 |  (But I don't know whether the game works fine when you add/delete an entry in an existing msg.)  
15 |  Note: if you want to add a new custom msg file and have the game call it, you may need to edit the `GUIConfig.gcf` file (I don't know whether this is possible).  
16 |  
17 | # Usage
18 | ## Command Line Usage
19 | print help for command line args usage:
20 | 
21 | ```REMSG_Converter.exe -h```
22 | 
23 | ## Convert msg to json / txt / csv
24 | drag .msg.* file/folder to `msg2{csv/json/txt}.bat`
25 | 
26 | ## Convert json / txt / csv to msg
27 | drag .csv/.json/.txt file/folder **AND** .msg.* file/folder to `{csv/json/txt}+msg2msg.bat`
28 | 
29 | the `filename.msg.{version}.new` file is the modded file
30 | 
31 | ## Use as python module
32 | ```py
33 | # Usage examples can be found in main.py, under DebugTest() or worker()
34 | import REMSGUtil
35 | msg = REMSGUtil.importMSG("abcd.msg.123456") # get MSG object as msg
36 | REMSGUtil.exportMSG(msg, "efgh.msg.123456") # export as msg file
37 | REMSGUtil.exportCSV(msg, "abcd.msg.123456") # export as csv file
38 | ```
39 | # Credits
40 | * wwylele's [mhrice](https://github.com/wwylele/mhrice), for file structure.
41 | * ponaromixxx's [msg tool](https://zenhax.com/viewtopic.php?f=12&t=13337), for file structure.
42 | 


--------------------------------------------------------------------------------
/bat/csv+msg2msg.bat:
--------------------------------------------------------------------------------
@echo off
REM Rebuild msg file(s) from csv: drop the .csv and the .msg.* file/folder onto this script.
REM The script folder is quoted so install paths with spaces or special characters work.
cd /d "%~dp0"
REMSG_Converter.exe -m csv %1 %2
pause
5 | 


--------------------------------------------------------------------------------
/bat/json+msg2msg.bat:
--------------------------------------------------------------------------------
@echo off
REM Rebuild msg file(s) from json: drop the .json and the .msg.* file/folder onto this script.
REM The script folder is quoted so install paths with spaces or special characters work.
cd /d "%~dp0"
REMSG_Converter.exe -m json %1 %2
pause
5 | 


--------------------------------------------------------------------------------
/bat/msg2csv.bat:
--------------------------------------------------------------------------------
@echo off
REM Export msg to csv: drop a .msg.* file or a folder onto this script.
REM The script folder is quoted so install paths with spaces or special characters work.
cd /d "%~dp0"
REMSG_Converter.exe -i %1 -m csv
pause
5 | 


--------------------------------------------------------------------------------
/bat/msg2json.bat:
--------------------------------------------------------------------------------
@echo off
REM Export msg to json: drop a .msg.* file or a folder onto this script.
REM The script folder is quoted so install paths with spaces or special characters work.
cd /d "%~dp0"
REMSG_Converter.exe -i %1 -m json
pause
5 | 


--------------------------------------------------------------------------------
/bat/msg2txt.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | cd /d %~dp0
 3 | echo Japanese: ja
 4 | echo English: en
 5 | echo French: fr
 6 | echo Italian: it
 7 | echo German: de
 8 | echo Spanish: es
 9 | echo Russian: ru
10 | echo Polish: pl
11 | echo Dutch: nl
12 | echo Portuguese: pt
13 | echo PortugueseBr: ptbr
14 | echo Korean: ko
15 | echo TraditionalChinese: zhtw
16 | echo SimplifiedChinese: zhcn
17 | echo Finnish: fi
18 | echo Swedish: sv
19 | echo Danish: da
20 | echo Norwegian: no
21 | echo Czech: cs
22 | echo Hungarian: hu
23 | echo Slovak: sk
24 | echo Arabic: ar
25 | echo Turkish: tr
26 | echo Bulgarian: bg
27 | echo Greek: el
28 | echo Romanian: ro
29 | echo Thai: th
30 | echo Ukrainian: ua
31 | echo Vietnamese: vi
32 | echo Indonesian: id
33 | echo Fiction: cc
34 | echo Hindi: hi
35 | echo LatinAmericanSpanish: es419
36 | set /p lang="Please enter one of the lang to extract: "
37 | REMSG_Converter.exe -i %1 -m txt -l %lang%
38 | pause
39 | 


--------------------------------------------------------------------------------
/bat/txt+msg2msg.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | cd /d %~dp0
 3 | echo Japanese: ja
 4 | echo English: en
 5 | echo French: fr
 6 | echo Italian: it
 7 | echo German: de
 8 | echo Spanish: es
 9 | echo Russian: ru
10 | echo Polish: pl
11 | echo Dutch: nl
12 | echo Portuguese: pt
13 | echo PortugueseBr: ptbr
14 | echo Korean: ko
15 | echo TraditionalChinese: zhtw
16 | echo SimplifiedChinese: zhcn
17 | echo Finnish: fi
18 | echo Swedish: sv
19 | echo Danish: da
20 | echo Norwegian: no
21 | echo Czech: cs
22 | echo Hungarian: hu
23 | echo Slovak: sk
24 | echo Arabic: ar
25 | echo Turkish: tr
26 | echo Bulgarian: bg
27 | echo Greek: el
28 | echo Romanian: ro
29 | echo Thai: th
30 | echo Ukrainian: ua
31 | echo Vietnamese: vi
32 | echo Indonesian: id
33 | echo Fiction: cc
34 | echo Hindi: hi
35 | echo LatinAmericanSpanish: es419
36 | set /p lang="Please enter one of the lang to import: "
37 | REMSG_Converter.exe -m txt -l %lang% %1 %2
38 | pause
39 | 


--------------------------------------------------------------------------------
/build.bat:
--------------------------------------------------------------------------------
REM Bundle src\main.py into a single executable named REMSG_Converter.
pyinstaller --onefile --name REMSG_Converter src\main.py
2 | 


--------------------------------------------------------------------------------
/build_with_portable_python.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | REM Make a release package (REMSG_Converter.zip)
 3 | if exist release rmdir /s release
 4 | mkdir release\src
 5 | 
 6 | REM Edit batch files and put them in ./release
 7 | set OLD_STR=REMSG_Converter.exe
 8 | set NEW_STR=python\python.exe src\main.py
 9 | 
10 | setlocal enabledelayedexpansion
11 | for %%f in (bat\*.bat) DO (
12 |   for /f "delims=" %%a in (%%f) do (
13 |     set line=%%a
14 |     echo !line:%OLD_STR%=%NEW_STR%!>>release\%%~nxf
15 |   )
16 | )
17 | 
18 | REM Put other files in ./release
19 | copy src\*.py release\src
20 | copy requirements.txt release
21 | cd release
22 | echo|..\download_portable_python.bat
23 | del requirements.txt
24 | 
25 | REM Zip ./release
26 | powershell Compress-Archive -Force -Path * -Destination ../REMSG_Converter.zip
27 | 
28 | echo Done!
29 | pause
30 | 


--------------------------------------------------------------------------------
/download_portable_python.bat:
--------------------------------------------------------------------------------
@echo off
REM Make a portable python environment in ./python
REM Steps: download the embeddable zip, install the requirements with pip,
REM keep only chardet and the mmh3 binary, strip unused files, rename the folder to python.

REM Version info
REM PYTHON_VER_SHORT must match PYTHON_VERSION (major and minor digits without the dot).
set PYTHON_VERSION=3.12.4
set PYTHON_VER_SHORT=312

REM Delete ./python if exist
if exist python rmdir /s python

REM Download embeddable python
curl -OL https://www.python.org/ftp/python/%PYTHON_VERSION%/python-%PYTHON_VERSION%-embed-amd64.zip
powershell Expand-Archive -Force -Path python-%PYTHON_VERSION%-embed-amd64.zip
del python-%PYTHON_VERSION%-embed-amd64.zip
cd python-%PYTHON_VERSION%-embed-amd64

REM Download mmh3 and chardet
REM Rewrite the ._pth file with an "import site" line before running pip.
(
    echo python%PYTHON_VER_SHORT%.zip
    echo .
    echo import site
) > python%PYTHON_VER_SHORT%._pth
curl -OL https://bootstrap.pypa.io/get-pip.py
python get-pip.py
REM If you have the requirements installed in any path, uninstall them.
python -m pip install -r ..\requirements.txt
REM Copy the two installed packages next to python.exe, then drop pip and the rest of Lib.
robocopy Lib\site-packages\chardet chardet /E
copy Lib\site-packages\mmh3.cp%PYTHON_VER_SHORT%-win_amd64.pyd .
rmdir /s /q Lib Scripts
del get-pip.py
cd chardet
rmdir /s /q __pycache__ cli metadata\__pycache__
cd ..

REM Remove unnecessary files
del pythonw.exe python.cat python%PYTHON_VER_SHORT%._pth
del python3.dll libcrypto-1_1.dll libssl-1_1.dll libcrypto-3.dll libssl-3.dll sqlite3.dll
del vcruntime140.dll vcruntime140_1.dll
del _asyncio.pyd _bz2.pyd _decimal.pyd _elementtree.pyd _hashlib.pyd
del _lzma.pyd _msi.pyd _overlapped.pyd _queue.pyd
del _sqlite3.pyd _ssl.pyd _zoneinfo.pyd
del pyexpat.pyd unicodedata.pyd winsound.pyd

REM Remove unnecessary files from pythonXXX.zip
REM The archive is unpacked, trimmed, and packed again under the same name.
powershell Expand-Archive -Force -Path python%PYTHON_VER_SHORT%.zip
cd python%PYTHON_VER_SHORT%
rmdir /s /q __phello__ curses dbm html http lib2to3 msilib
rmdir /s /q pydoc_data site-packages sqlite3 tomllib urllib
rmdir /s /q wsgiref xml xmlrpc zoneinfo
del ast.pyc calendar.pyc doctest.pyc ftplib.pyc
del imaplib.pyc ipaddress.pyc mailbox.pyc nntplib.pyc
del optparse.pyc pdb.pyc pickletools.pyc pydoc.pyc
del smtpd.pyc smtplib.pyc ssl.pyc tarfile.pyc
powershell Compress-Archive -Force -Path * -Destination ../python%PYTHON_VER_SHORT%.zip
cd ..
rmdir /s /q python%PYTHON_VER_SHORT%
cd ..

REM Rename folder
rename python-%PYTHON_VERSION%-embed-amd64 python

pause
63 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ﻿chardet>=5.2.0
2 | mmh3>=4.0.1
3 | 


--------------------------------------------------------------------------------
/src/HexTool.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import struct
 3 | 
 4 | 
 5 | def pad_align_up(filestream: io.BufferedReader, align: int) -> int | None:
 6 |     """pad to align"""
 7 |     padSize = (align - filestream.tell() % align) % align
 8 |     (padding,) = struct.unpack(f"{padSize}s", filestream.read(padSize))
 9 |     assert all([x == 0 for x in padding]), "padding value should be zero"
10 |     return padding
11 | 
12 | 
13 | def printHexView(bytestream: bytearray | bytes, width=32):
14 |     """print hex bytes similar in hex editor, for debug usage"""
15 |     view = ""
16 |     digit = len(str(len(bytestream)))
17 |     for i, b in enumerate(bytestream):
18 |         sep = " "
19 |         pref = ""
20 |         if i % width == 0:
21 |             pref = ("{0:0" + str(digit) + "d}").format(i) + ": "
22 |         elif i % width == width - 1:
23 |             sep = "\n"
24 |         elif i % 4 == 4 - 1:
25 |             sep = "|"
26 |         view = view + pref + f"{b:02X}" + sep
27 |     print(view)
28 |     return view
29 | 


--------------------------------------------------------------------------------
/src/REMSG.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | import struct
  3 | import uuid
  4 | from typing import Final
  5 | 
  6 | import mmh3
  7 | import REWString as helper
  8 | from HexTool import pad_align_up
  9 | 
 10 | LANG_LIST: Final[dict[int, str]] = {
 11 |     0: "Japanese",
 12 |     1: "English",
 13 |     2: "French",
 14 |     3: "Italian",
 15 |     4: "German",
 16 |     5: "Spanish",
 17 |     6: "Russian",
 18 |     7: "Polish",
 19 |     8: "Dutch",
 20 |     9: "Portuguese",
 21 |     10: "PortugueseBr",
 22 |     11: "Korean",
 23 |     12: "TraditionalChinese",  # only this
 24 |     13: "SimplifiedChinese",  # and this
 25 |     14: "Finnish",
 26 |     15: "Swedish",
 27 |     16: "Danish",
 28 |     17: "Norwegian",
 29 |     18: "Czech",
 30 |     19: "Hungarian",
 31 |     20: "Slovak",
 32 |     21: "Arabic",
 33 |     22: "Turkish",
 34 |     23: "Bulgarian",
 35 |     24: "Greek",
 36 |     25: "Romanian",
 37 |     26: "Thai",
 38 |     27: "Ukrainian",
 39 |     28: "Vietnamese",
 40 |     29: "Indonesian",
 41 |     30: "Fiction",
 42 |     31: "Hindi",
 43 |     32: "LatinAmericanSpanish",
 44 |     33: "Max",
 45 |     -1: "Unused", # defined by me, for version 23 and above
 46 | }
 47 | """via.Language, with fixing the name of cht and chs"""
 48 | 
 49 | LANG_CODE_LIST: Final[dict[int, str]] = {
 50 |     0: "Japanese",
 51 |     1: "English",
 52 |     2: "French",
 53 |     3: "Italian",
 54 |     4: "German",
 55 |     5: "Spanish",
 56 |     6: "Russian",
 57 |     7: "Polish",
 58 |     8: "Dutch",
 59 |     9: "Portuguese",
 60 |     10: "PortugueseBr",
 61 |     11: "Korean",
 62 |     12: "TransitionalChinese",
 63 |     13: "SimplelifiedChinese",
 64 |     14: "Finnish",
 65 |     15: "Swedish",
 66 |     16: "Danish",
 67 |     17: "Norwegian",
 68 |     18: "Czech",
 69 |     19: "Hungarian",
 70 |     20: "Slovak",
 71 |     21: "Arabic",
 72 |     22: "Turkish",
 73 |     23: "Bulgarian",
 74 |     24: "Greek",
 75 |     25: "Romanian",
 76 |     26: "Thai",
 77 |     27: "Ukrainian",
 78 |     28: "Vietnamese",
 79 |     29: "Indonesian",
 80 |     30: "Fiction",
 81 |     31: "Hindi",
 82 |     32: "LatinAmericanSpanish",
 83 |     33: "Max",
 84 | }
 85 | """via.Language with MHRSB 13.0.0.1.
 86 | god damnit, they spell wrong the language name..."""
 87 | 
 88 | MHR_SUPPORTED_LANG: Final[list[int]] = [
 89 |     0,
 90 |     1,
 91 |     2,
 92 |     3,
 93 |     4,
 94 |     5,
 95 |     6,
 96 |     7,
 97 |     10,
 98 |     11,
 99 |     12,
100 |     13,
101 |     21,
102 |     32,
103 | ]
104 | """For MHRSB 15.0.0"""
105 | 
VERSION_2_LANG_COUNT: Final[dict[int, int]] = dict(
    (
        # (msg version, language count)
        (12, 23),
        (0x2022033D, 27),
        (14, 28),
        (15, 30),
        (17, 32),
        (20, 33),
        (0x20220626, 33),  # before 13.0.0, 0x20220626 has 32 lang count
        (22, 33),
        (23, 33),
    )
)
"""Language count for each msg version.
0x20220626 had a language count of 32 in early versions."""
119 | 
120 | 
def isVersionEncrypt(version: int) -> bool:
    """Tell whether this msg version has a dataOffset header field."""
    if version == 0x2022033D:
        # date-style version number that behaves like the old format
        return False
    return version > 12
124 | 
125 | 
def isVersionEntryByHash(version: int) -> bool:
    """Tell whether the entry head of this msg version stores a hash instead of an index."""
    if version == 0x2022033D:
        # date-style version number that behaves like the old format
        return False
    return version > 15
129 | 
def isVersionIgnoreUnusedLang(version: int) -> bool:
    """Tell whether this msg version marks unused languages with -1."""
    if version == 0x2022033D:
        # date-style version number that behaves like the old format
        return False
    return version >= 23
133 | 
134 | class Entry:
135 |     """meat of MSG"""
136 | 
137 |     def __init__(self, version):
138 |         self.version = version
139 | 
140 |     def readHead(self, filestream: io.BufferedReader, langCount: int):
141 |         """use when reading file only"""
142 | 
143 |         # we use bytes_le for guid(cuz c# use and store this way)
144 |         self.guid = uuid.UUID(
145 |             bytes_le=struct.unpack("<16s", filestream.read(16))[0],
146 |         )
147 |         (self.crc,) = struct.unpack("<I", filestream.read(4))
148 |         # actually I don't have a version 16 msg file so idk if 16 use hash or index
149 |         if isVersionEntryByHash(self.version):
150 |             (self.hash,) = struct.unpack("<I", filestream.read(4))
151 |         else:
152 |             (self.index,) = struct.unpack("<I", filestream.read(4))
153 | 
154 |         # offsets below should only be use when reading msg, and once you get the string they should not be use anymore
155 |         (self.entryNameOffset,) = struct.unpack("<Q", filestream.read(8))
156 |         (self.attributeOffset,) = struct.unpack("<Q", filestream.read(8))
157 |         self.contentOffsetsByLangs: list[int] = list()
158 |         for _ in range(langCount):
159 |             self.contentOffsetsByLangs.append(struct.unpack("<Q", filestream.read(8))[0])
160 | 
161 |     def writeHead(self, bytestream: bytearray):
162 |         """extend the bytearray by filling entry head"""
163 |         bytestream.extend(struct.pack("<16s", self.guid.bytes_le))
164 |         bytestream.extend(struct.pack("<I", self.crc))
165 |         if isVersionEntryByHash(self.version):
166 |             bytestream.extend(struct.pack("<I", self.hash))
167 |         else:
168 |             bytestream.extend(struct.pack("<I", self.index))
169 |         self.entryNameOffsetPH = len(bytestream)
170 |         bytestream.extend(struct.pack("<q", -1))
171 |         self.attributeOffsetPH = len(bytestream)
172 |         bytestream.extend(struct.pack("<q", -1))
173 |         self.contentOffsetsByLangsPH: list[int] = list()
174 |         for _ in self.langs:
175 |             self.contentOffsetsByLangsPH.append(len(bytestream))
176 |             bytestream.extend(struct.pack("<q", -1))
177 | 
178 |     def readAttributes(self, filestream: io.BufferedReader, attributeHeaders):
179 |         """read the attributes of this msg"""
180 |         self.attributes = list()
181 |         for header in attributeHeaders:
182 |             value = ""
183 |             match header["valueType"]:
184 |                 case -1:  # null wstring
185 |                     (value,) = struct.unpack("<Q", filestream.read(8))
186 |                 case 0:  # int64
187 |                     (value,) = struct.unpack("<q", filestream.read(8))
188 |                 case 1:  # double
189 |                     (value,) = struct.unpack("<d", filestream.read(8))
190 |                 case 2:  # wstring
191 |                     (value,) = struct.unpack("<Q", filestream.read(8))
192 |                 case _:
193 |                     raise NotImplementedError(f"{value} not implemented")
194 |             self.attributes.append(value)
195 | 
196 |     def writeAttributes(self, bytestream: bytearray, attributeHeaders):
197 |         """extend and modify the bytearray by filling attributes"""
198 |         self.attributesPH = list()
199 |         for i, header in enumerate(attributeHeaders):
200 |             value = ""
201 |             match header["valueType"]:
202 |                 case -1:  # null wstring
203 |                     value = struct.pack("<q", -1)
204 |                 case 0:  # int64
205 |                     value = struct.pack("<q", self.attributes[i])
206 |                 case 1:  # double
207 |                     value = struct.pack("<d", self.attributes[i])
208 |                 case 2:  # wstring
209 |                     value = struct.pack("<q", -1)
210 |             self.attributesPH.append(len(bytestream))
211 |             bytestream.extend(value)
212 | 
213 |     def setName(self, name: str):
214 |         """set entry name"""
215 |         self.name = name
216 | 
217 |     def setContent(self, langs: list[str]):
218 |         """set entry contents"""
219 |         self.langs = langs
220 | 
221 |     def buildEntry(self, guid: str, crc: int, name: str, attributeValues: list, langs: list[str], hash: int = 0, index: int = 0):
222 |         """use for file modification"""
223 |         self.guid = uuid.UUID(hex=guid)
224 |         self.crc = crc
225 |         if isVersionEntryByHash(self.version):
226 |             self.hash = hash
227 |         else:
228 |             self.index = index
229 | 
230 |         self.name = name
231 |         self.attributes = list()
232 |         for value in attributeValues:
233 |             self.attributes.append(value)
234 | 
235 |         self.langs = langs
236 | 
237 | 
238 | class MSG:
239 |     """MSG object"""
240 | 
241 |     def __init__(self):
242 |         pass
243 | 
244 |     def readMSG(self, filestream: io.BufferedReader):
245 |         """read msg file and store info into this MSG object"""
246 | 
247 |         # header
248 |         (version,) = struct.unpack("<I", filestream.read(4))
249 |         (magic,) = struct.unpack("<4s", filestream.read(4))
250 |         (headerOffset,) = struct.unpack("<Q", filestream.read(8))
251 |         (entryCount,) = struct.unpack("<I", filestream.read(4))
252 |         (attributeCount,) = struct.unpack("<I", filestream.read(4))
253 |         (langCount,) = struct.unpack("<I", filestream.read(4))
254 |         pad_align_up(filestream, 8)  # pad to 8
255 |         if isVersionEncrypt(version):
256 |             (dataOffset,) = struct.unpack("<Q", filestream.read(8))
257 |         (unknDataOffset,) = struct.unpack("<Q", filestream.read(8))
258 |         (langOffset,) = struct.unpack("<Q", filestream.read(8))
259 |         (attributeOffset,) = struct.unpack("<Q", filestream.read(8))
260 |         (attributeNameOffset,) = struct.unpack("<Q", filestream.read(8))
261 | 
262 |         # entries headers' offset
263 |         entryOffsets: list[int] = list()
264 |         for _ in range(entryCount):
265 |             entryOffsets.append(struct.unpack("<Q", filestream.read(8))[0])
266 | 
267 |         # always 64bit null
268 |         assert unknDataOffset == filestream.tell(), f"expected unknData at {unknDataOffset} but at {filestream.tell()}"
269 |         (unknData,) = struct.unpack("<Q", filestream.read(8))
270 |         assert unknData == 0, f"unknData should be 0 but found {unknData}"
271 | 
272 |         # indexes of all lang (follow via.Language)
273 |         assert langOffset == filestream.tell(), f"expected languages at {langOffset} but at {filestream.tell()}"
274 |         # keep in mind `languages` is a list of indexes could be duplicated and not in sequence now
275 |         languages: list[int] = list()
276 |         for _ in range(langCount):
277 |             languages.append(struct.unpack("<i", filestream.read(4))[0])
278 |         if not all([x in LANG_LIST.keys() and (i == x or x == -1) for i, x in enumerate(languages)]):
279 |             print(f"unkn lang found. {str(languages)}. Please update LANG_LIST from via.Language")
280 | 
281 |         # pad to 8
282 |         pad_align_up(filestream, 8)
283 | 
284 |         # get attribute headers, get type of each attr
285 |         assert attributeOffset == filestream.tell(), f"expected attributeValueTypes at {attributeOffset} but at {filestream.tell()}"
286 |         attributeHeaders: list[dict] = list()
287 |         for i in range(attributeCount):
288 |             attributeHeaders.append(dict(valueType=struct.unpack("<i", filestream.read(4))[0]))
289 | 
290 |         # pad to 8
291 |         pad_align_up(filestream, 8)
292 | 
293 |         # get attribute headers' name but hold the offset at attributeNamesOffsets. string reading will do after decrypt.
294 |         assert attributeNameOffset == filestream.tell(), f"expected attributeNamesOffset at {attributeNameOffset} but at {filestream.tell()}"
295 |         attributeNamesOffsets = list()
296 |         for _ in range(attributeCount):
297 |             attributeNamesOffsets.append(struct.unpack("<Q", filestream.read(8))[0])
298 | 
299 |         # get info(entry head) of each entry
300 |         entrys: list[Entry] = list()
301 |         for entryIndex in range(entryCount):
302 |             assert entryOffsets[entryIndex] == filestream.tell(), f"expected entryOffsets[{entryIndex}] at {entryOffsets[entryIndex]} but at {filestream.tell()}"
303 |             entry = Entry(version)
304 |             entry.readHead(filestream, langCount)
305 |             entrys.append(entry)
306 | 
307 |         # get attributes of each entry
308 |         for entry in entrys:
309 |             assert entry.attributeOffset == filestream.tell(), f"expected entry.attributeOffset at {self.attributeOffset} but at {filestream.tell()}"
310 |             entry.readAttributes(filestream, attributeHeaders)
311 | 
312 |         # read / decrypt string pool
313 |         if isVersionEncrypt(version):
314 |             assert dataOffset == filestream.tell(), f"expected dataOffset at {dataOffset} but at {filestream.tell()}"
315 |         else:
316 |             dataOffset = filestream.tell()
317 |         filestream.seek(0, 2)  # EOF
318 |         dataSize = filestream.tell() - dataOffset
319 |         assert dataSize % 2 == 0, f"wstring pool size should be even: {dataSize}"
320 |         filestream.seek(dataOffset)  # start of string pool
321 |         data = filestream.read(dataSize)
322 |         if isVersionEncrypt(version):
323 |             wcharPool = helper.decrypt(data)
324 |         else:
325 |             wcharPool = data
326 |         stringDict = helper.wcharPool2StrDict(wcharPool)
327 | 
328 |         # read attribute name to attributeHeaders
329 |         for i, attrHead in enumerate(attributeHeaders):
330 |             attrHead["name"] = helper.seekString((attributeNamesOffsets[i] - dataOffset), stringDict)
331 | 
332 |         # get content of each entry
333 |         for entryIndex, entry in enumerate(entrys):
334 |             # set entry name
335 |             entry.setName(helper.seekString((entry.entryNameOffset - dataOffset), stringDict))
336 |             if isVersionEntryByHash(version):
337 |                 nameHash = mmh3.hash(key=entry.name.encode("utf-16-le"), seed=0xFFFFFFFF, signed=False)
338 |                 assert nameHash == entry.hash, f"expected {entry.hash} for {entry.name} but get {nameHash}"
339 |             else:
340 |                 assert entryIndex == entry.index, f"expected {entryIndex} for {entry.name} but get {entry.index}"
341 | 
342 |             # set content by each lang
343 |             lang = list()
344 |             for strOffset in entry.contentOffsetsByLangs:
345 |                 lang.append(helper.seekString((strOffset - dataOffset), stringDict))
346 |             entry.setContent(lang)
347 | 
348 |             # seek string value of each attribute
349 |             for i, attrHead in enumerate(attributeHeaders):
350 |                 if attrHead["valueType"] == 2:
351 |                     entry.attributes[i] = helper.seekString((entry.attributes[i] - dataOffset), stringDict)
352 |                 elif attrHead["valueType"] == -1:
353 |                     temp = helper.seekString((entry.attributes[i] - dataOffset), stringDict)
354 |                     assert temp == "" or temp == "\x00", f"attr value type -1 contain non-null value {temp}"
355 |                     entry.attributes[i] = temp
356 | 
357 |         self.entrys: list[Entry] = entrys
358 |         self.attributeHeaders: list[dict] = attributeHeaders
359 |         self.version: int = version
360 |         self.languages: list[int] = languages
361 | 
362 |         # debug use, to let input output stringpool keeps same
363 |         # self.stringDict = stringDict
364 | 
365 |     def writeMSG(self) -> bytes:
366 |         """write a msg file(bytes) from this object's info"""
367 | 
368 |         # header
369 |         newFile = bytearray()
370 |         newFile.extend(struct.pack("<I", self.version))
371 |         newFile.extend(struct.pack("<4s", b"GMSG"))
372 |         newFile.extend(struct.pack("<Q", 16))
373 |         entryCount = len(self.entrys)
374 |         newFile.extend(struct.pack("<I", entryCount))
375 |         attributeCount = len(self.attributeHeaders)
376 |         newFile.extend(struct.pack("<I", attributeCount))
377 |         langCount = len(self.languages)
378 |         newFile.extend(struct.pack("<I", langCount))
379 |         newFile.extend(b"\x00" * (len(newFile) % 8))  # pad to 8
380 |         if isVersionEncrypt(self.version):
381 |             dataOffsetPH = len(newFile)
382 |             newFile.extend(struct.pack("<q", -1))
383 |         unknDataOffsetPH = len(newFile)
384 |         newFile.extend(struct.pack("<q", -1))
385 |         langOffsetPH = len(newFile)
386 |         newFile.extend(struct.pack("<q", -1))
387 |         attributeOffsetPH = len(newFile)
388 |         newFile.extend(struct.pack("<q", -1))
389 |         attributeNameOffsetPH = len(newFile)
390 |         newFile.extend(struct.pack("<q", -1))
391 | 
392 |         # entries headers' offset
393 |         entryOffsetsPH: list[int] = list()
394 |         for _ in range(entryCount):
395 |             entryOffsetsPH.append(len(newFile))
396 |             newFile.extend(struct.pack("<q", -1))
397 | 
398 |         newFile[unknDataOffsetPH : unknDataOffsetPH + 8] = struct.pack("<Q", len(newFile))
399 |         newFile.extend(struct.pack("<Q", 0))  # unknData
400 |         newFile[langOffsetPH : langOffsetPH + 8] = struct.pack("<Q", len(newFile))
401 |         newFile.extend(struct.pack("<" + "i" * langCount, *self.languages))  # languages
402 | 
403 |         newFile.extend(b"\x00" * (len(newFile) % 8))  # pad to 8
404 |         newFile[attributeOffsetPH : attributeOffsetPH + 8] = struct.pack("<Q", len(newFile))
405 |         newFile.extend(struct.pack("<" + "i" * attributeCount, *list([head["valueType"] for head in self.attributeHeaders])))  # attributeHeaders.valueType
406 |         newFile.extend(b"\x00" * (len(newFile) % 8))  # pad to 8
407 |         newFile[attributeNameOffsetPH : attributeNameOffsetPH + 8] = struct.pack("<Q", len(newFile))
408 |         attributeNamesOffsetsPH: list[int] = list()
409 |         for _ in range(attributeCount):
410 |             attributeNamesOffsetsPH.append(len(newFile))
411 |             newFile.extend(struct.pack("<q", -1))
412 | 
413 |         # info(entry head) of each entry
414 |         for i, entry in enumerate(self.entrys):
415 |             newFile[entryOffsetsPH[i] : entryOffsetsPH[i] + 8] = struct.pack("<Q", len(newFile))
416 |             entry.writeHead(newFile)
417 | 
418 |         # attributes of each entry
419 |         for i, entry in enumerate(self.entrys):
420 |             newFile[entry.attributeOffsetPH : entry.attributeOffsetPH + 8] = struct.pack("<Q", len(newFile))
421 |             entry.writeAttributes(newFile, self.attributeHeaders)
422 | 
423 |         # read / decrypt string pool
424 |         dataOffset = len(newFile)
425 |         if isVersionEncrypt(self.version):
426 |             newFile[dataOffsetPH : dataOffsetPH + 8] = struct.pack("<Q", len(newFile))
427 | 
428 |         # construct string pool
429 |         stringPoolSet = set()
430 |         isStrAttrIdx = list()
431 |         isNullAttrIdx = list()
432 |         for i, a in enumerate(self.attributeHeaders):
433 |             if a["valueType"] == -1:
434 |                 stringPoolSet.add("")
435 |                 isNullAttrIdx.append(i)
436 |             elif a["valueType"] == 2:
437 |                 isStrAttrIdx.append(i)
438 | 
439 |         stringPoolSet.update([a["name"] for a in self.attributeHeaders])
440 |         for entry in self.entrys:
441 |             stringPoolSet.add(entry.name)
442 |             stringPoolSet.update(entry.langs)
443 |             stringPoolSet.update([entry.attributes[idx] for idx in isStrAttrIdx])
444 | 
445 |         strOffsetDict = helper.calcStrPoolOffsets(stringPoolSet)  # not doing string processing here, as it will change the key.
446 |         # debug use, to let input output stringpool keeps same
447 |         # strOffsetDict = dict((v,k) for k,v in self.stringDict.items())
448 |         wcharPool = b"".join(helper.toWcharBytes(x) for x in strOffsetDict.keys())
449 | 
450 |         if isVersionEncrypt(self.version):
451 |             newFile.extend(helper.encrypt(wcharPool))
452 |         else:
453 |             newFile.extend(wcharPool)
454 | 
455 |         # update string offsets
456 |         for i, a in enumerate(self.attributeHeaders):
457 |             newFile[attributeNamesOffsetsPH[i] : attributeNamesOffsetsPH[i] + 8] = struct.pack("<Q", strOffsetDict[a["name"]] + dataOffset)
458 |         for entry in self.entrys:
459 |             newFile[entry.entryNameOffsetPH : entry.entryNameOffsetPH + 8] = struct.pack("<Q", strOffsetDict[entry.name] + dataOffset)
460 |             for i, lang in enumerate(self.languages):
461 |                 newFile[entry.contentOffsetsByLangsPH[i] : entry.contentOffsetsByLangsPH[i] + 8] = struct.pack("<Q", strOffsetDict[entry.langs[i]] + dataOffset)
462 |             for idx in isStrAttrIdx:
463 |                 newFile[entry.attributesPH[idx] : entry.attributesPH[idx] + 8] = struct.pack("<Q", strOffsetDict[entry.attributes[idx]] + dataOffset)
464 |             for idx in isNullAttrIdx:
465 |                 newFile[entry.attributesPH[idx] : entry.attributesPH[idx] + 8] = struct.pack("<Q", strOffsetDict[""] + dataOffset)
466 | 
467 |         # printHexView(newFile)
468 | 
469 |         return bytes(newFile)
470 | 


--------------------------------------------------------------------------------
/src/REMSGUtil.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import csv
  3 | import io
  4 | import json
  5 | import os
  6 | import uuid
  7 | from typing import Final, Iterator
  8 | 
  9 | import chardet
 10 | import mmh3
 11 | import REMSG
 12 | import REWString as helper
 13 | 
# Lookup table from a short language code to the integer language id.
# The trailing comment on each row gives the language's long name; "Max" (33) is
# intentionally left out of the table.
SHORT_LANG_LU: Final[dict[str, int]] = {
    "ja": 0,  # "Japanese",
    "en": 1,  # "English",
    "fr": 2,  # "French",
    "it": 3,  # "Italian",
    "de": 4,  # "German",
    "es": 5,  # "Spanish",
    "ru": 6,  # "Russian",
    "pl": 7,  # "Polish",
    "nl": 8,  # "Dutch",
    "pt": 9,  # "Portuguese",
    "ptbr": 10,  # "PortugueseBr",
    "ko": 11,  # "Korean",
    "zhtw": 12,  # "TraditionalChinese", # only this
    "zhcn": 13,  # "SimplifiedChinese", # and this
    "fi": 14,  # "Finnish",
    "sv": 15,  # "Swedish",
    "da": 16,  # "Danish",
    "no": 17,  # "Norwegian",
    "cs": 18,  # "Czech",
    "hu": 19,  # "Hungarian",
    "sk": 20,  # "Slovak",
    "ar": 21,  # "Arabic",
    "tr": 22,  # "Turkish",
    "bg": 23,  # "Bulgarian",
    "el": 24,  # "Greek",
    "ro": 25,  # "Romanian",
    "th": 26,  # "Thai",
    "ua": 27,  # "Ukrainian",
    "vi": 28,  # "Vietnamese",
    "id": 29,  # "Indonesian",
    "cc": 30,  # "Fiction",
    "hi": 31,  # "Hindi",
    "es419": 32,  # "LatinAmericanSpanish",
    # "" : 33, # "Max",
}
 50 | 
 51 | 
 52 | def searchSameGuid(msg: REMSG.MSG) -> None:
 53 |     """research use, print out all entry name with same guid in one file"""
 54 |     guidset = set()
 55 |     for entry in msg.entrys:
 56 |         if entry.guid not in guidset:
 57 |             guidset.add(entry.guid)
 58 |         else:
 59 |             print(str(entry.guid) + ":" + entry.name)
 60 | 
 61 | 
 62 | def searchGuid(msg: REMSG.MSG, guid: uuid.UUID) -> None:
 63 |     """research use, print out the entry name with that guid"""
 64 |     for entry in msg.entrys:
 65 |         if entry.guid.hex == guid.hex:
 66 |             print(str(entry.guid) + ":" + entry.name)
 67 | 
 68 | 
 69 | def getEncoding(filename: str, bufferSize: int = 256 * 1024) -> str:
 70 |     """althoguh I set utf-8 to all output file, but in-case someone copy paste to another file and has diff encoding..."""
 71 |     rawdata = open(filename, "rb").read(bufferSize)
 72 | 
 73 |     CONFIDENCE_MUST_BE = 0.95
 74 |     CONFIDENCE_MOST_LIKELY = 0.75
 75 |     CONFIDENCE_COULD_BE = 0.5
 76 | 
 77 |     allResult = chardet.detect_all(rawdata, ignore_threshold=False)
 78 |     # print(allResult)
 79 |     encode = allResult[0]["encoding"]
 80 |     confidence = allResult[0]["confidence"]
 81 |     if encode is None or confidence < 0.01:
 82 |         # empty file
 83 |         return "utf-8-sig"
 84 |     if confidence < CONFIDENCE_MUST_BE:
 85 |         for result in allResult:
 86 |             if "utf" in result["encoding"] and result["confidence"] > CONFIDENCE_COULD_BE:
 87 |                 encode = result["encoding"]
 88 |                 confidence = result["confidence"]
 89 |                 break
 90 | 
 91 |     if encode is None or encode.lower() in ["ascii", "windows-1254", "iso-8859-1"] or (confidence < CONFIDENCE_MOST_LIKELY and "utf" not in encode.lower()):
 92 |         encode = "utf-8"
 93 |     if encode.lower() == "utf-8":
 94 |         encode = "utf-8-sig"
 95 |     # print(f"Detected Encoding: {encode.lower()} File: {filename}")
 96 |     return encode
 97 | 
 98 | 
 99 | def readAttributeFromStr(inValue: str | int | float, vtype: int) -> str | int | float:
100 |     """return the attribute value with correct data type"""
101 |     value = ""
102 |     match vtype:
103 |         case -1:  # null wstring
104 |             value = ""
105 |         case 0:  # int64
106 |             value = int(inValue)
107 |         case 1:  # double
108 |             value = float(inValue)
109 |         case 2:  # wstring
110 |             value = str(inValue)
111 |     return value
112 | 
113 | 
def printAllAttr(msg: REMSG.MSG, filenameFull: str) -> Iterator[str]:
    """
    Debug: yield one "file,attribute name,value type,"value"" line per attribute of every entry.
    """
    for item in msg.entrys:
        for column, rawValue in enumerate(item.attributes):
            header = msg.attributeHeaders[column]
            fields = (
                filenameFull,
                str(header["name"]),
                str(header["valueType"]),
                '"' + str(rawValue) + '"',
            )
            yield ",".join(fields)
124 | 
125 | 
def searchAttrTy(msg: REMSG.MSG, filenameFull: str, ty: int) -> None:
    """
    Debug: print "file,attribute name,value type,"value"" for every attribute whose valueType equals ty.
    """
    for item in msg.entrys:
        for column, rawValue in enumerate(item.attributes):
            header = msg.attributeHeaders[column]
            attrName = str(header["name"])
            attrType = int(header["valueType"])
            if attrType != ty:
                continue
            quoted = '"' + str(rawValue) + '"'
            print(",".join((filenameFull, attrName, str(attrType), quoted)))
137 | 
138 | 
def searchEntryName(msg: REMSG.MSG, filename: str, keyword: str) -> None:
    """
    Debug: print "filename||entry name" for every entry whose name contains keyword.
    """
    hits = (item.name for item in msg.entrys if keyword in item.name)
    for hit in hits:
        print(filename + "||" + hit)
146 | 
147 | 
def exportCSV(msg: REMSG.MSG, filename: str) -> None:
    """Write the REMSG.MSG object as a csv file (utf-8 with BOM).

    Columns: guid, crc?, one "<attribute name>" column per attribute header,
    "entry name", then one column per language.
    """

    # keep newline="\n": the strings already contain \r\n, and translating newlines
    # to \r\n on write would turn every \r\n into \r\r\n
    with io.open(filename, "w", encoding="utf-8-sig", newline="\n") as csvf:
        writer = csv.writer(csvf, delimiter=",")

        headerRow = ["guid", "crc?"]
        headerRow.extend("<" + head["name"] + ">" for head in msg.attributeHeaders)
        headerRow.append("entry name")
        headerRow.extend(REMSG.LANG_LIST.get(lang, f"lang_{lang}") for lang in msg.languages)
        writer.writerow(headerRow)

        for item in msg.entrys:
            row = [str(item.guid), str(item.crc)]
            row.extend(str(value) for value in item.attributes)
            row.append(item.name)
            writer.writerow(row + item.langs)
167 | 
168 | 
def importCSV(msgObj: REMSG.MSG, filename: str, version: int = None, langCount: int = None) -> REMSG.MSG:
    """Read a csv file and return a modified copy of the provided msg object.

    The csv must contain the "guid", "crc?" and "entry name" columns, one
    "<attribute name>" column per attribute header of msgObj, and the language
    columns after them (the layout written by exportCSV). All entries are
    rebuilt from the csv rows; msgObj itself is not modified.

    @param msgObj: original msg, supplies version, languages and attribute headers.
    @param filename: csv filename string.
    @param version: msg version, defaults to the version of msgObj.
    @param langCount: expected number of language columns, defaults to the language count of msgObj.
    @raise ValueError: the csv has no header row, or an entry of msgObj has no row in the csv.
    """

    msg = copy.deepcopy(msgObj)
    if version is None:
        if msg is not None:
            version = msg.version

    if langCount is None:
        if msg is not None:
            langCount = len(msg.languages)
        else:
            langCount = REMSG.VERSION_2_LANG_COUNT[version]

    with io.open(filename, "r", encoding=getEncoding(filename), newline="\n") as csvf:
        # csv.reader returns blank lines as empty rows; drop them so a stray
        # empty line does not end in an IndexError below
        rows = [row for row in csv.reader(csvf) if row]

    if not rows:
        raise ValueError(f"No header row found in {filename}")

    header = rows[0]
    guididx = header.index("guid")
    crcidx = header.index("crc?")
    nameidx = header.index("entry name")
    attridxs = [i for i, field in enumerate(header) if field.startswith("<") and field.endswith(">")]
    fAttrList = [header[idx].removeprefix("<").removesuffix(">") for idx in attridxs]
    fEntrys = rows[1:]

    assert sorted(fAttrList) == sorted(head["name"] for head in msg.attributeHeaders), "AttributeList Should be same as original"

    # build the guid set once instead of rebuilding a list for every entry
    fGuids = {fEntry[guididx] for fEntry in fEntrys}
    missingEntry = [str(entry.guid) for entry in msg.entrys if str(entry.guid) not in fGuids]
    if len(missingEntry) > 0:
        print("Missing Entry:")
        print("\n".join(missingEntry))
        raise ValueError("Missing Entry")

    newEntrys: list[REMSG.Entry] = list()
    for i, fEntry in enumerate(fEntrys):
        entry = REMSG.Entry(version)  # create a new one.
        attributes = list()
        for ai, header in enumerate(msg.attributeHeaders):
            value = readAttributeFromStr(fEntry[attridxs[ai]], header["valueType"])
            attributes.append(value)

        # language columns follow guid, crc?, the attribute columns and the entry name
        contents = fEntry[(len(fAttrList) + 3):]
        # joined outside the f-string: nesting it inside only parses on Python 3.12+
        joinedContents = "\n".join(contents)
        assert len(contents) == langCount, f"Invalid number of language / contents.\n{joinedContents}"
        entry.buildEntry(
            guid=fEntry[guididx],
            crc=int(fEntry[crcidx]),
            name=fEntry[nameidx],
            attributeValues=attributes,
            langs=[helper.forceWindowsLineBreak(content) for content in contents],
            hash=mmh3.hash(key=fEntry[nameidx].encode("utf-16-le"), seed=0xFFFFFFFF, signed=False) if REMSG.isVersionEntryByHash(version) else None,
            index=i if not (REMSG.isVersionEntryByHash(version)) else None,
        )

        # crc / name / hash / index are deliberately not checked against the original entries; left to the user.
        newEntrys.append(entry)

    msg.entrys = newEntrys
    return msg
246 | 
247 | 
def exportTXT(msg: REMSG.MSG, filename: str, langIndex: int, encode: str=None, withEntryName: bool=False) -> None:
    """Write a txt file from the REMSG.MSG object with the specified language.

    One line per entry: "<string>" (or "<string=ENTRY NAME>" when withEntryName
    is set) followed by the content, with every "\\r\\n" replaced by "<lf>".

    @param langIndex: index into each entry's langs list.
    @param encode: output encoding, utf-8 when None.
    """

    def lineTag(entry) -> str:
        # plain concatenation: nesting the same quote type inside an f-string only parses on Python 3.12+
        return "<string=" + entry.name + ">" if withEntryName else "<string>"

    with io.open(filename, "w", encoding=encode if encode is not None else "utf-8") as txtf:
        txtf.writelines([lineTag(entry) + entry.langs[langIndex].replace("\r\n", "<lf>") + "\n" for entry in msg.entrys])
253 | 
254 | 
def _stripTxtEntryTag(line: str) -> str | None:
    """Return the text after a leading "<string>" or "<string=NAME>" tag, or None if the line has no such tag."""
    if line.startswith("<string>"):
        return line.removeprefix("<string>")
    if line.startswith("<string="):
        _tag, closer, rest = line.partition(">")
        if closer:
            return rest
    return None


def importTXT(msgObj: REMSG.MSG, filename: str, langIndex: int, encode: str=None) -> REMSG.MSG:
    """Read a txt file and return a copy of the provided msg object with one language replaced.

    Only lines starting with "<string>" or "<string=ENTRY NAME>" are read, in
    file order, one per entry; "<lf>" is turned back into "\\r\\n". The entry
    name in the tag is not used for matching.

    @param langIndex: index into each entry's langs list to overwrite.
    @param encode: file encoding; detected from the file when None. A utf
                   encoding without "sig" is upgraded to the detected "-sig"
                   variant so that a BOM does not end up in the first line.
    """
    if encode is None:
        encode = getEncoding(filename)
    elif "utf" in encode.lower() and "sig" not in encode.lower():
        testEncode = getEncoding(filename)
        # compare case-insensitively: the detected name may be reported in upper case ("UTF-8-SIG")
        if testEncode.lower().endswith("sig"):
            encode = testEncode

    msg = copy.deepcopy(msgObj)
    lines = []
    with io.open(filename, mode="r", encoding=encode) as txtf:
        for rawLine in txtf:
            content = _stripTxtEntryTag(rawLine.rstrip("\n").rstrip("\r"))
            if content is not None:
                lines.append(content.replace("<lf>", "\r\n"))

    assert len(lines) == len(msg.entrys), "Invalid number of entry"
    for i, entry in enumerate(msg.entrys):
        entry.langs[langIndex] = lines[i]

    return msg
274 | 
275 | 
def exportMHRTextDump(msg: REMSG.MSG, filename: str, withEntryName: bool=False) -> None:
    """Export the content of every language in REMSG.MHR_SUPPORTED_LANG as txt, one sub-folder per language.

    Each file is written next to filename, inside a folder named after the
    language (or "lang_<id>" when the id has no name), as utf-8 with BOM.
    """

    baseFolder, baseName = os.path.split(filename)
    for lang in REMSG.MHR_SUPPORTED_LANG:
        langFolder = os.path.join(baseFolder, REMSG.LANG_LIST.get(lang, f"lang_{lang}"))
        if not os.path.exists(langFolder):
            try:
                os.makedirs(langFolder)
            except Exception as e:
                # best effort: report and let the export below fail if the folder is really unusable
                print(e)

        exportTXT(msg, os.path.join(langFolder, baseName), lang, "utf-8-sig", withEntryName)
289 | 
290 | 
def valueTypeEnum(ty: int) -> str:
    """Name an attribute value type, mhrice style: 0 Int, 1 Float, 2 String, anything else Unknown."""

    if ty == -1:
        return "Unknown"
    if ty == 0:
        return "Int"
    if ty == 1:
        return "Float"
    if ty == 2:
        return "String"
    return "Unknown"
305 | 
306 | 
def buildmhriceJson(msg: REMSG.MSG) -> dict:
    """Build the mhrice style json structure from a REMSG.MSG object.

    (version and languages are added on top so that the json alone can be converted back to a msg object)"""

    infos = {
        "version": msg.version,
        "languages": msg.languages,
    }
    infos["attribute_headers"] = [{"ty": head["valueType"], "name": head["name"]} for head in msg.attributeHeaders]

    jEntries = []
    for item in msg.entrys:
        jEntries.append(
            {
                "name": item.name,
                "guid": str(item.guid),
                "crc?": item.crc,
                # versions that do not address entries by hash get a fixed filler value
                "hash": item.hash if REMSG.isVersionEntryByHash(msg.version) else 0xFFFFFFFF,
                "attributes": [{valueTypeEnum(head["valueType"]): item.attributes[i]} for i, head in enumerate(msg.attributeHeaders)],
                "content": item.langs,
            }
        )
    infos["entries"] = jEntries

    return infos
332 | 
333 | 
def exportJson(msg: REMSG.MSG, filename: str) -> None:
    """Write the REMSG.MSG object as a mhrice like json file (utf-8, non-ascii kept as is, 2-space indent)."""

    with io.open(filename, "w", encoding="utf-8") as jsonf:
        payload = buildmhriceJson(msg)
        json.dump(payload, jsonf, ensure_ascii=False, indent=2)
339 | 
340 | 
def importJson(msgObj: REMSG.MSG, filename: str) -> REMSG.MSG:
    """Read a mhrice like json file and return a new REMSG.MSG object built from it.

    @param msgObj: deprecated parameter, you may pass None for this.
    @param filename: filename string.
    """

    msg = REMSG.MSG()
    with io.open(filename, "r", encoding=getEncoding(filename)) as jsonf:
        data = json.load(jsonf)

    msg.version = int(data["version"])
    if REMSG.isVersionIgnoreUnusedLang(msg.version):
        msg.languages = data["languages"]
    elif len(data["entries"]) > 0:
        # language ids are taken as 0..n-1 from the content count of the first entry
        msg.languages = list(range(len(data["entries"][0]["content"])))
    else:
        msg.languages = list(range(REMSG.VERSION_2_LANG_COUNT[msg.version]))

    # replace Attribute Head
    msg.attributeHeaders = [{"valueType": head["ty"], "name": head["name"]} for head in data["attribute_headers"]]

    rebuilt: list[REMSG.Entry] = []
    for position, jEntry in enumerate(data["entries"]):
        item = REMSG.Entry(msg.version)  # create a new one.
        item.buildEntry(
            guid=jEntry["guid"],
            crc=jEntry["crc?"],
            name=jEntry["name"],
            # each attribute is a single-key dict; only its value is used, typed by the header at the same position
            attributeValues=[readAttributeFromStr(next(iter(attr.values())), msg.attributeHeaders[i]["valueType"]) for i, attr in enumerate(jEntry["attributes"])],
            langs=[helper.forceWindowsLineBreak(text) for text in jEntry["content"]],
            hash=mmh3.hash(key=jEntry["name"].encode("utf-16-le"), seed=0xFFFFFFFF, signed=False) if REMSG.isVersionEntryByHash(msg.version) else None,
            index=position if not (REMSG.isVersionEntryByHash(msg.version)) else None,
        )
        rebuilt.append(item)

    msg.entrys = rebuilt
    return msg
382 | 
383 | 
def importMSG(filename: str) -> REMSG.MSG:
    """Open a msg file in binary mode and return the REMSG.MSG object read from it."""

    with open(filename, mode="rb") as source:
        parsed = REMSG.MSG()
        parsed.readMSG(source)
    return parsed
391 | 
392 | 
def exportMSG(msg: REMSG.MSG, filename: str) -> None:
    """Write a msg file from a REMSG.MSG object.

    The object is serialized before the file is opened, so a failure inside
    writeMSG() does not leave a created-but-empty output file behind.
    """

    data = msg.writeMSG()
    with io.open(filename, "wb") as outstream:
        outstream.write(data)
398 | 


--------------------------------------------------------------------------------
/src/REWString.py:
--------------------------------------------------------------------------------
 1 | from typing import Final
 2 | 
# 16-byte XOR key for the msg string part; decrypt() and encrypt() use KEY[i & 0xF] for the byte at position i.
KEY: Final[list[int]] = [0xCF, 0xCE, 0xFB, 0xF8, 0xEC, 0x0A, 0x33, 0x66, 0x93, 0xA9, 0x1D, 0x93, 0x50, 0x39, 0x5F, 0x09]
 4 | 
 5 | 
 6 | def seekString(offset: int, stringDict: dict[int, str]) -> str:
 7 |     """seek string from string dict"""
 8 |     assert len(stringDict) > 0, "no string pool but seeking string"
 9 |     assert offset in stringDict, f"seeking target not at string pool {offset}"
10 |     return stringDict[offset]
11 | 
12 | 
13 | # @DeprecationWarning
14 | # def seekStringFromStrPool(offset: int, stringPool: str) -> str:
15 | #     assert offset % 2 == 0, "expect offset in string pool is even"
16 | #     startPos = offset // 2
17 | #     if startPos > 0:
18 | #         assert stringPool[startPos-1] == "\x00", f"string not start from end of previous string when seeking({startPos})"
19 | #     elif startPos < 0:
20 | #         print(startPos)
21 | #         raise IndexError(f"seeking target not at string pool {startPos}")
22 | #     endpos = stringPool.find('\x00', startPos)
23 | #     assert endpos >= 0, f"incorrect string offset when seeking({startPos},{endpos})"
24 | #     return stringPool[startPos:endpos]
25 | 
26 | 
def decrypt(rawBytes: bytes) -> bytes:
    """Decrypt the msg string part: each byte is XORed with the previous encrypted byte and the key byte for its position."""

    cipher = bytes(rawBytes)
    plain = bytearray(len(cipher))
    previous = 0
    for position, byte in enumerate(cipher):
        plain[position] = byte ^ previous ^ KEY[position % 16]
        previous = byte  # chain on the encrypted input byte
    return bytes(plain)
36 | 
37 | 
def encrypt(rawBytes: bytes) -> bytes:
    """Encrypt the msg string part: each byte is XORed with the previous encrypted byte and the key byte for its position."""

    plain = bytes(rawBytes)
    cipher = bytearray(len(plain))
    previous = 0
    for position, byte in enumerate(plain):
        previous = byte ^ previous ^ KEY[position % 16]  # chain on the encrypted output byte
        cipher[position] = previous
    return bytes(cipher)
47 | 
48 | 
def wcharPool2StrDict(wcharPool: bytes) -> dict[int, str]:
    """Split a utf-16-le string pool into {byte offset in pool: string without its null terminator}.

    An empty pool gives an empty dict. A non-empty pool must have an even size
    and end with a null wchar (asserted).
    """
    if len(wcharPool) == 0:
        return dict()

    assert len(wcharPool) % 2 == 0, "wchar pool should have even size"
    stringPool = wcharPool.decode("utf-16-le")
    assert stringPool[-1] == "\x00", "ending wchar not null"

    stringDict: dict[int, str] = dict()
    byteOffset = 0
    # the pool ends with a null, so the last split piece is an empty leftover
    for string in stringPool.split("\x00")[:-1]:
        stringDict[byteOffset] = string
        # advance by the encoded size: a character outside the BMP takes 4 bytes in
        # utf-16, so "index in the decoded text * 2" would drift after the first one
        byteOffset += len(string.encode("utf-16-le")) + 2
    return stringDict
64 | 
65 | 
def wcharPool2StrPool(wcharPool: bytes) -> str:
    """Decode a utf-16-le string pool to str; asserts an even byte size and a trailing null wchar."""
    assert len(wcharPool) % 2 == 0, "wchar pool should have even size"
    stringPool = wcharPool.decode("utf-16-le")  # 2 bytes per utf-16 code unit
    # endswith() also covers an empty pool, where indexing [-1] would raise IndexError
    assert stringPool.endswith("\x00"), "ending wchar not null"
    return stringPool
72 | 
73 | 
def forceWindowsLineBreak(string: str) -> str:
    """Turn every line break (\\r\\n, lone \\r or lone \\n) into \\r\\n."""
    # first collapse every variant to a bare \n, then expand each \n to \r\n
    unified = string.replace("\r\n", "\n")
    unified = unified.replace("\r", "\n")
    return "\r\n".join(unified.split("\n"))
77 | 
78 | 
def calcStrPoolOffsets(stringlist: list[str]) -> dict[str, int]:
    """Build an offset dict {string: byte offset in the pool} for the unique strings, laid out in sorted order.

    Each string occupies its utf-16-le encoded size plus a 2-byte null
    terminator, matching what toWcharBytes() produces for it.
    """
    newDict = dict()
    sizeCount = 0
    for string in sorted(set(stringlist)):
        # not adding null terminator here, it will done by toWcharBytes()
        newDict[string] = sizeCount
        # measure the encoded size: a character outside the BMP takes 4 bytes in
        # utf-16, so len(string) * 2 would place every later string too early
        sizeCount = sizeCount + len(string.encode("utf-16-le")) + 2

    return newDict
89 | 
90 | 
def toWcharBytes(string: str) -> bytes:
    """Encode string as utf-16-le bytes followed by a 2-byte null terminator."""
    encoded = string.encode("utf-16-le")
    return encoded + b"\x00\x00"
94 | 
95 | 
96 | # def StrDict2wcharPool(stringDict: dict[int, str]) -> bytes:
97 | #     return b''.join([toWcharBytes(s) for s in stringDict.values()])
98 | 


--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import logging
  3 | import os
  4 | import re
  5 | import sys
  6 | 
  7 | import mmh3
  8 | import REMSGUtil
  9 | from typing import List
 10 | 
 11 | logging.basicConfig(level=logging.INFO)
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | isValidMsgNameRegex = re.compile(r"\.msg.*(?<!\.txt)(?<!\.json)(?<!\.csv)$", re.IGNORECASE)
 16 | def isValidMsgName(name: str) -> bool:
 17 |     return isValidMsgNameRegex.search(name) is not None
 18 | 
 19 | 
 20 | def getAllFileFromFolder(folderName: str, filetype="msg") -> List[str]:
 21 |     filetype = filetype.lower()
 22 |     filenameList = []
 23 |     for file in os.listdir(folderName):
 24 |         if filetype == "msg":
 25 |             if isValidMsgName(file):
 26 |                 filenameList.append(os.path.join(folderName, file))
 27 |         elif file.lower().endswith("." + filetype) and ".msg." in file.lower():
 28 |             filenameList.append(os.path.join(folderName, file))
 29 | 
 30 |     return filenameList
 31 | 
 32 | 
 33 | def fillList(path: str, filetype: str="msg") -> List[str]:
 34 |     path = os.path.abspath(path)
 35 |     filetype = filetype.lower()
 36 |     if os.path.isdir(path):
 37 |         return getAllFileFromFolder(path, filetype)
 38 |     elif os.path.isfile(path):
 39 |         if filetype == "msg":
 40 |             if isValidMsgName(path):
 41 |                 return [path,]
 42 |         elif path.lower().endswith("." + filetype):
 43 |             return [path,]
 44 |     return []
 45 | 
 46 | 
 47 | def worker(item: str, mode: str = "csv", modFile: str = None, lang: int = REMSGUtil.SHORT_LANG_LU["ja"], **kwargs) -> None:
 48 |     try:
 49 |         filenameFull = os.path.abspath(item)
 50 |         print("processing:" + filenameFull)
 51 | 
 52 |         msg = REMSGUtil.importMSG(filenameFull)
 53 | 
 54 |         if mode == "csv":
 55 |             if modFile is None:
 56 |                 REMSGUtil.exportCSV(msg, filenameFull + "." + mode)
 57 |             else:
 58 |                 REMSGUtil.exportMSG(msg=REMSGUtil.importCSV(msg, modFile), filename=filenameFull + ".new")
 59 | 
 60 |         elif mode == "txt":
 61 |             if modFile is None:
 62 |                 REMSGUtil.exportTXT(msg, filenameFull + "." + mode, lang, encode=kwargs["txtformat"])
 63 |             else:
 64 |                 REMSGUtil.exportMSG(msg=REMSGUtil.importTXT(msg, modFile, lang, encode=kwargs["txtformat"]), filename=filenameFull + ".new")
 65 | 
 66 |         elif mode == "json":
 67 |             if modFile is None:
 68 |                 REMSGUtil.exportJson(msg, filenameFull + "." + mode)
 69 |             else:
 70 |                 REMSGUtil.exportMSG(msg=REMSGUtil.importJson(msg, modFile), filename=filenameFull + ".new")
 71 | 
 72 |         elif mode == "dump":
 73 |             REMSGUtil.exportMHRTextDump(msg, filenameFull + ".txt")
 74 | 
 75 |     except Exception as e:
 76 |         print(f"error with file {item}")
 77 |         # print(traceback.format_exc())
 78 |         logger.exception(e)
 79 | 
def getFolders(parser: argparse.ArgumentParser) -> tuple[List[str], List[str]]:
    """Parse the command line and work out which msg files to process and with which edit files.

    Reads args.input, args.edit, args.mode and the positional args.args from the
    parsed arguments.

    Returns:
        (filenameList, editList): the msg files to process, and the edit files
        found for them. When no edit file is involved editList holds one None
        per msg file.

    Exits the process when the positional arguments are unusable (after
    printing the help), when no msg file is found, or when editing was
    requested but no edit file is available.
    """
    args = parser.parse_args()

    filenameList = []
    editList = []

    # an explicit edit argument means the user wants to rebuild msg files
    editMode = args.edit is not None

    if args.input is not None:
        filenameList = fillList(args.input)
        if args.edit is not None:
            editList = fillList(args.edit, args.mode)

    elif args.edit is not None:  # input is none
        filenameList = []
        editList = fillList(args.edit, args.mode)
        # fill file list by edit list
        # (iterate over a copy: editList is modified inside the loop)
        for file in list(editList):
            # the msg file is expected at the edit file's path minus its last extension
            filename, file_extension = os.path.splitext(file)
            if os.path.exists(filename):
                filenameList.append(filename)
            else:
                print(f"{filename} not found, skiping this file...")
                editList.remove(file)

    else:  # neither input nor edit given, fall back to the positional args
        remainder = args.args
        if (remainder is None) or (len(remainder) <= 0) or (len(remainder) > 2):
            pass
            # open without any args
            parser.print_help()
            input("\nincorrect args, press enter to exit...")
            sys.exit()

        # guessing input... why am I doing this
        elif len(remainder) == 1:
            filenameList = fillList(remainder[0])

        elif len(remainder) == 2:
            # try both arguments in both roles and keep, for each role, the one
            # that yields more files (the first one wins a tie)
            msgList1 = fillList(remainder[0], "msg")
            msgList2 = fillList(remainder[1], "msg")
            editList1 = fillList(remainder[0], args.mode)
            editList2 = fillList(remainder[1], args.mode)

            filenameList = max([msgList1, msgList2], key=len)
            editList = max([editList1, editList2], key=len)

            editMode = True

    # after getting file list...
    if len(editList) <= 0:
        # no edit file: one None placeholder per msg file
        editList = list([None for _ in filenameList])
    elif len(editList) > 1:
        # several edit files: rebuild editList so that it lines up with filenameList.
        # Only the folder of the first edit file is searched, and names are matched
        # case-insensitively as "<msg file name>.<mode>".
        # (a single edit file is left as it is)
        editfolder, name = os.path.split(editList[0])
        editList = []
        editFiles = dict([(f.lower(), f) for f in os.listdir(editfolder)])
        # find valid file - edit pair
        # (iterate over a copy: filenameList is modified inside the loop)
        for file in list(filenameList):
            msgfolder, name = os.path.split(file)
            if (name + "." + args.mode).lower() in editFiles:
                editList.append(os.path.join(editfolder, editFiles[(name + "." + args.mode).lower()]))
            else:
                print(f"{name}.{args.mode} not found, skiping this file...")
                filenameList.remove(file)

    if len(filenameList) <= 0:
        print("No valid input file, exiting.")
        sys.exit(1)

    # in edit mode every msg file needs a real edit file (no None placeholder)
    if editMode and (len(editList) <= 0 or None in editList):
        print(f"{args.mode} mode with edit file/folder input but no {args.mode} file found.")
        sys.exit(1)

    return filenameList, editList
153 |     return filenameList, editList
154 | 
def main():
    """Command line entry point: parse the options and convert every file.

    Builds the argument parser, lets ``getFolders`` resolve the msg files and
    their optional edit files, then submits one ``worker`` call per pair to a
    process pool and waits for all of them to finish.
    """
    # imported here so main() does not depend on the import done in the
    # ``__main__`` guard of this file
    import concurrent.futures

    parser = argparse.ArgumentParser(
        prog='REMSG_Converter.exe',
        description='Encode / Decode .msg file from RE Engine',
        epilog="https://github.com/dtlnor/REMSG_Converter",
    )
    parser.add_argument("-i", "--input", type=str,
                        help="input msg file or folder")
    parser.add_argument("-x", "--multiprocess", type=int, default=4,
                        help="when you are processing multiple files. How many processes to use to convert the files")
    parser.add_argument("-m", "--mode", type=str, choices=["csv", "txt", "json"], default="csv",
                        help="choose output file format.\n  txt = msg tool style txt.\n  csv = all lang in one csv with rich info.\n  json = all lang in one json with rich info in mhrice format")
    parser.add_argument("-e", "--edit", type=str,
                        help="input (csv/txt/json) file to edit the content.\n  if input as folder, the filename and number of files\n  should be same as original .msg file\n  (with corresponding (.txt/.csv/.json) extension)")
    parser.add_argument("-l", "--lang", type=str, default="ja", choices=REMSGUtil.SHORT_LANG_LU.keys(),
                        help="input the lang you want to export for txt mode (default ja)\n")
    parser.add_argument("-f", "--txtformat", type=str, default=None, choices=["utf-8", "utf-8-sig"],
                        help="force txt read/write format to be 'utf-8' or 'utf-8-sig'(BOM).\n")
    parser.add_argument("args", nargs=argparse.REMAINDER)
    args = parser.parse_args()

    filenameList, editList = getFolders(parser)

    # ProcessPoolExecutor raises ValueError for max_workers <= 0, so never
    # go below a single worker process
    processCount = max(1, args.multiprocess)
    lang = REMSGUtil.SHORT_LANG_LU[args.lang]

    # the with-block shuts the pool down once every submitted job finished
    with concurrent.futures.ProcessPoolExecutor(processCount) as executor:
        futures = {
            executor.submit(worker, file, mode=args.mode, modFile=edit, lang=lang, txtformat=args.txtformat): file
            for file, edit in zip(filenameList, editList)
        }
        for future in concurrent.futures.as_completed(futures):
            # report failures that escaped the worker (e.g. a broken pool)
            # instead of dropping them silently
            error = future.exception()
            if error is not None:
                print(f"error with file {futures[future]}: {error!r}")

    print("All Done.")
184 | 
185 | 
if __name__ == "__main__":
    # Script entry point. Both modules are imported here, at module level of
    # the running script, so main() can refer to them as globals.
    import multiprocessing
    import concurrent.futures

    # Must run before main() starts any process; needed for frozen
    # (packaged) Windows executables.
    multiprocessing.freeze_support()
    main()
193 | 


--------------------------------------------------------------------------------