├── .gitignore ├── tsconfig.json ├── .eslintignore ├── viewer.html ├── .eslintrc.js ├── package.json ├── LICENSE ├── data.d.ts ├── viewer.css ├── README.md ├── viewer.tsx ├── viewer-ui.tsx └── dump-messages.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .parcel-cache 3 | yarn-error.log 4 | dist -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "noFallthroughCasesInSwitch": true, 5 | "noErrorTruncation": true, 6 | "jsx": "react" 7 | }, 8 | } 9 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | # don't ever lint node_modules 2 | node_modules 3 | # don't lint build output (make sure it's set to your correct build folder name) 4 | dist 5 | # don't lint nyc coverage output 6 | coverage 7 | .eslintrc.js -------------------------------------------------------------------------------- /viewer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | wadump viewer 7 | 8 | 9 | 10 |
11 | 12 | 13 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | root: true, 3 | parser: '@typescript-eslint/parser', 4 | plugins: [ 5 | '@typescript-eslint', 6 | ], 7 | extends: [ 8 | 'eslint:recommended', 9 | 'plugin:@typescript-eslint/recommended', 10 | ], 11 | env: { 12 | browser: true, 13 | }, 14 | rules: { 15 | "@typescript-eslint/no-unused-vars": [ 16 | "error", 17 | { 18 | "argsIgnorePattern": "^_", 19 | "varsIgnorePattern": "^_", 20 | } 21 | ], 22 | "@typescript-eslint/no-non-null-assertion": "off", 23 | "semi": ["error"], 24 | "quotes": ["error", "double"], 25 | }, 26 | }; 27 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wadump", 3 | "version": "0.0.1", 4 | "license": "ISC", 5 | "devDependencies": { 6 | "@types/react": "^17.0.30", 7 | "@types/react-dom": "^17.0.9", 8 | "@typescript-eslint/eslint-plugin": "^5.1.0", 9 | "@typescript-eslint/parser": "^5.1.0", 10 | "eslint": "^8.0.1", 11 | "parcel": "^2.0.0", 12 | "typescript": "^4.4.4" 13 | }, 14 | "dependencies": { 15 | "@popperjs/core": "^2.10.2", 16 | "bootstrap": "^5.1.3", 17 | "react": "^17.0.2", 18 | "react-dom": "^17.0.2" 19 | }, 20 | "scripts": { 21 | "parcel": "parcel viewer.html", 22 | "lint": "eslint . --ext .js,.jsx,.ts,.tsx", 23 | "lint-fix": "eslint . 
--fix --ext .js,.jsx,.ts,.tsx" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Francesco Mazzoli 2 | 3 | Permission to use, copy, modify, and distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
-------------------------------------------------------------------------------- /data.d.ts: -------------------------------------------------------------------------------- 1 | interface Chat { 2 | id: string; 3 | t?: number; 4 | } 5 | 6 | interface Contact { 7 | id: string; 8 | name: string; 9 | } 10 | 11 | interface Group { 12 | id: string; 13 | subject: string; 14 | } 15 | 16 | interface MessageRowMessage { 17 | body: string; 18 | } 19 | 20 | interface MessageRow { 21 | currentMsg: MessageRowMessage; 22 | quotedMsg?: MessageRowMessage; 23 | } 24 | 25 | interface Message { 26 | id: string; 27 | t: number; 28 | from: string; 29 | to: { 30 | server: string; 31 | user: string; 32 | _serialized: string; 33 | }; 34 | participant: { 35 | server: string; 36 | user: string; 37 | _serialized: string; 38 | }; 39 | author: { 40 | server: string; 41 | user: string; 42 | _serialized: string; 43 | }; 44 | msgRow?: MessageRow; 45 | filehash?: string; 46 | mimetype?: string; 47 | quotedMsg?: { 48 | type: string; 49 | } 50 | } 51 | 52 | interface WhatsAppData { 53 | "message.json": Message[], 54 | "chat.json": Chat[], 55 | "group-metadata.json": Group[], 56 | "contact.json": Contact[], 57 | "media": { [filehash: string]: ArrayBuffer }, 58 | } 59 | -------------------------------------------------------------------------------- /viewer.css: -------------------------------------------------------------------------------- 1 | .chats { 2 | display: grid; 3 | grid-template-columns: max-content minmax(0, 1fr); 4 | grid-template-rows: max-content minmax(0, 1fr); 5 | overflow: hidden; 6 | width: 100vw; 7 | height: 100vh; 8 | } 9 | 10 | .chat-list { 11 | grid-area: 1 / 1 / 3 / 2; 12 | padding: 0.5rem; 13 | overflow-y: scroll; 14 | max-height: 100vh; 15 | } 16 | 17 | .chat-info { 18 | grid-area: 1 / 2 / 2 / 3; 19 | padding: 0.5rem; 20 | } 21 | 22 | .chat-messages { 23 | grid-area: 2 / 2 / 3 / 3; 24 | padding: 0.5rem; 25 | max-height: 100%; 26 | max-width: 100%; 27 | overflow-y: auto; 28 | 
} 29 | 30 | .message { 31 | padding: 0.10rem 0.25rem; 32 | border-radius: 0.5rem; 33 | margin: 0.25rem 0.10rem; 34 | border: 0.1rem solid rgb(87, 87, 87); 35 | box-shadow: 0.1rem 0.1rem rgb(87, 87, 87); 36 | clear: both; 37 | } 38 | 39 | .message.their-message { 40 | border-top-left-radius: 0; 41 | } 42 | 43 | .message.our-message { 44 | border-top-right-radius: 0; 45 | text-align: right; 46 | } 47 | 48 | .message .author { 49 | float: right; 50 | display: none; 51 | } 52 | 53 | .message .body { 54 | clear: both; 55 | display: block; 56 | } 57 | 58 | .message .quote { 59 | clear: both; 60 | display: block; 61 | } 62 | 63 | .message.our-message .body { 64 | clear: both; 65 | } 66 | 67 | .message .time { 68 | font-size: 0.75rem; 69 | } 70 | 71 | .message.our-message .time { 72 | float: right; 73 | } 74 | 75 | .message.their-message .time { 76 | float: left; 77 | } 78 | 79 | .message .clear { 80 | clear: both; 81 | display: block; 82 | } 83 | 84 | .message video, .message img { 85 | max-width: 100%; 86 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # wadump 2 | 3 | Small utility to dump and display the data in the WhatsApp web client. Dumps the messages, contacts, chat and group information. Also dumps all the media that can be dumped, although some media downloads fail, and I am currently not sure why. 4 | 5 | It only works with the [multi-device beta](https://faq.whatsapp.com/general/download-and-installation/about-multi-device-beta/?lang=en) enabled. 6 | 7 | See [the blog post](https://mazzo.li/posts/whatsapp-backup.html) about this project for more information about the implementation details. 8 | 9 | I've also only tested this on Chrome, although in principle it should work on every browser. 
10 | 11 | ## Existing work 12 | 13 | After I wrote the tool and the blog post, I realized that somebody had [already reverse engineered the WhatsApp web client protocol](https://github.com/sigalor/whatsapp-web-reveng). Head that way for a detailed description of how the web client communicates with the WhatsApp servers. 14 | 15 | ## Disclaimer 16 | 17 | I am not affiliated with Facebook or WhatsApp, and this investigation was done purely to preserve my personal data better. I don't know if backing up your data this way breaches WhatsApp's terms of service. Use at your own risk! 18 | 19 | Also, you probably shouldn't be running random code from the internet in your browser, especially in your WhatsApp window. That said, [`dump-messages.js`](./dump-messages.js) is less than 500 lines long and has no dependencies, so if you do want to try this, please read it all first. 20 | 21 | ## To dump the data 22 | 23 | Open the dev tools while on `web.whatsapp.com`. [Create a new snippet](https://developer.chrome.com/docs/devtools/javascript/snippets/), and paste the contents of [`dump-messages.js`](./dump-messages.js) into it. 24 | 25 | Then customize the invocation of `dumpWhatsApp` at the end: 26 | 27 | ```javascript 28 | dumpWhatsApp({ 29 | // Save media on top of text messages 30 | dumpMedia: true, 31 | // Dump only media which is already cached locally. Only relevant if `dumpMedia` is 32 | // true. 33 | dumpOnlyCachedMedia: true, 34 | // Cache newly downloaded media, so that it won't be redownloaded the next time. 35 | // Only relevant if `dumpOnlyCachedMedia` is false. 36 | saveDownloadedMediaToCache: true, 37 | }); 38 | ``` 39 | 40 | It is advisable to first run once with `dumpOnlyCachedMedia: true`, since downloading the media can take a while. If you _are_ downloading media, some will probably fail to download, which will show up as errors in the console. Regardless, all the media that can be downloaded will be downloaded. 
41 | 42 | After you've decided on the configuration parameters, start the snippet by pressing `Ctrl+Enter`, or by pressing the button on the bottom right. 43 | 44 | Once you have started the script, it needs to retrieve the decryption key before it can decrypt the messages. This can be done by opening a chat and scrolling to older messages, as the console message instructs you to do: 45 | 46 | ``` 47 | no decrypt args found, waiting for them (open a few chats!) 48 | ``` 49 | 50 | Once the key is retrieved, you'll see this message: 51 | 52 | ``` 53 | decrypt args found {algorithm: {…}, key: CryptoKey} 54 | ``` 55 | 56 | And the script will start reading and decrypting all the messages. When it is done they will be downloadable as a `whatsapp.tar` file. 57 | 58 | If you're not downloading media files it should only take a few seconds. If you are downloading the media it will take much longer. 59 | 60 | ## To view the data 61 | 62 | A very basic viewer is provided to view the data: 63 | 64 | ``` 65 | % yarn install 66 | % yarn parcel 67 | Server running at http://localhost:1234 68 | ``` 69 | 70 | Once it's running, just go to the webpage, upload the `whatsapp.tar` file, and you will be presented with the dumped chats. 71 | 72 | ## Bugs & Limitations 73 | 74 | * Not all media is reliably downloaded. See . The best way to get around this limitation is to write an extension which continuously syncs a filesystem backup with the web client using the new [File System Access API](https://developer.mozilla.org/en-US/docs/Web/API/File_System_Access_API), which would work around CDN links expiring. 75 | 76 | * Everything is done in RAM, which means that all your messages and media need to fit in RAM. This would also be obviated if the script wrote directly to the filesystem. 77 | 78 | * We handle quoted messages incompletely: we display quoted text with >> as the prefix in the viewer, and we do not handle media in quoted messages. 79 | 80 | * We do not retrieve profile images. 
81 | 82 | * The viewer is extremely basic. It inlines all media using base64 encoding, which makes loading chats with a lot of media very slow. -------------------------------------------------------------------------------- /viewer.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react"; 2 | import * as ReactDOM from "react-dom"; 3 | 4 | import "./node_modules/bootstrap/dist/css/bootstrap.min.css"; 5 | 6 | import { Viewing } from "./viewer-ui.tsx"; 7 | 8 | function impossibleCase(x: never): A { 9 | throw `Impossible: ${x}`; 10 | } 11 | 12 | type TarData = {[fileName: string]: ArrayBufferView | TarData} 13 | 14 | // Very poor implementation -- we don't check the checksums or anything. 15 | function untar(blob: ArrayBuffer): TarData { 16 | const bytes = new Uint8Array(blob); 17 | const files: TarData = {}; 18 | let cursor = 0; 19 | while (cursor < blob.byteLength) { 20 | if (bytes[cursor] === 0) { 21 | cursor += 512; // skip empty sector 22 | continue; 23 | } 24 | let offset = 0; 25 | let fileName = ""; 26 | while (bytes[cursor + offset] !== 0) { 27 | fileName += String.fromCharCode(bytes[cursor + offset]); 28 | offset++; 29 | } 30 | let sizeString = ""; 31 | for (offset = 124; offset < 124 + 11; offset++) { 32 | sizeString += String.fromCharCode(bytes[cursor + offset]); 33 | } 34 | const size = parseInt(sizeString, 8); 35 | const fileBytes = new Uint8Array(blob, cursor + 512, size); 36 | cursor += 512 + Math.ceil(size/512)*512; 37 | // end of parsing, store 38 | const fileNameSegments = fileName.split("/"); 39 | let currentDirectory = files; 40 | for (let i = 0; i < fileNameSegments.length - 1; i++) { 41 | const segment = fileNameSegments[i]; 42 | currentDirectory[segment] = currentDirectory[segment] || {}; 43 | currentDirectory = currentDirectory[segment] as TarData; 44 | } 45 | currentDirectory[fileNameSegments[fileNameSegments.length-1]] = fileBytes; 46 | } 47 | return files; 48 | } 49 | 50 | const 
utf8Decoder = new TextDecoder("utf-8", { fatal: true }); 51 | 52 | function extractWhatsAppData(tar: TarData): WhatsAppData { 53 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 54 | const data: any = {}; 55 | for (const name of ["message.json", "contact.json", "group-metadata.json", "chat.json"]) { 56 | if (tar[name] === undefined) { 57 | throw `Could not find file ${name} in tar archive.`; 58 | } 59 | let str: string | null = null; 60 | try { 61 | str = utf8Decoder.decode(tar[name] as unknown as ArrayBuffer); 62 | } catch (e) { 63 | console.error(`could not decode utf-8 in file ${name}`, e); 64 | throw `Could not decode UTF-8 contents of file ${name}`; 65 | } 66 | try { 67 | data[name] = JSON.parse(str); 68 | } catch (e) { 69 | console.error(`could not decode json in file ${name}`, e); 70 | throw `Could not decode JSON in file ${name}`; 71 | } 72 | } 73 | data["media"] = {}; 74 | const typedData = data as WhatsAppData; 75 | for (const [hash, blob] of Object.entries(tar["media"] || [])) { 76 | typedData.media[hash.replace(/_/g, "/").replace(/-/g, "+") + "="] = blob; 77 | } 78 | const compareTimes = (t1: number | undefined, t2: number | undefined) => { 79 | if (t1 === t2) { return 0; } 80 | if (t1 === undefined) { return 1; } 81 | if (t2 === undefined) { return -1; } 82 | return t2 - t1; 83 | }; 84 | typedData["chat.json"].sort((c1, c2) => compareTimes(c1.t, c2.t)); 85 | typedData["message.json"].sort((m1, m2) => compareTimes(m2.t, m1.t)); 86 | console.log(typedData); 87 | return typedData; 88 | } 89 | 90 | type ViewerState = 91 | | { status: "uploading" } 92 | | { status: "viewing", data: WhatsAppData } 93 | 94 | const Upload: React.FunctionComponent<{ loaded: (data: WhatsAppData) => void }> = ({ loaded }) => { 95 | const [state, setState] = React.useState< 96 | { status: "idle" } | { status: "reading" } | { status: "error", error: string } 97 | >({ status: "idle" }); 98 | return
99 |
100 | 105 | { 110 | setState({ status: "reading" }); 111 | const file = ev.target.files![0]; 112 | const reader = new FileReader(); 113 | reader.onload = (ev) => { 114 | try { 115 | const buffer = ev.target!.result as ArrayBuffer; 116 | const contents = untar(buffer); 117 | const data = extractWhatsAppData(contents); 118 | loaded(data); 119 | } catch (error) { 120 | console.error("caught error while decoding tar archive", error); 121 | if (typeof error === "string") { 122 | setState({ status: "error", error }); 123 | } else { 124 | setState({ status: "error", error: "Could not decode tar file" }); 125 | } 126 | } 127 | }; 128 | reader.readAsArrayBuffer(file); 129 | }} 130 | disabled={state.status === "reading"} 131 | /> 132 | {state.status === "error" && 133 |
134 | {state.error} 135 |
} 136 |
137 |
; 138 | }; 139 | 140 | const Viewer: React.FunctionComponent = () => { 141 | const [viewerState, setViewerState] = React.useState({ status: "uploading" }); 142 | 143 | return ( 144 | viewerState.status === "uploading" ? 145 | setViewerState({ status: "viewing", data })} /> : 146 | viewerState.status === "viewing" ? 147 | : 148 | impossibleCase(viewerState) 149 | ); 150 | }; 151 | 152 | ReactDOM.render( 153 | , 154 | document.getElementById("root") 155 | ); 156 | -------------------------------------------------------------------------------- /viewer-ui.tsx: -------------------------------------------------------------------------------- 1 | // In a separate file so fast-refresh works 2 | import * as React from "react"; 3 | 4 | import "./node_modules/bootstrap/dist/css/bootstrap.min.css"; 5 | 6 | function renderTime(t: number): string { 7 | return new Date(t*1000).toLocaleString(); 8 | } 9 | 10 | const SidebarChat: React.FunctionComponent<{ 11 | contacts: {[phoneNumber: string]: Contact}, 12 | groups: {[phoneNumber: string]: Group}, 13 | chat: Chat 14 | }> = ({ contacts, groups, chat }) => { 15 | const [phoneNumber, _] = chat.id.split("@"); 16 | const contact = contacts[phoneNumber]; 17 | const group = groups[phoneNumber]; 18 | if (group) { 19 | return
{group.subject} (group)
; 20 | } else if (contact) { 21 | return
{contact.name}
; 22 | } else { 23 | return
{phoneNumber}
; 24 | } 25 | }; 26 | 27 | // See 28 | function arrayBufferToBase64(buffer: ArrayBuffer) { 29 | let binary = ""; 30 | const bytes = new Uint8Array( buffer ); 31 | const len = bytes.byteLength; 32 | for (let i = 0; i < len; i++) { 33 | binary += String.fromCharCode( bytes[ i ] ); 34 | } 35 | return window.btoa(binary); 36 | } 37 | 38 | const Media: React.FunctionComponent<{ blob: ArrayBuffer, mimetype: string }> = ({ blob, mimetype }) => { 39 | if (mimetype.startsWith("image/")) { 40 | return ; 41 | } else if (mimetype.startsWith("audio/")) { 42 | return
160 |
161 |
162 | {data["chat.json"].map(chat => { 167 | ev.preventDefault(); 168 | setCurrentChat(chat.id); 169 | }} 170 | > 171 | 172 | )} 173 |
174 |
175 | {currentChatId !== null && 176 | } 177 |
; 178 | }; 179 | -------------------------------------------------------------------------------- /dump-messages.js: -------------------------------------------------------------------------------- 1 | // See README.md for instructions on how to use this file. 2 | // 3 | // I wanted this to be a single file with no dependencies, so we implement 4 | // various things we normally would not implement, like a minimal protobuf 5 | // reader and a tar archive generator. 6 | (() => { 7 | "use strict"; 8 | 9 | const utf8Decoder = new TextDecoder("utf-8", { fatal: true }); 10 | const utf8Encoder = new TextEncoder(); 11 | 12 | // protobuf varint decoder 13 | function decodeVarint(s) { 14 | let number = 0; 15 | let more = false; 16 | let parsedBytes = 0; 17 | do { 18 | if (parsedBytes > 3) { 19 | throw "trying to parse varint wider than 4 bytes, we don't support this since we need to fit within 32 bits"; 20 | } 21 | if (s.cursor >= s.length) { 22 | throw "EOF while parsing varint"; 23 | } 24 | const byte = s.data.getUint8(s.cursor); s.cursor += 1; 25 | more = !!(byte & 0x80); 26 | number += (byte & 0x7f) << (parsedBytes * 7); 27 | parsedBytes += 1; 28 | } while (more); 29 | return number; 30 | } 31 | 32 | // protobuf message decoder. See `decodeWhatsAppProtobuf` message for example on 33 | // the spec format. We currently only bother with wire types encountered when 34 | // decoding whatsapp messages. 35 | // 36 | // rule: we increase cursor _as soon as the data is consumed_. 37 | // cursor should always be at the byte we need to read next. 
function decodeProtobufWithState(spec, s) {
    // Decodes one protobuf message. `spec` maps field numbers to
    // `{ name, type }`, where `type` is a scalar type name or a nested spec
    // object. `s` is `{ data: DataView, cursor, length }`; its cursor is
    // advanced as soon as bytes are consumed. Returns an object keyed by
    // field name (if a field occurs more than once, the last value wins).
    // Throws a string on unknown fields, unsupported types or truncated input.
    const result = {};
    while (s.cursor < s.length) {
        // The tag is itself a varint: field numbers >= 16 do not fit in a
        // single byte, so reading just one byte would leave the rest of the
        // tag in the stream and corrupt everything that follows.
        const header = decodeVarint(s);
        const field = header >>> 3;
        const wireType = header & 0x7;
        const fieldSpec = spec[field];
        if (fieldSpec === undefined) {
            throw `non-specced field ${field}`;
        }
        let fieldValue = null;
        if (wireType === 0) { // varint (int32, int64, uint32, uint64, sint32, sint64, bool, enum)
            fieldValue = decodeVarint(s);
        } else if (wireType === 1) { // fixed64, sfixed64, double
            if (fieldSpec.type === "double") {
                fieldValue = s.data.getFloat64(s.cursor, true); s.cursor += 8;
            } else if (fieldSpec.type === "int64") {
                fieldValue = s.data.getBigInt64(s.cursor, true); s.cursor += 8;
            } else if (fieldSpec.type === "uint64") {
                fieldValue = s.data.getBigUint64(s.cursor, true); s.cursor += 8;
            } else {
                throw `bad type for 64-bit data: ${fieldSpec.name}, ${fieldSpec.type}`;
            }
        } else if (wireType === 2) { // length-delimited
            const length = decodeVarint(s);
            // Without this check a bad length would silently read into the
            // bytes of whatever follows the enclosing message.
            if (s.cursor + length > s.length) {
                throw `length-delimited field ${fieldSpec.name} overruns its message`;
            }
            const start = s.data.byteOffset + s.cursor;
            if (fieldSpec.type === "string") {
                fieldValue = utf8Decoder.decode(new DataView(s.data.buffer, start, length));
            } else if (fieldSpec.type === "bytes") {
                // copy, so the value does not alias the input buffer
                fieldValue = new Uint8Array(s.data.buffer.slice(start, start + length));
            } else if (typeof fieldSpec.type === "object") {
                fieldValue = decodeProtobufWithState(fieldSpec.type, {
                    data: new DataView(s.data.buffer, start, length),
                    cursor: 0,
                    length: length,
                });
            } else {
                throw `bad field type for length-delimited data: ${fieldSpec.name}, ${JSON.stringify(fieldSpec.type)}`;
            }
            s.cursor += length;
        } else if (wireType === 5) { // fixed32, sfixed32, float
            if (fieldSpec.type === "float") {
                fieldValue = s.data.getFloat32(s.cursor, true); s.cursor += 4;
            } else if (fieldSpec.type === "int32") {
                fieldValue = s.data.getInt32(s.cursor, true); s.cursor += 4;
            } else if (fieldSpec.type === "uint32") {
                // unsigned read: getInt32 would turn values >= 2^31 negative
                fieldValue = s.data.getUint32(s.cursor, true); s.cursor += 4;
            } else {
                throw `bad type for 32-bit data: ${fieldSpec.name}, ${fieldSpec.type}`;
            }
        } else {
            throw `unimplemented wire type ${wireType}`;
        }
        result[fieldSpec.name] = fieldValue;
    }
    if (s.cursor !== s.length) {
        throw `mismatching cursor ${s.cursor} and length ${s.length}`;
    }
    return result;
}

// Decodes a whole ArrayBuffer as a single protobuf message described by
// `spec`. Field numbers are limited by `decodeVarint`, which only accepts
// varints of up to 4 bytes.
function decodeProtobuf(spec, buffer) {
    const data = new DataView(buffer, 0);
    return decodeProtobufWithState(spec, { data, cursor: 0, length: buffer.byteLength });
}

// Decodes the decrypted `msgRowOpaqueData` payload of a WhatsApp chat
// message into `{ currentMsg?, quotedMsg? }`. The spec lists types other
// than "string", but the original author never encountered them in their
// dataset.
function decodeWhatsAppProtobufMessage(buffer) {
    const msgSpec = {
        1: { name: "body", type: "string" },
        3: { name: "caption", type: "string" },
        5: { name: "lng", type: "double" },
        6: { name: "isLive", type: "bool" },
        7: { name: "lat", type: "double" },
        8: { name: "paymentAmount1000", type: "int32" },
        9: { name: "paymentNoteMsgBody", type: "string" },
        10: { name: "canonicalUrl", type: "string" },
        11: { name: "matchedText", type: "string" },
        12: { name: "title", type: "string" },
        13: { name: "description", type: "string" },
        14: { name: "futureproofBuffer", type: "bytes" },
        15: { name: "clientUrl", type: "string" },
        16: { name: "loc", type: "string" },
        17: { name: "pollName", type: "string" },
        // 18: { name: "pollOptions"}, not implemented, repeated messages
        20: { name: "pollSelectableOptionsCount", type: "uint32" },
        21: { name: "messageSecret", type: "bytes" },
        22: { name: "senderTimestampMs", type: "int64" },
        23: { name: "pollUpdateParentKey", type: "string" },
        // 24: { name: "encPollVote" }, not implemented, repeated messages
    };
    const fullMsgSpec = {
        1: { name: "currentMsg", type: msgSpec },
        2: { name: "quotedMsg", type: msgSpec },
    };
    return decodeProtobuf(fullMsgSpec, buffer);
}

// Triggers a browser download of `content` under `fileName` by clicking a
// temporary <a download> element that points at a Blob URL.
function saveFile(fileName, contentType, content) {
    const a = document.createElement("a");
    const file = new Blob([content], { type: contentType });
    a.href = URL.createObjectURL(file);
    a.download = fileName;
    a.click();
}

// Tars a bunch of files, each a [name, content] pair, and offers the archive
// for download as `fileName`. Only the name, mode, size, type flag and
// checksum header fields are written. Throws a string if a name is longer
// than the 100 bytes the header name field can hold.
function saveTar(fileName, contents) {
    let tarBufferLen = 512*2; // the final zero-headers
    const buffers = contents.map(([name, content]) => {
        const buffer = new Uint8Array(content);
        tarBufferLen += 512 + Math.ceil(buffer.length/512)*512; // padded header size is 512, we need to round up to 512
        const nameBuffer = utf8Encoder.encode(name);
        if (nameBuffer.byteLength > 100) {
            throw `Tar name too long (${nameBuffer.byteLength})`;
        }
        return [nameBuffer, buffer];
    });
    const tarBuffer = new Uint8Array(tarBufferLen);
    let cursor = 0;
    // writes `num` as zero-padded octal into a header field of `size` bytes
    const writeHeaderNum = (size, num, offset) => {
        const str = num.toString(8).padStart(size - 1, "0"); // last must be null
        tarBuffer.set(utf8Encoder.encode(str), cursor + offset);
    };
    for (const [nameBuffer, fileBuffer] of buffers) {
        tarBuffer.set(nameBuffer, cursor); // write file name
        writeHeaderNum(12, fileBuffer.byteLength, 124); // write file size
        writeHeaderNum(8, 420, 100); // file mode -- octal 644
        tarBuffer[cursor + 156] = "0".charCodeAt(0); // write file type
        // the checksum is computed with the checksum field itself set to spaces
        for (let i = 0; i < 8; i++) {
            tarBuffer[cursor + 148 + i] = 32;
        }
        let headerChecksum = 0;
        for (let i = 0; i < 512; i++) {
            headerChecksum += tarBuffer[cursor + i];
        }
        writeHeaderNum(6, headerChecksum, 148);
        tarBuffer[cursor + 148 + 7] = 32; // write checksum
        tarBuffer.set(fileBuffer, cursor + 512); // write file contents
        cursor += 512 + Math.ceil(fileBuffer.length/512)*512;
    }
    // the archive is an uncompressed tar, so it must not be labelled as gzip
    saveFile(fileName, "application/x-tar", tarBuffer);
}

// HKDF info string for encrypted WhatsApp media of the given message type.
// Throws a string for non-media types.
function mediaHkdfInfo(type) {
    switch (type) {
    case "image":
    case "sticker":
        return "WhatsApp Image Keys";
    case "ptt":
    case "audio":
        return "WhatsApp Audio Keys";
    case "video":
        return "WhatsApp Video Keys";
    case "document":
        return "WhatsApp Document Keys";
    default:
        throw `Bad media type ${type}`;
    }
}

// Whether a message type carries downloadable media.
function isMediaMessage(type) {
    return ["image", "sticker", "ptt", "audio", "video", "document"].includes(type);
}

// HKDF parameters for encrypted whatsapp media
const hkdfHashLen = 32;
const hkdfAlgo = { "name": "HMAC", "hash": { "name": "SHA-256" } };

// HKDF extract (RFC 5869, section 2.2): PRK = HMAC(salt, ikm), returned as
// an HMAC signing key.
async function hkdfExtract({ salt, ikm }) {
    const key = await crypto.subtle.importKey("raw", salt, hkdfAlgo, false, ["sign"]);
    const prkBytes = await crypto.subtle.sign(hkdfAlgo, key, ikm);
    return crypto.subtle.importKey("raw", prkBytes, hkdfAlgo, false, ["sign"]);
}

// HKDF expand (RFC 5869, section 2.3): T(i) = HMAC(prk, T(i-1) | info | i),
// concatenated and truncated to `length` bytes.
async function hkdfExpand({ prk, info, length }) {
    const n = Math.ceil(length / hkdfHashLen);
    let okm = new Uint8Array(n*hkdfHashLen);
    let t = new Uint8Array();
    for (let i = 0; i < n; i++) {
        t = await crypto.subtle.sign(hkdfAlgo, prk, new Uint8Array([...new Uint8Array(t), ...info, i + 1]));
        okm.set(new Uint8Array(t), i*hkdfHashLen);
    }
    okm = okm.slice(0, length);
    return okm;
}

// Full HKDF. A missing salt defaults to `hkdfHashLen` zero bytes.
async function hkdfExtractAndExpand({ ikm, info, salt, length }) {
    salt = salt || new Uint8Array(hkdfHashLen);
    const prk = await hkdfExtract({ salt, ikm });
    return hkdfExpand({ prk, info, length });
}

// generate media keys from the base64 `mediaKey` message field: 112 bytes of
// HKDF output split into iv (16), encKey (32), macKey (32) and refKey (32).
async function generateMediaKeys(type, mediaKeyString) {
    const mediaKey = Uint8Array.from(window.atob(mediaKeyString), c => c.charCodeAt(0));
    const infoString = mediaHkdfInfo(type);
    const info = utf8Encoder.encode(infoString);
    const key = await hkdfExtractAndExpand({
        ikm: mediaKey,
        info,
        length: 112,
    });
    return {
        iv: key.slice(0, 16),
        encKey: key.slice(16, 48),
        macKey: key.slice(48, 80),
        refKey: key.slice(80, 112)
    };
}

// AES-CBC decrypts a downloaded media file. The trailing 10 bytes are the
// MAC, which is dropped without being verified.
async function decryptMedia(mediaKeys, bytes) {
    const key = await crypto.subtle.importKey("raw", mediaKeys.encKey, "AES-CBC", false, ["decrypt"]);
    bytes = bytes.slice(0, -10); // drop the mac
    const cleartext = await crypto.subtle.decrypt({ name: "AES-CBC", iv: mediaKeys.iv }, key, bytes);
    return cleartext;
}

// this seems to be the fallback CDN domain for WhatsApp media
const mediaHostname = "mmg.whatsapp.net";

// downloads and decrypts a media message. first looks in the cache to minimize
// downloads.
async function downloadAndDecryptMedia(config, mediaCache, stats, msg) {
    // Returns the cleartext media bytes for `msg` as an ArrayBuffer, or null
    // when the media cannot be obtained (missing key or file hash, not cached
    // while `config.dumpOnlyCachedMedia` is set, or a failed download). The
    // outcome is recorded in the `stats` sets, keyed by message id.
    // TODO figure out what to do when there's no media key
    if (msg.mediaKey === undefined) {
        stats.noMediaKey.add(msg.id);
        return null;
    }
    if (msg.filehash === undefined) {
        stats.noFileHash.add(msg.id);
        // Without a file hash both the cache key and the key the caller
        // stores the blob under would be the literal string "undefined".
        return null;
    }
    const cacheKey = `https://_media_cache_v2_.whatsapp.com/${encodeURIComponent(`lru-media-array-buffer-cache_${msg.filehash}`)}`;
    const cachedBytes = await mediaCache.match(cacheKey);
    if (cachedBytes) {
        stats.cachedMediaDownloads.add(msg.id);
        return cachedBytes.arrayBuffer();
    } else if (config.dumpOnlyCachedMedia) {
        return null;
    }
    const mediaKeys = await generateMediaKeys(msg.type, msg.mediaKey);
    const fileUrl = `https://${mediaHostname}${msg.directPath}`;
    let fileResp = null;
    // TODO figure out why there are so many bad URLs in IndexedDB
    try {
        fileResp = await fetch(fileUrl);
    } catch (e) {
        stats.failedMediaDownload.add(msg.id);
        return null;
    }
    if (!fileResp.ok) {
        stats.failedMediaDownload.add(msg.id);
        return null;
    }
    stats.successfulMediaDownloads.add(msg.id);
    const bytes = await fileResp.arrayBuffer();
    const cleartext = await decryptMedia(mediaKeys, bytes);
    if (config.saveDownloadedMediaToCache) {
        await mediaCache.put(cacheKey, new Response(cleartext));
    }
    return cleartext;
}

// Base64-encodes the contents of an ArrayBuffer.
function arrayBufferToBase64(buffer) {
    let binary = "";
    const bytes = new Uint8Array( buffer );
    const len = bytes.byteLength;
    for (let i = 0; i < len; i++) {
        binary += String.fromCharCode( bytes[ i ] );
    }
    return window.btoa(binary);
}

// decrypt and decode a single message. The message object is modified in
// place: `msgRowOpaqueData` is replaced by the base64 cleartext in
// `msgRowData`, "chat" messages additionally get the decoded protobuf in
// `msgRow`, and media (when requested) is stored in `mediaBlobs` under the
// message's `filehash`. The message is always appended to `messages`.
async function decryptMessage(config, mediaCache, { algorithm, key }, stats, messages, mediaBlobs, encodedMessage) {
    if (encodedMessage.msgRowOpaqueData) {
        const msgBytes = await crypto.subtle.decrypt(
            { ...algorithm, iv: encodedMessage.msgRowOpaqueData.iv },
            key,
            encodedMessage.msgRowOpaqueData._data,
        );
        delete encodedMessage.msgRowOpaqueData;
        encodedMessage.msgRowData = arrayBufferToBase64(msgBytes);
        if (encodedMessage.type === "chat") {
            let decoded = null;
            try {
                decoded = decodeWhatsAppProtobufMessage(msgBytes);
            } catch (e) {
                console.error(`could not decode message ${encodedMessage.id}`, e);
                throw e;
            }
            encodedMessage.msgRow = decoded;
        } else if (config.dumpMedia && isMediaMessage(encodedMessage.type)) {
            let mediaBytes = null;
            try {
                mediaBytes = await downloadAndDecryptMedia(config, mediaCache, stats, encodedMessage);
            } catch (e) {
                console.error(`could not download and decrypt media for message ${encodedMessage.id}`, e);
                throw e;
            }
            if (mediaBytes !== null) {
                mediaBlobs[encodedMessage.filehash] = mediaBytes;
            }
        } else {
            stats.unknownType.add(encodedMessage.id);
        }
    }
    messages.push(encodedMessage);
}

// fetch and decrypt all messages, one at a time, then call
// `cont(messages, mediaBlobs)` and log a summary of what was skipped.
function dumpMessages(config, { db, mediaCache, decryptArgs }, cont) {
    console.log("fetching messages");
    const objectStore = db.transaction("message").objectStore("message");
    const mediaBlobs = {};
    const stats = {
        unknownType: new Set(),
        noMediaKey: new Set(),
        noFileHash: new Set(),
        failedMediaDownload: new Set(),
        successfulMediaDownloads: new Set(),
        cachedMediaDownloads: new Set(),
    };
    objectStore.getAll().onsuccess = async (e) => {
        const messages = [];
        const seenTypes = new Set();
        console.log("fetched all messages, decrypting");
        for (const msg of e.target.result) {
            seenTypes.add(msg.type);
            await decryptMessage(config, mediaCache, decryptArgs, stats, messages, mediaBlobs, msg);
        }
        console.log(`${messages.length} messages decoded`);
        console.log(`${stats.unknownType.size} messages skipped because of unknown type`);
        console.log(`${stats.noMediaKey.size} messages skipped because they had no mediaKey`);
        console.log(`${stats.noFileHash.size} messages skipped because they had no filehash`);
        console.log(`${stats.failedMediaDownload.size} failed media downloads`);
        console.log(`${stats.successfulMediaDownloads.size} successful media downloads`);
        console.log(`${stats.cachedMediaDownloads.size} cached media downloads`);
        console.log("seen message types", seenTypes);
        cont(messages, mediaBlobs);
    };
}

// Get the args to pass to window.subtle.decrypt. They are kept on `window`
// so that re-running the snippet can reuse them.
window.whatsappDecryptArgs = window.whatsappDecryptArgs || null;
async function retrieveMessageDecryptArgs({ db }, withDecryptArgs) {
    // we get any message, and keep trying to decrypt it with the args
    // until one works.
    if (window.whatsappDecryptArgs !== null) {
        console.log("reusing previously stored decryption arguments");
        withDecryptArgs(window.whatsappDecryptArgs);
        return;
    }
    console.log("no decrypt args found, waiting for them (open a few chats!)");
    const objectStore = db.transaction("message").objectStore("message");
    objectStore.openCursor().onsuccess = (event) => {
        const cursor = event.target.result;
        if (cursor === null) {
            // The cursor ran off the end of the store without finding an
            // encrypted chat message, so there is nothing to test against.
            console.error("no encrypted chat message found to test decryption arguments against");
            return;
        }
        const message = cursor.value;
        if (!(message.msgRowOpaqueData && message.type === "chat")) {
            cursor.continue();
            return;
        }
        const testData = message.msgRowOpaqueData;
        const originalDecrypt = window.crypto.subtle.decrypt;
        // Wrap `decrypt` so that every call made by the page also tries its
        // algorithm and key on our test message. The page's own call is
        // always forwarded unchanged.
        window.crypto.subtle.decrypt = function (algorithm, key, data) {
            // try to decode
            if (window.whatsappDecryptArgs === null) {
                // eslint-disable-next-line @typescript-eslint/no-this-alias
                const that = this;
                (async () => {
                    try {
                        const msgBytes = await originalDecrypt.call(that, { ...algorithm, iv: testData.iv }, key, testData._data);
                        decodeWhatsAppProtobufMessage(msgBytes);
                        // We've made it, store the key
                        if (window.whatsappDecryptArgs !== null) { return; } // somebody might have gotten there first, it's async
                        window.crypto.subtle.decrypt = originalDecrypt;
                        window.whatsappDecryptArgs = { algorithm: { ...algorithm }, key };
                        delete window.whatsappDecryptArgs.algorithm.iv;
                        console.log("decrypt args found", window.whatsappDecryptArgs);
                        withDecryptArgs(window.whatsappDecryptArgs);
                    } catch (e) {
                        console.debug("could not decode test data", e);
                    }
                })();
            }
            return originalDecrypt.call(this, algorithm, key, data);
        };
    };
}

// dumps an entire object store, passing all of its records to `cont`
function dumpObjectStore(db, name, cont) {
    const objectStore = db.transaction(name).objectStore(name);
    console.log(`fetching object store ${name}`);
    objectStore.getAll().onsuccess = (event) => cont(event.target.result);
}

// sniffs the message encryption key, decrypts all the messages (downloading
// & decrypting the media if requested), saves other useful object stores,
// and packs them in a `whatsapp.tar` file.
442 | async function dumpWhatsApp(config) { 443 | // we first open the two main things we need -- the media cache and the database 444 | const mediaCache = await caches.open("lru-media-array-buffer-cache"); 445 | indexedDB.open("model-storage").onsuccess = (modelEv) => { 446 | const db = modelEv.target.result; 447 | // then we sniff the message key and dump the messages 448 | retrieveMessageDecryptArgs({ db, mediaCache }, decryptArgs => 449 | dumpMessages(config, { db, mediaCache, decryptArgs }, (messages, mediaBlobs) => 450 | dumpObjectStore(db, "chat", chats => 451 | dumpObjectStore(db, "contact", contacts => 452 | dumpObjectStore(db, "group-metadata", async groups => { 453 | const tarContents = [ 454 | ["message.json", await utf8Encoder.encode(JSON.stringify(messages))], 455 | ["chat.json", await utf8Encoder.encode(JSON.stringify(chats))], 456 | ["contact.json", await utf8Encoder.encode(JSON.stringify(contacts))], 457 | ["group-metadata.json", await utf8Encoder.encode(JSON.stringify(groups))], 458 | ]; 459 | for (const [hash, blob] of Object.entries(mediaBlobs)) { 460 | tarContents.push([ 461 | `media/${hash.replace(/\//g, "_").replace(/\+/g, "-").replace(/=+$/, "")}`, 462 | blob 463 | ]); 464 | } 465 | saveTar("whatsapp.tar", tarContents); 466 | } 467 | ))))); 468 | }; 469 | } 470 | 471 | dumpWhatsApp({ 472 | // Save media on top of text messages 473 | dumpMedia: false, 474 | // Dump only media which is already cached locally. Only relevant if `dumpMedia` is 475 | // true. 476 | dumpOnlyCachedMedia: true, 477 | // Cache newly downloaded media, so that it won't be redownloaded the next time. 478 | // note. Only relevant if `dumpOnlyCachedMedia` is false. 479 | saveDownloadedMediaToCache: true, 480 | }); 481 | })(); 482 | --------------------------------------------------------------------------------