├── examples ├── image-llm │ ├── .env.example │ ├── test.jpg │ ├── README.md │ ├── src │ │ └── index.ts │ ├── package.json │ └── tsconfig.json ├── plain-text-conversion │ ├── test.txt │ ├── src │ │ └── index.ts │ ├── README.md │ ├── package.json │ └── tsconfig.json ├── image-metadata │ ├── test.jpg │ ├── src │ │ └── index.ts │ ├── README.md │ ├── package.json │ └── tsconfig.json ├── pdf-conversion │ ├── test.pdf │ ├── src │ │ └── index.ts │ ├── README.md │ ├── package.json │ └── tsconfig.json ├── audio-transcript │ ├── test.wav │ ├── src │ │ └── index.ts │ ├── package.json │ ├── README.md │ └── tsconfig.json ├── docx-conversion │ ├── test.docx │ ├── src │ │ └── index.ts │ ├── README.md │ ├── package.json │ └── tsconfig.json ├── xlsx-conversion │ ├── test.xlsx │ ├── src │ │ └── index.ts │ ├── README.md │ ├── package.json │ └── tsconfig.json ├── zip-conversion │ ├── test_files.zip │ ├── README.md │ ├── src │ │ └── index.ts │ ├── package.json │ └── tsconfig.json ├── html-conversion │ ├── src │ │ └── index.ts │ ├── README.md │ ├── package.json │ ├── tsconfig.json │ └── test.html ├── bing-serp-conversion │ ├── src │ │ └── index.ts │ ├── README.md │ ├── package.json │ └── tsconfig.json ├── ipynb-conversion │ ├── src │ │ └── index.ts │ ├── README.md │ ├── package.json │ ├── tsconfig.json │ └── test_notebook.ipynb ├── wikipedia-conversion │ ├── src │ │ └── index.ts │ ├── README.md │ ├── package.json │ └── tsconfig.json ├── url-conversions │ ├── README.md │ ├── package.json │ ├── tsconfig.json │ └── src │ │ └── index.ts └── youtube-transcript │ ├── README.md │ ├── src │ └── index.ts │ ├── package.json │ └── tsconfig.json ├── pnpm-workspace.yaml ├── src ├── index.ts ├── declarations.d.ts ├── types.ts ├── converters │ ├── plain-text.ts │ ├── docx.ts │ ├── pdf.ts │ ├── media.ts │ ├── xlsx.ts │ ├── html.ts │ ├── wikipedia.ts │ ├── ipynb.ts │ ├── wav.ts │ ├── mp3.ts │ ├── image.ts │ ├── bingserp.ts │ ├── zip.ts │ ├── xml-rss-atom.ts │ └── youtube.ts ├── custom-turndown.ts └── 
markitdown.ts ├── test ├── __files │ ├── test.txt │ ├── test.docx │ ├── test.jpg │ ├── test.pdf │ ├── test.pptx │ ├── test.wav │ ├── test.xlsx │ ├── test_llm.jpg │ ├── test_files.zip │ ├── test_mskanji.csv │ ├── test_with_comment.docx │ ├── test_notebook.ipynb │ └── test_blog.html ├── test.data.ts └── index.test.ts ├── .github ├── renovate.json └── workflows │ ├── release.yml │ └── ci.yml ├── .npmrc ├── .prettierrc ├── tsconfig.json ├── LICENSE ├── .gitignore ├── package.json └── README.md /examples/image-llm/.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY="" 2 | -------------------------------------------------------------------------------- /pnpm-workspace.yaml: -------------------------------------------------------------------------------- 1 | packages: 2 | - "examples/*" 3 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | export { MarkItDown } from "./markitdown"; 2 | -------------------------------------------------------------------------------- /test/__files/test.txt: -------------------------------------------------------------------------------- 1 | hello world 2 | hi there 3 | bye bye 4 | 5 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["github>yjl9903/renovate-config"] 3 | } 4 | -------------------------------------------------------------------------------- /examples/plain-text-conversion/test.txt: -------------------------------------------------------------------------------- 1 | hello world 2 | hi there 3 | bye bye 4 | 5 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | 
ignore-workspace-root-check=true 2 | shamefully-hoist=true 3 | shell-emulator=true 4 | -------------------------------------------------------------------------------- /test/__files/test.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/test/__files/test.docx -------------------------------------------------------------------------------- /test/__files/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/test/__files/test.jpg -------------------------------------------------------------------------------- /test/__files/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/test/__files/test.pdf -------------------------------------------------------------------------------- /test/__files/test.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/test/__files/test.pptx -------------------------------------------------------------------------------- /test/__files/test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/test/__files/test.wav -------------------------------------------------------------------------------- /test/__files/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/test/__files/test.xlsx -------------------------------------------------------------------------------- /src/declarations.d.ts: -------------------------------------------------------------------------------- 1 | declare module "@joplin/turndown-plugin-gfm" { 2 | export const gfm: any; 3 | } 4 | 
-------------------------------------------------------------------------------- /test/__files/test_llm.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/test/__files/test_llm.jpg -------------------------------------------------------------------------------- /examples/image-llm/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/examples/image-llm/test.jpg -------------------------------------------------------------------------------- /test/__files/test_files.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/test/__files/test_files.zip -------------------------------------------------------------------------------- /test/__files/test_mskanji.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/test/__files/test_mskanji.csv -------------------------------------------------------------------------------- /examples/image-metadata/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/examples/image-metadata/test.jpg -------------------------------------------------------------------------------- /examples/pdf-conversion/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/examples/pdf-conversion/test.pdf -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "singleQuote": false, 4 | "printWidth": 100, 5 | "trailingComma": "none" 6 | } 7 | 
-------------------------------------------------------------------------------- /examples/audio-transcript/test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/examples/audio-transcript/test.wav -------------------------------------------------------------------------------- /examples/docx-conversion/test.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/examples/docx-conversion/test.docx -------------------------------------------------------------------------------- /examples/xlsx-conversion/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/examples/xlsx-conversion/test.xlsx -------------------------------------------------------------------------------- /test/__files/test_with_comment.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/test/__files/test_with_comment.docx -------------------------------------------------------------------------------- /examples/zip-conversion/test_files.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dead8309/markitdown-ts/HEAD/examples/zip-conversion/test_files.zip -------------------------------------------------------------------------------- /examples/pdf-conversion/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const pdfFile = "./test.pdf"; 6 | const result = await markitdown.convert(pdfFile); 7 | console.log(result?.markdown); 8 | })(); 9 | 
-------------------------------------------------------------------------------- /examples/audio-transcript/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const audioFile = "./test.wav"; 6 | const result = await markitdown.convert(audioFile); 7 | console.log(result?.markdown); 8 | })(); 9 | -------------------------------------------------------------------------------- /examples/docx-conversion/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const docxFile = "./test.docx"; 6 | const result = await markitdown.convert(docxFile); 7 | console.log(result?.markdown); 8 | })(); 9 | -------------------------------------------------------------------------------- /examples/html-conversion/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const htmlFile = "./test.html"; 6 | const result = await markitdown.convert(htmlFile); 7 | console.log(result?.markdown); 8 | })(); 9 | -------------------------------------------------------------------------------- /examples/image-metadata/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const imageFile = "./test.jpg"; 6 | const result = await markitdown.convert(imageFile); 7 | console.log(result?.markdown); 8 | })(); 9 | -------------------------------------------------------------------------------- /examples/xlsx-conversion/src/index.ts: 
-------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const xlsxFile = "./test.xlsx"; 6 | const result = await markitdown.convert(xlsxFile); 7 | console.log(result?.markdown); 8 | })(); 9 | -------------------------------------------------------------------------------- /examples/plain-text-conversion/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const textFile = "./test.txt"; 6 | const result = await markitdown.convert(textFile); 7 | console.log(result?.markdown); 8 | })(); 9 | -------------------------------------------------------------------------------- /examples/bing-serp-conversion/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const serpFile = "./test_serp.html"; 6 | const result = await markitdown.convert(serpFile); 7 | console.log(result?.markdown); 8 | })(); 9 | -------------------------------------------------------------------------------- /examples/ipynb-conversion/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const ipynbFile = "./test_notebook.ipynb"; 6 | const result = await markitdown.convert(ipynbFile); 7 | console.log(result?.markdown); 8 | })(); 9 | -------------------------------------------------------------------------------- /examples/zip-conversion/README.md: -------------------------------------------------------------------------------- 1 | # zip-example 2 | 3 | 1. 
Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/wikipedia-conversion/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const wikipediaFile = "./test_wikipedia.html"; 6 | const result = await markitdown.convert(wikipediaFile); 7 | console.log(result?.markdown); 8 | })(); 9 | -------------------------------------------------------------------------------- /examples/image-metadata/README.md: -------------------------------------------------------------------------------- 1 | # image-metadata-example 2 | 3 | 1. Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/pdf-conversion/README.md: -------------------------------------------------------------------------------- 1 | # pdf-conversion-example 2 | 3 | 1. Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/docx-conversion/README.md: -------------------------------------------------------------------------------- 1 | # docx-conversion-example 2 | 3 | 1. 
Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/html-conversion/README.md: -------------------------------------------------------------------------------- 1 | # html-conversion-example 2 | 3 | 1. Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/ipynb-conversion/README.md: -------------------------------------------------------------------------------- 1 | # ipynb-conversion-example 2 | 3 | 1. Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/url-conversions/README.md: -------------------------------------------------------------------------------- 1 | # url-conversions-example 2 | 3 | 1. Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/xlsx-conversion/README.md: -------------------------------------------------------------------------------- 1 | # xlsx-conversion-example 2 | 3 | 1. 
Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/youtube-transcript/README.md: -------------------------------------------------------------------------------- 1 | # youtube-transcript-example 2 | 3 | 1. Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/bing-serp-conversion/README.md: -------------------------------------------------------------------------------- 1 | # bing-serp-conversion-example 2 | 3 | 1. Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/plain-text-conversion/README.md: -------------------------------------------------------------------------------- 1 | # plain-text-conversion-example 2 | 3 | 1. Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/wikipedia-conversion/README.md: -------------------------------------------------------------------------------- 1 | # wikipedia-conversion-example 2 | 3 | 1. 
Run the following commands from the root directory of the markitdown-ts repo: 4 | 5 | ```sh 6 | pnpm install 7 | pnpm build 8 | ``` 9 | 10 | 2. Run the example with the following command: 11 | 12 | ```sh 13 | pnpm tsx src/index.ts 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/zip-conversion/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | import path from "path"; 3 | 4 | (async () => { 5 | const markitdown = new MarkItDown(); 6 | const zipFile = path.normalize(`${__dirname}/../test_files.zip`); 7 | const result = await markitdown.convert(zipFile); 8 | console.log(result?.markdown); 9 | })(); 10 | -------------------------------------------------------------------------------- /examples/youtube-transcript/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const result = await markitdown.convert("https://www.youtube.com/watch?v=V2qZ_lgxTzg", { 6 | enableYoutubeTranscript: true, 7 | youtubeTranscriptLanguage: "en" 8 | }); 9 | console.log(result?.markdown); 10 | })(); 11 | -------------------------------------------------------------------------------- /examples/zip-conversion/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "zip-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/image-metadata/package.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "image-metadata-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/pdf-conversion/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pdf-conversion-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/audio-transcript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "audio-transcript-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/docx-conversion/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "docx-conversion-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | 
"markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/html-conversion/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "html-conversion-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/ipynb-conversion/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ipynb-conversion-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/url-conversions/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "url-conversions-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/xlsx-conversion/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "xlsx-conversion-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": 
"tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/youtube-transcript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "youtube-transcript-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/bing-serp-conversion/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bing-serp-conversion-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/plain-text-conversion/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "plain-text-conversion-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/wikipedia-conversion/package.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "wikipedia-conversion-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "markitdown-ts": "workspace:*" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/image-llm/README.md: -------------------------------------------------------------------------------- 1 | # image-llm-example 2 | 3 | 1. Create a .env file with the following content: 4 | ```sh 5 | OPENAI_API_KEY="YOUR_OPENAI_API_KEY" 6 | ``` 7 | 8 | 2. Run the following commands from the root directory of the markitdown-ts repo: 9 | 10 | ```sh 11 | pnpm install 12 | pnpm build 13 | ``` 14 | 15 | 3. Run the example with the following command: 16 | 17 | ```sh 18 | pnpm tsx src/index.ts 19 | ``` 20 | -------------------------------------------------------------------------------- /examples/audio-transcript/README.md: -------------------------------------------------------------------------------- 1 | # audio-transcript-example 2 | 3 | > [!NOTE] 4 | > 5 | > Speech recognition is not yet implemented. Please raise a PR if you want to implement it yourself. 6 | 7 | 1. Run the following commands from the root directory of the markitdown-ts repo: 8 | 9 | ```sh 10 | pnpm install 11 | pnpm build 12 | ``` 13 | 14 | 2.
Run the example with the following command: 15 | 16 | ```sh 17 | pnpm tsx src/index.ts 18 | ``` 19 | -------------------------------------------------------------------------------- /examples/image-llm/src/index.ts: -------------------------------------------------------------------------------- 1 | import { openai } from "@ai-sdk/openai"; 2 | import { MarkItDown } from "markitdown-ts"; 3 | import { configDotenv } from "dotenv"; 4 | 5 | configDotenv(); 6 | 7 | (async () => { 8 | const markitdown = new MarkItDown(); 9 | const imageFile = "./test.jpg"; 10 | const result = await markitdown.convert(imageFile, { 11 | llmModel: openai("gpt-4o-mini"), 12 | }); 13 | console.log(result?.markdown); 14 | })(); 15 | -------------------------------------------------------------------------------- /examples/image-llm/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "image-llm-example", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "tsx src/index.ts" 7 | }, 8 | "devDependencies": { 9 | "@types/node": "^22.10.2", 10 | "tsx": "^4.19.2", 11 | "typescript": "^5.7.2" 12 | }, 13 | "dependencies": { 14 | "@ai-sdk/openai": "^2.0.62", 15 | "dotenv": "^16.4.7", 16 | "markitdown-ts": "workspace:*" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://json.schemastore.org/tsconfig", 3 | "compilerOptions": { 4 | "target": "ESNext", 5 | "module": "ESNext", 6 | "lib": [ 7 | "ESNext" 8 | ], 9 | "moduleResolution": "Node", 10 | "esModuleInterop": true, 11 | "strict": true, 12 | "strictNullChecks": true, 13 | "resolveJsonModule": true, 14 | "skipLibCheck": true, 15 | "skipDefaultLibCheck": true 16 | }, 17 | "exclude": [ 18 | "node_modules" 19 | ] 20 | } -------------------------------------------------------------------------------- 
/examples/image-llm/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/audio-transcript/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/docx-conversion/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/html-conversion/tsconfig.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/image-metadata/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/ipynb-conversion/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/pdf-conversion/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 
2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/url-conversions/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/xlsx-conversion/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/zip-conversion/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": 
true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/bing-serp-conversion/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/plain-text-conversion/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/wikipedia-conversion/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | 
"module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /examples/youtube-transcript/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "declaration": true, 5 | "sourceMap": true, 6 | "target": "es2022", 7 | "lib": ["es2022", "dom"], 8 | "module": "esnext", 9 | "types": ["node"], 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "moduleResolution": "node", 13 | "rootDir": "./src", 14 | "outDir": "./build", 15 | "skipLibCheck": true 16 | }, 17 | "include": ["src/**/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | 12 | permissions: 13 | contents: write 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | 20 | - uses: actions/setup-node@v4 21 | with: 22 | node-version: 20.x 23 | 24 | - run: npx changelogithub 25 | env: 26 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 27 | -------------------------------------------------------------------------------- /examples/url-conversions/src/index.ts: -------------------------------------------------------------------------------- 1 | import { MarkItDown } from "markitdown-ts"; 2 | 3 | (async () => { 4 | const markitdown = new MarkItDown(); 5 | const URLS = [ 6 | "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math", 7 | "https://en.wikipedia.org/wiki/Microsoft", 8 | 
"https://www.youtube.com/watch?v=V2qZ_lgxTzg", 9 | "https://www.bing.com/search?q=microsoft+wikipedia", 10 | "https://arxiv.org/pdf/2308.08155v2.pdf" 11 | ]; 12 | for (const url of URLS) { 13 | const result = await markitdown.convert(url); 14 | console.log(result?.markdown); 15 | } 16 | })(); 17 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Use Exiftool 18 | uses: woss/exiftool-action@v12.87 19 | 20 | - name: Setup pnpm 21 | uses: pnpm/action-setup@v4.0.0 22 | 23 | - name: Setup node 24 | uses: actions/setup-node@v4 25 | with: 26 | node-version: 20.x 27 | cache: pnpm 28 | 29 | - name: Install 30 | run: pnpm install 31 | 32 | - name: Build 33 | run: pnpm build 34 | 35 | - name: Test 36 | run: pnpm test 37 | -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | import { LanguageModel } from "ai"; 2 | import mammoth from "mammoth"; 3 | 4 | export type ConverterResult = 5 | | { 6 | title: string | null; 7 | markdown: string; 8 | /** @deprecated Use `markdown` instead. 
*/ 9 | text_content: string; 10 | } 11 | | null 12 | | undefined; 13 | 14 | export type ConverterOptions = { 15 | llmModel?: LanguageModel; 16 | llmPrompt?: string; 17 | file_extension?: string; 18 | url?: string; 19 | fetch?: typeof fetch; 20 | enableYoutubeTranscript?: boolean; 21 | youtubeTranscriptLanguage?: string; 22 | cleanupExtracted?: boolean; 23 | // 24 | _parent_converters?: DocumentConverter[]; 25 | } & MammothOptions; 26 | 27 | type MammothOptions = Parameters<typeof mammoth.convertToHtml>[1]; 28 | 29 | export interface DocumentConverter { 30 | convert(source: string | Buffer, options: ConverterOptions): Promise<ConverterResult>; 31 | } 32 | -------------------------------------------------------------------------------- /src/converters/plain-text.ts: -------------------------------------------------------------------------------- 1 | import { ConverterOptions, DocumentConverter, ConverterResult } from "../types"; 2 | 3 | import * as mime from "mime-types"; 4 | import fs from "fs"; 5 | 6 | export class PlainTextConverter implements DocumentConverter { 7 | async convert(source: string | Buffer, options: ConverterOptions = {}): Promise<ConverterResult> { 8 | const fileExtension = options.file_extension || ""; 9 | const contentType = mime.lookup(fileExtension); 10 | 11 | if (!contentType) { 12 | return null; 13 | } else if (!contentType.toLowerCase().includes("text/")) { 14 | return null; 15 | } 16 | 17 | let content: string; 18 | if (typeof source === "string") { 19 | content = fs.readFileSync(source, { encoding: "utf-8" }); 20 | } else { 21 | content = Buffer.from(source).toString("utf-8"); 22 | } 23 | 24 | return { 25 | title: null, 26 | markdown: content, 27 | text_content: content 28 | }; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Vaibhav Raj 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /src/converters/docx.ts: -------------------------------------------------------------------------------- 1 | import { ConverterOptions, ConverterResult } from "../types"; 2 | import * as fs from "fs"; 3 | import { HtmlConverter } from "./html"; 4 | import Mammoth from "mammoth"; 5 | 6 | export class DocxConverter extends HtmlConverter { 7 | async convert(source: string | Buffer, options: ConverterOptions): Promise<ConverterResult> { 8 | const fileExtension = options.file_extension || ""; 9 | if (![".docx"].includes(fileExtension.toLowerCase())) { 10 | return null; 11 | } 12 | 13 | try { 14 | let mammothInput: { path: string } | { buffer: Buffer }; 15 | if (typeof source === "string") { 16 | if (!fs.existsSync(source)) { 17 | throw new Error("File doesn't exist"); 18 | } 19 | mammothInput = { path: source }; 20 | } else { 21 | mammothInput = { buffer: Buffer.from(source) }; 22 | } 23 | 24 | let htmlContent = await Mammoth.convertToHtml(mammothInput, { 25 | ...options 26 | }); 27 | 28 | return await this._convert(htmlContent.value); 29 | } catch (e) { 30 | console.error(e); 31 | return null; 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/converters/pdf.ts: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import { PDFParse } from "pdf-parse"; 3 | import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; 4 | 5 | export class PdfConverter implements DocumentConverter { 6 | async convert( 7 | source: string | Buffer, 8 | options: ConverterOptions = {} 9 | ): Promise<ConverterResult> { 10 | const fileExtension = options.file_extension || ""; 11 | if (![".pdf"].includes(fileExtension.toLowerCase())) { 12 | return null; 13 | } 14 | 15 | try { 16 | const pdfContent = typeof source === "string" ?
fs.readFileSync(source) : Buffer.from(source); 17 | return this._convert(pdfContent); 18 | } catch (error) { 19 | console.error("PDF Parsing Error:", error); 20 | return null; 21 | } 22 | } 23 | private async _convert(pdfContent: Buffer): Promise<ConverterResult> { 24 | try { 25 | const parser = new PDFParse({ data: pdfContent }); 26 | const result = await parser.getText(); 27 | await parser.destroy(); 28 | return { title: null, markdown: result.text, text_content: result.text }; 29 | } catch (error) { 30 | console.error("PDF Parsing Error:", error); 31 | return null; 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/converters/media.ts: -------------------------------------------------------------------------------- 1 | import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; 2 | import * as childProcess from "child_process"; 3 | import * as util from "util"; 4 | 5 | const exec = util.promisify(childProcess.exec); 6 | 7 | export abstract class MediaConverter implements DocumentConverter { 8 | abstract convert( 9 | source: string | Buffer, 10 | options: ConverterOptions 11 | ): Promise<ConverterResult>; 12 | 13 | async _getMetadata(local_path: string): Promise<{ [key: string]: string } | null> { 14 | const exiftool = await this._which("exiftool"); 15 | if (!exiftool) { 16 | console.error("exiftool is not found on this system so metadata cannot be extracted"); 17 | return null; 18 | } 19 | try { 20 | const result = await exec(`"${exiftool}" -json "${local_path}"`); 21 | return JSON.parse(result.stdout)[0]; 22 | } catch (error) { 23 | console.error("Exiftool error:", error); 24 | return null; 25 | } 26 | } 27 | private async _which(command: string): Promise<string | null> { 28 | try { 29 | const result = await exec(`which ${command}`); 30 | return result.stdout.trim(); 31 | } catch (error) { 32 | console.warn("Which command error:", error); 33 | return null; 34 | } 35 | } 36 | } 37 |
-------------------------------------------------------------------------------- /src/converters/xlsx.ts: -------------------------------------------------------------------------------- 1 | import { ConverterOptions, ConverterResult } from "../types"; 2 | import { HtmlConverter } from "./html"; 3 | import * as fs from "fs"; 4 | import * as XLSX from "xlsx"; 5 | 6 | export class XlsxConverter extends HtmlConverter { 7 | async convert(source: string | Buffer, options: ConverterOptions): Promise<ConverterResult> { 8 | const extension = options.file_extension || ""; 9 | if (![".xlsx"].includes(extension.toLowerCase())) { 10 | return null; 11 | } 12 | 13 | try { 14 | let workbook: XLSX.WorkBook; 15 | if (typeof source === "string") { 16 | if (!fs.existsSync(source)) { 17 | throw new Error("File doesn't exist"); 18 | } 19 | workbook = XLSX.readFile(source); 20 | } else { 21 | workbook = XLSX.read(source, { type: "buffer" }); 22 | } 23 | 24 | let mdContent = ""; 25 | 26 | for (const sheetName of workbook.SheetNames) { 27 | mdContent += `## ${sheetName}\n`; 28 | let htmlContent = XLSX.utils.sheet_to_html(workbook.Sheets[sheetName]); 29 | mdContent += (await this._convert(htmlContent))?.markdown.trim() + "\n\n"; 30 | } 31 | return { title: workbook?.Props?.Title || "Untitled", markdown: mdContent, text_content: mdContent }; 32 | } catch (e) { 33 | console.error(e); 34 | return null; 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependency directory 2 | node_modules 3 | 4 | # Build output 5 | dist 6 | 7 | # Rest pulled from https://github.com/github/gitignore/blob/master/Node.gitignore 8 | # Logs 9 | logs 10 | *.log 11 | npm-debug.log* 12 | yarn-debug.log* 13 | yarn-error.log* 14 | lerna-debug.log* 15 | 16 | # Diagnostic reports (https://nodejs.org/api/report.html) 17 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 18 | 19 | # Runtime
data 20 | pids 21 | *.pid 22 | *.seed 23 | *.pid.lock 24 | 25 | # Directory for instrumented libs generated by jscoverage/JSCover 26 | lib-cov 27 | 28 | # Coverage directory used by tools like istanbul 29 | coverage 30 | *.lcov 31 | 32 | # nyc test coverage 33 | .nyc_output 34 | 35 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 36 | .grunt 37 | 38 | # Bower dependency directory (https://bower.io/) 39 | bower_components 40 | 41 | # node-waf configuration 42 | .lock-wscript 43 | 44 | # Compiled binary addons (https://nodejs.org/api/addons.html) 45 | build/Release 46 | 47 | # Dependency directories 48 | jspm_packages/ 49 | 50 | # TypeScript v1 declaration files 51 | typings/ 52 | 53 | # TypeScript cache 54 | *.tsbuildinfo 55 | 56 | # Optional npm cache directory 57 | .npm 58 | 59 | # Optional eslint cache 60 | .eslintcache 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # parcel-bundler cache (https://parceljs.org/) 72 | .cache 73 | 74 | # next.js build output 75 | .next 76 | 77 | # nuxt.js build output 78 | .nuxt 79 | 80 | # vuepress build output 81 | .vuepress/dist 82 | 83 | # Serverless directories 84 | .serverless/ 85 | 86 | # FuseBox cache 87 | .fusebox/ 88 | 89 | # DynamoDB Local files 90 | .dynamodb/ 91 | 92 | # OS metadata 93 | .DS_Store 94 | Thumbs.db 95 | 96 | # Ignore built ts files 97 | __tests__/runner/* 98 | lib/**/* -------------------------------------------------------------------------------- /src/converters/html.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import { JSDOM } from "jsdom"; 3 | import { CustomTurnDown } from "../custom-turndown"; 4 | import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; 5 | 6 | export class HtmlConverter implements DocumentConverter { 7 | async convert(source: 
string | Buffer, options: ConverterOptions): Promise<ConverterResult> { 8 | const extension = options.file_extension || ""; 9 | if (![".html", ".htm"].includes(extension.toLowerCase())) { 10 | return null; 11 | } 12 | 13 | try { 14 | let content; 15 | if (typeof source === "string") { 16 | let exists = fs.existsSync(source); 17 | if (!exists) { 18 | throw new Error("File doesn't exist"); 19 | } 20 | content = fs.readFileSync(source, { encoding: "utf-8" }); 21 | } else { 22 | content = source.toString("utf-8"); 23 | } 24 | 25 | return await this._convert(content); 26 | } catch (e) { 27 | console.error(e); 28 | return null; 29 | } 30 | } 31 | 32 | async _convert(htmlContent: string): Promise<ConverterResult> { 33 | const soup = new JSDOM(htmlContent); 34 | // NOTE: I can add this to avoid getting 35 | // ReferenceError: HTMLElement is not defined in CustomTurndown 36 | // but I'm not sure if it's the right way to do it 37 | // global.HTMLElement = soup.window.HTMLElement; 38 | 39 | const doc = soup.window.document; 40 | 41 | doc.querySelectorAll("script, style").forEach((script) => { 42 | script.remove(); 43 | }); 44 | 45 | const bodyElm = doc.querySelector("body"); 46 | let webpageText = ""; 47 | if (bodyElm) { 48 | webpageText = new CustomTurnDown().convert_soup(bodyElm); 49 | } else { 50 | webpageText = new CustomTurnDown().convert_soup(doc); 51 | } 52 | 53 | return { 54 | title: doc.title, 55 | markdown: webpageText, 56 | text_content: webpageText 57 | }; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/converters/wikipedia.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import { JSDOM } from "jsdom"; 3 | import { CustomTurnDown } from "../custom-turndown"; 4 | import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; 5 | 6 | const WIKIPEDIA_REGEX = /^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\//; 7 | const BODY_SELECTOR_QUERY =
"div#mw-content-text"; 8 | const TITLE_SELECTOR_QUERY = "span.mw-page-title-main"; 9 | 10 | export class WikipediaConverter implements DocumentConverter { 11 | async convert(source: string | Buffer, options: ConverterOptions = {}): Promise<ConverterResult> { 12 | const fileExtension = options.file_extension || ""; 13 | if (![".html", ".htm"].includes(fileExtension.toLowerCase())) { 14 | return null; 15 | } 16 | const url = options.url || ""; 17 | if (!WIKIPEDIA_REGEX.test(url)) { 18 | return null; 19 | } 20 | 21 | try { 22 | const htmlContent = 23 | typeof source === "string" 24 | ? fs.readFileSync(source, { encoding: "utf-8" }) 25 | : source.toString("utf-8"); 26 | return this._convert(htmlContent); 27 | } catch (error) { 28 | console.error("Wikipedia Parsing Error:", error); 29 | return null; 30 | } 31 | } 32 | 33 | private _convert(htmlContent: string): ConverterResult { 34 | const dom = new JSDOM(htmlContent); 35 | const doc = dom.window.document; 36 | 37 | doc.querySelectorAll("script, style").forEach((script) => { 38 | script.remove(); 39 | }); 40 | 41 | const bodyElm = doc.querySelector(BODY_SELECTOR_QUERY); 42 | const titleElm = doc.querySelector(TITLE_SELECTOR_QUERY); 43 | 44 | let webpageText = ""; 45 | let mainTitle = doc.title; 46 | 47 | if (bodyElm) { 48 | if (titleElm && titleElm.textContent) { 49 | mainTitle = titleElm.textContent; 50 | } 51 | webpageText = 52 | `# ${mainTitle}\n\n` + new CustomTurnDown().convert_soup(bodyElm as HTMLElement); 53 | } else { 54 | webpageText = new CustomTurnDown().convert_soup(doc); 55 | } 56 | 57 | return { title: mainTitle, markdown: webpageText, text_content: webpageText }; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "markitdown-ts", 3 | "version": "0.0.8", 4 | "description": "", 5 | "keywords": [], 6 | "homepage": "https://github.com/dead8309/markitdown-ts#readme",
7 | "bugs": { 8 | "url": "https://github.com/dead8309/markitdown-ts/issues" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git+https://github.com/dead8309/markitdown-ts.git" 13 | }, 14 | "license": "MIT", 15 | "author": "Vaibhav Raj", 16 | "sideEffects": false, 17 | "exports": { 18 | ".": { 19 | "require": "./dist/index.cjs", 20 | "import": "./dist/index.mjs", 21 | "types": "./dist/index.d.ts" 22 | } 23 | }, 24 | "main": "dist/index.cjs", 25 | "module": "dist/index.mjs", 26 | "types": "dist/index.d.ts", 27 | "files": [ 28 | "dist" 29 | ], 30 | "scripts": { 31 | "build": "unbuild", 32 | "format": "prettier --write src/**/*.ts", 33 | "release": "bumpp --commit --push --tag && pnpm publish", 34 | "preversion": "pnpm typecheck && pnpm build", 35 | "test": "vitest", 36 | "typecheck": "tsc --noEmit" 37 | }, 38 | "devDependencies": { 39 | "@ai-sdk/openai": "^2.0.62", 40 | "@types/jsdom": "^21.1.7", 41 | "@types/mime-types": "^2.1.4", 42 | "@types/node": "^22.10.2", 43 | "@types/turndown": "^5.0.5", 44 | "@types/unzipper": "^0.10.10", 45 | "bumpp": "^9.9.1", 46 | "is-ci": "^4.1.0", 47 | "prettier": "^3.4.2", 48 | "typescript": "^5.7.2", 49 | "unbuild": "^3.0.1", 50 | "vite": "^6.0.4", 51 | "vitest": "^2.1.8", 52 | "zod": "^4.1.8" 53 | }, 54 | "packageManager": "pnpm@9.15.1", 55 | "dependencies": { 56 | "@joplin/turndown-plugin-gfm": "^1.0.60", 57 | "@xmldom/xmldom": "^0.9.6", 58 | "ai": "^5.0.87", 59 | "jsdom": "^25.0.1", 60 | "mammoth": "^1.8.0", 61 | "mime-types": "^2.1.35", 62 | "pdf-parse": "^2.4.5", 63 | "turndown": "^7.2.0", 64 | "xlsx": "^0.18.5" 65 | }, 66 | "peerDependencies": { 67 | "unzipper": "^0.12.3", 68 | "youtube-transcript": "^1.2.1" 69 | }, 70 | "peerDependenciesMeta": { 71 | "youtube-transcript": { 72 | "optional": true 73 | }, 74 | "unzipper": { 75 | "optional": true 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/converters/ipynb.ts: 
-------------------------------------------------------------------------------- 1 | import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; 2 | import * as fs from "fs"; 3 | 4 | export class IpynbConverter implements DocumentConverter { 5 | async convert( 6 | source: string | Buffer, 7 | options: ConverterOptions = {} 8 | ): Promise<ConverterResult> { 9 | const fileExtension = options.file_extension || ""; 10 | if (fileExtension.toLowerCase() !== ".ipynb") { 11 | return null; 12 | } 13 | try { 14 | const contentString = 15 | typeof source === "string" 16 | ? fs.readFileSync(source, { encoding: "utf-8" }) 17 | : source.toString("utf-8"); 18 | const notebookContent = JSON.parse(contentString); 19 | return this._convert(notebookContent); 20 | } catch (error) { 21 | console.error("Error converting .ipynb file:", error); 22 | return null; 23 | } 24 | } 25 | 26 | private _convert(notebookContent: any): ConverterResult { 27 | try { 28 | const mdOutput: string[] = []; 29 | let title: string | null = null; 30 | for (const cell of notebookContent.cells || []) { 31 | const cellType = cell.cell_type || ""; 32 | const sourceLines: string[] = cell.source || []; 33 | 34 | if (cellType === "markdown") { 35 | mdOutput.push(sourceLines.join("")); 36 | if (!title) { 37 | for (const line of sourceLines) { 38 | if (line.startsWith("# ")) { 39 | title = line.substring(line.indexOf("# ") + 2).trim(); 40 | break; 41 | } 42 | } 43 | } 44 | } else if (cellType === "code") { 45 | mdOutput.push(`\`\`\`python\n${sourceLines.join("")}\n\`\`\``); 46 | } else if (cellType === "raw") { 47 | mdOutput.push(`\`\`\`\n${sourceLines.join("")}\n\`\`\``); 48 | } 49 | } 50 | const mdText = mdOutput.join("\n\n"); 51 | title = notebookContent.metadata?.title || title; 52 | return { title: title, markdown: mdText, text_content: mdText }; 53 | } catch (e) { 54 | console.error("Error converting .ipynb file:", e); 55 | throw new Error(`Error converting .ipynb file: ${e}`); 56 | } 57 | } 58 | } 59 |
-------------------------------------------------------------------------------- /src/converters/wav.ts: -------------------------------------------------------------------------------- 1 | import { ConverterOptions, ConverterResult } from "../types"; 2 | import { MediaConverter } from "./media"; 3 | 4 | export class WavConverter extends MediaConverter { 5 | async convert( 6 | source: string | Buffer, 7 | options: ConverterOptions = {} 8 | ): Promise<ConverterResult> { 9 | const fileExtension = options.file_extension || ""; 10 | if (fileExtension.toLowerCase() !== ".wav") { 11 | return null; 12 | } 13 | try { 14 | return this._convert(source, options); 15 | } catch (error) { 16 | console.error("WAV Conversion Error:", error); 17 | return null; 18 | } 19 | } 20 | 21 | private async _convert(source: string | Buffer, _: ConverterOptions): Promise<ConverterResult> { 22 | let mdContent = ""; 23 | 24 | if (typeof source === "string") { 25 | const metadata = await this._getMetadata(source); 26 | if (metadata) { 27 | for (const f of [ 28 | "Title", 29 | "Artist", 30 | "Author", 31 | "Band", 32 | "Album", 33 | "Genre", 34 | "Track", 35 | "DateTimeOriginal", 36 | "CreateDate", 37 | "Duration" 38 | ]) { 39 | if (metadata[f]) { 40 | mdContent += `${f}: ${metadata[f]}\n`; 41 | } 42 | } 43 | } 44 | } else { 45 | console.warn( 46 | "Metadata extraction is skipped for Buffer inputs as it requires a file path for exiftool." 47 | ); 48 | } 49 | 50 | if (typeof source === "string") { 51 | try { 52 | const transcript = await this._transcribeAudio(source); 53 | mdContent += `\n\n### Audio Transcript:\n${ 54 | transcript === "" ? "[No speech detected]" : transcript 55 | }`; 56 | } catch (error) { 57 | console.error("Error loading speech recognition module:", error); 58 | mdContent += "\n\n### Audio Transcript:\nError.
Could not transcribe this audio."; 59 | } 60 | } else { 61 | mdContent += 62 | "\n\n### Audio Transcript:\n[Audio transcription is not supported for Buffer inputs in this version.]"; 63 | } 64 | 65 | return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() }; 66 | } 67 | 68 | // TODO: Add speech to text 69 | protected async _transcribeAudio(_: string): Promise<string> { 70 | throw new Error("TODO: Audio transcription not implemented yet"); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/converters/mp3.ts: -------------------------------------------------------------------------------- 1 | import { ConverterOptions, ConverterResult } from "../types"; 2 | import { WavConverter } from "./wav"; 3 | import * as fs from "fs/promises"; 4 | import * as os from "os"; 5 | import * as path from "path"; 6 | 7 | export class Mp3Converter extends WavConverter { 8 | async convert( 9 | source: string | Buffer, 10 | options: ConverterOptions = {} 11 | ): Promise<ConverterResult> { 12 | const fileExtension = options.file_extension || ""; 13 | if (fileExtension.toLowerCase() !== ".mp3") { 14 | return null; 15 | } 16 | try { 17 | return await this._convert$(source, options); 18 | } catch (error) { 19 | console.error("MP3 Conversion Error:", error); 20 | return null; 21 | } 22 | } 23 | 24 | private async _convert$( 25 | source: string | Buffer, 26 | options: ConverterOptions 27 | ): Promise<ConverterResult> { 28 | let mdContent = ""; 29 | 30 | if (typeof source === "string") { 31 | const metadata = await this._getMetadata(source); 32 | if (metadata) { 33 | for (const f of [ 34 | "Title", 35 | "Artist", 36 | "Author", 37 | "Band", 38 | "Album", 39 | "Genre", 40 | "Track", 41 | "DateTimeOriginal", 42 | "CreateDate", 43 | "Duration" 44 | ]) { 45 | if (metadata[f]) { 46 | mdContent += `${f}: ${metadata[f]}\n`; 47 | } 48 | } 49 | } 50 | } else { 51 | console.warn( 52 | "Metadata extraction is skipped for Buffer inputs as it requires a file path for
exiftool." 53 | ); 54 | } 55 | 56 | if (typeof source === "string") { 57 | const tempPath = await fs.mkdtemp(path.join(os.tmpdir(), "temp_")); 58 | const wavPath = path.join(tempPath, "audio.wav"); 59 | try { 60 | const transcript = await super._transcribeAudio(wavPath); 61 | mdContent += `\n\n### Audio Transcript:\n${transcript == "" ? "[No speech detected]" : transcript}`; 62 | } catch (e) { 63 | mdContent += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."; 64 | } finally { 65 | await fs.unlink(wavPath); 66 | await fs.rmdir(tempPath); 67 | } 68 | } else { 69 | mdContent += 70 | "\n\n### Audio Transcript:\n[Audio conversion and transcription are not supported for Buffer inputs.]"; 71 | } 72 | 73 | return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() }; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /test/__files/test_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0f61db80", 6 | "metadata": {}, 7 | "source": [ 8 | "# Test Notebook" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 11, 14 | "id": "3f2a5bbd", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "markitdown\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "print('markitdown')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "9b9c0468", 32 | "metadata": {}, 33 | "source": [ 34 | "## Code Cell Below" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 10, 40 | "id": "37d8088a", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "42\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "# comment in code\n", 53 | "print(42)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "2e3177bd", 59 | 
"metadata": {}, 60 | "source": [ 61 | "End\n", 62 | "\n", 63 | "---" 64 | ] 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 3", 70 | "language": "python", 71 | "name": "python3" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 3 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython3", 83 | "version": "3.12.8" 84 | }, 85 | "title": "Test Notebook Title" 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 5 89 | } 90 | -------------------------------------------------------------------------------- /examples/ipynb-conversion/test_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0f61db80", 6 | "metadata": {}, 7 | "source": [ 8 | "# Test Notebook" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 11, 14 | "id": "3f2a5bbd", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "markitdown\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "print('markitdown')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "9b9c0468", 32 | "metadata": {}, 33 | "source": [ 34 | "## Code Cell Below" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 10, 40 | "id": "37d8088a", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "42\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "# comment in code\n", 53 | "print(42)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "2e3177bd", 59 | "metadata": {}, 60 | "source": [ 61 | "End\n", 62 | "\n", 63 | "---" 64 | ] 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 3", 70 | "language": "python", 71 | "name": 
"python3" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 3 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython3", 83 | "version": "3.12.8" 84 | }, 85 | "title": "Test Notebook Title" 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 5 89 | } 90 | -------------------------------------------------------------------------------- /src/converters/image.ts: -------------------------------------------------------------------------------- 1 | import { ConverterOptions, ConverterResult } from "../types"; 2 | import { MediaConverter } from "./media"; 3 | import * as fs from "fs"; 4 | import { generateText } from "ai"; 5 | 6 | export class ImageConverter extends MediaConverter { 7 | async convert( 8 | source: string | Buffer, 9 | options: ConverterOptions = {} 10 | ): Promise { 11 | const fileExtension = options.file_extension || ""; 12 | if (![".jpg", ".jpeg", ".png"].includes(fileExtension.toLowerCase())) { 13 | return null; 14 | } 15 | 16 | try { 17 | return this._convert(source, options); 18 | } catch (error) { 19 | console.error("Image Conversion Error:", error); 20 | return null; 21 | } 22 | } 23 | private async _convert( 24 | source: string | Buffer, 25 | options: ConverterOptions 26 | ): Promise { 27 | let mdContent = ""; 28 | 29 | if (typeof source === "string") { 30 | const metadata = await this._getMetadata(source); 31 | if (metadata) { 32 | for (const f of [ 33 | "ImageSize", 34 | "Title", 35 | "Caption", 36 | "Description", 37 | "Keywords", 38 | "Artist", 39 | "Author", 40 | "DateTimeOriginal", 41 | "CreateDate", 42 | "GPSPosition" 43 | ]) { 44 | if (metadata[f]) { 45 | mdContent += `${f}: ${metadata[f]}\n`; 46 | } 47 | } 48 | } 49 | } else { 50 | console.warn( 51 | "Metadata extraction is skipped for Buffer inputs as it requires a file path for exiftool." 
52 | ); 53 | } 54 | 55 | if (options.llmModel) { 56 | const imageBuffer = 57 | typeof source === "string" ? fs.readFileSync(source) : Buffer.from(source); 58 | mdContent += `\n# Description:\n${(await this._getLLMDescription(imageBuffer, options)).trim()}\n`; 59 | } 60 | return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() }; 61 | } 62 | private async _getLLMDescription( 63 | imageBuffer: Buffer, 64 | options: ConverterOptions 65 | ): Promise { 66 | if (!options.llmPrompt || options.llmPrompt.trim() === "") { 67 | options.llmPrompt = "Write a detailed caption for this image."; 68 | } 69 | const imageFileAsBase64 = imageBuffer.toString("base64"); 70 | 71 | const result = await generateText({ 72 | model: options.llmModel!, 73 | messages: [ 74 | { 75 | role: "user", 76 | content: [ 77 | { type: "text", text: options.llmPrompt }, 78 | { 79 | type: "image", 80 | image: imageFileAsBase64 81 | } 82 | ] 83 | } 84 | ] 85 | }); 86 | 87 | return result.text.trim(); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/converters/bingserp.ts: -------------------------------------------------------------------------------- 1 | import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; 2 | import * as fs from "fs"; 3 | import { JSDOM } from "jsdom"; 4 | import { URL, URLSearchParams } from "url"; 5 | import { CustomTurnDown } from "../custom-turndown"; 6 | 7 | export class BingSerpConverter implements DocumentConverter { 8 | async convert( 9 | source: string | Buffer, 10 | options: ConverterOptions = {} 11 | ): Promise { 12 | const fileExtension = options.file_extension || ""; 13 | if (![".html", ".htm"].includes(fileExtension.toLowerCase())) { 14 | return null; 15 | } 16 | const url = options.url || ""; 17 | if (!/^https:\/\/www\.bing\.com\/search\?q=/.test(url)) { 18 | return null; 19 | } 20 | 21 | try { 22 | const htmlContent = 23 | typeof source === "string" 24 | ? 
fs.readFileSync(source, { encoding: "utf-8" }) 25 | : Buffer.from(source).toString("utf-8"); 26 | return this._convert(htmlContent, url); 27 | } catch (error) { 28 | console.error("Bing SERP Parsing Error:", error); 29 | return null; 30 | } 31 | } 32 | private _convert(htmlContent: string, url: string): ConverterResult { 33 | const dom = new JSDOM(htmlContent); 34 | const doc = dom.window.document; 35 | const parsedParams = new URL(url).searchParams; 36 | const query = parsedParams.get("q") || ""; 37 | 38 | doc.querySelectorAll(".tptt").forEach((tptt) => { 39 | if (tptt.textContent) { 40 | tptt.textContent += " "; 41 | } 42 | }); 43 | doc.querySelectorAll(".algoSlug_icon").forEach((slug) => { 44 | slug.remove(); 45 | }); 46 | 47 | const markdownify = new CustomTurnDown(); 48 | const results: string[] = []; 49 | doc.querySelectorAll(".b_algo").forEach((result) => { 50 | result.querySelectorAll("a[href]").forEach((a) => { 51 | try { 52 | const parsedHref = new URL(a.getAttribute("href")!); 53 | const params = parsedHref.searchParams; 54 | const u = params.get("u"); 55 | if (u) { 56 | const decoded = this._decodeBase64Url(u); 57 | a.setAttribute("href", decoded); 58 | } 59 | } catch (e) {} 60 | }); 61 | const mdResult = markdownify.convert_soup(result as HTMLElement).trim(); 62 | const lines = mdResult 63 | .split(/\n+/) 64 | .map((line) => line.trim()) 65 | .filter((line) => line.length > 0); 66 | results.push(lines.join("\n")); 67 | }); 68 | const webpageText = `## A Bing search for '${query}' found the following results:\n\n${results.join("\n\n")}`; 69 | return { title: doc.title, markdown: webpageText, text_content: webpageText }; 70 | } 71 | 72 | private _decodeBase64Url(encodedUrl: string): string { 73 | let u = encodedUrl.slice(2).trim() + "=="; 74 | try { 75 | const decoded = Buffer.from(u, "base64").toString("utf-8"); 76 | return decoded; 77 | } catch (error) { 78 | console.error("Error decoding Base64URL:", error); 79 | return encodedUrl; 80 | } 81 | } 82 | 
} 83 | -------------------------------------------------------------------------------- /src/custom-turndown.ts: -------------------------------------------------------------------------------- 1 | import TurndownService from "turndown"; 2 | import turndownPluginGfm from "@joplin/turndown-plugin-gfm"; 3 | 4 | export class CustomTurnDown { 5 | convert_soup(doc: string | TurndownService.Node): string { 6 | let turnDownService = new TurndownService({ 7 | headingStyle: "atx" 8 | }); 9 | turnDownService.use(turndownPluginGfm.gfm); 10 | 11 | turnDownService.addRule("anchor tags", { 12 | filter: ["a"], 13 | replacement: function (content, node) { 14 | if (content === "") { 15 | return ""; 16 | } 17 | 18 | let prefix = ""; 19 | let suffix = ""; 20 | if (content && content[0] === " ") { 21 | prefix = " "; 22 | } 23 | if (content && content[content.length - 1] === " ") { 24 | suffix = " "; 25 | } 26 | 27 | //NOTE:replace all the characters after \n\n with empty string if its present 28 | let text = content.trim().replace(/\n\n.*/g, ""); 29 | if (text === "") { 30 | return ""; 31 | } 32 | 33 | // NOTE: Ignore the type error for getAttribute and title call 34 | // @ts-ignore 35 | let href = node.getAttribute("href"); 36 | // @ts-ignore 37 | let title = node.title; 38 | 39 | if (href) { 40 | try { 41 | let parsed_url = new URL(href); 42 | if (!["https:", "http:", "file:"].includes(parsed_url.protocol)) { 43 | return `${prefix}${text}${suffix}`; 44 | } 45 | // NOTE: Some Tests were failing if the href was encoded 46 | // href = encodeURIComponent(parsed_url.pathname); 47 | } catch (e) { 48 | if (!/^https?:|^file:/.test(href)) { 49 | return `${prefix}[${text}](${href} "${title}")${suffix}`; 50 | } 51 | return `${prefix}${text}${suffix}`; 52 | } 53 | } 54 | 55 | if (text.replace(/\\_/g, "_") === href && !title) { 56 | return `<${href}>`; 57 | } 58 | 59 | if (!title && href) { 60 | title = href; 61 | } 62 | 63 | let title_part = title ? 
` "${title}"` : ""; 64 | 65 | return `${prefix}[${text}](${href}${title_part})${suffix}`; 66 | } 67 | }); 68 | 69 | turnDownService.addRule("img tags", { 70 | filter: ["img"], 71 | replacement: function (_, node) { 72 | if (!node || node.nodeName !== "IMG") { 73 | return ""; 74 | } 75 | 76 | // NOTE: Ignore the type error for getAttribute calls 77 | // @ts-ignore 78 | let alt = node.getAttribute("alt") || ""; 79 | // @ts-ignore 80 | let src = node.getAttribute("src") || ""; 81 | // @ts-ignore 82 | let title = node.getAttribute("title") || ""; 83 | 84 | let titlePart = title ? ` "${title}"` : ""; 85 | 86 | if (src.startsWith("data:")) { 87 | src = src.split(",")[0] + "..."; 88 | } 89 | 90 | return `![${alt}](${src}${titlePart})`; 91 | } 92 | }); 93 | 94 | let markdown = turnDownService.turndown(doc); 95 | return markdown; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/converters/zip.ts: -------------------------------------------------------------------------------- 1 | import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; 2 | import * as fs from "fs"; 3 | import * as path from "path"; 4 | import { PassThrough } from "stream"; 5 | import unzipper from "unzipper"; 6 | 7 | export class ZipConverter implements DocumentConverter { 8 | async convert( 9 | source: string | Buffer, 10 | options: ConverterOptions = {} 11 | ): Promise { 12 | const fileExtension = options.file_extension || ""; 13 | if (fileExtension.toLowerCase() !== ".zip") { 14 | return null; 15 | } 16 | const parentConverters = options._parent_converters || []; 17 | if (!parentConverters) { 18 | return { 19 | title: null, 20 | markdown: `[ERROR] No converters available to process zip contents from: ${source}`, 21 | text_content: `[ERROR] No converters available to process zip contents from: ${source}` 22 | }; 23 | } 24 | 25 | let unzipper; 26 | try { 27 | unzipper = await import("unzipper").then((mod) => mod.default); 28 | 
} catch (error) { 29 | console.error( 30 | "Optional dependency 'unzipper' is not installed. Run `npm install unzipper` to enable this feature." 31 | ); 32 | return null; 33 | } 34 | 35 | try { 36 | const zipFileName = typeof source === "string" ? path.basename(source) : "archive.zip"; 37 | let mdContent = `Content from the zip file \`${zipFileName}\`:\n\n`; 38 | const mdResults: string[] = []; 39 | 40 | const processEntry = async (entry: unzipper.Entry) => { 41 | const relativePath = entry.path; 42 | if (entry.type === "File") { 43 | const entryExtension = path.extname(relativePath); 44 | const entryBuffer = await entry.buffer(); 45 | 46 | const fileOptions = { 47 | ...options, 48 | file_extension: entryExtension, 49 | _parent_converters: parentConverters 50 | }; 51 | 52 | for (const converter of parentConverters) { 53 | if (converter instanceof ZipConverter) { 54 | continue; 55 | } 56 | const result = await converter.convert(entryBuffer, fileOptions); 57 | if (result) { 58 | mdResults.push(`\n## File: ${relativePath}\n\n${result.markdown}\n\n`); 59 | break; 60 | } 61 | } 62 | } else { 63 | entry.autodrain(); 64 | } 65 | }; 66 | 67 | const inputStream = 68 | typeof source === "string" ? 
fs.createReadStream(source) : new PassThrough().end(source); 69 | 70 | await new Promise((res, rej) => { 71 | const parser = unzipper.Parse(); 72 | 73 | parser.on("entry", (entry: unzipper.Entry) => { 74 | processEntry(entry).catch((err) => { 75 | parser.destroy(err); 76 | rej(err); 77 | }); 78 | }); 79 | parser.on("finish", res); 80 | parser.on("error", rej); 81 | 82 | inputStream.pipe(parser); 83 | }); 84 | 85 | mdContent += mdResults.join(""); 86 | 87 | return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() }; 88 | } catch (error: any) { 89 | if (error.message.includes("invalid signature")) { 90 | return { 91 | title: null, 92 | markdown: `[ERROR] Invalid or corrupted zip file: ${source}`, 93 | text_content: `[ERROR] Invalid or corrupted zip file: ${source}` 94 | }; 95 | } 96 | return { 97 | title: null, 98 | markdown: `[ERROR] Failed to process zip file ${source}: ${String(error)}`, 99 | text_content: `[ERROR] Failed to process zip file ${source}: ${String(error)}` 100 | }; 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /test/test.data.ts: -------------------------------------------------------------------------------- 1 | export const PLAIN_TEST = ["hello world", "hi there", "bye bye"]; 2 | 3 | export const BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"; 4 | export const BLOG_TEST_STRINGS = [ 5 | "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? 
What about inference parameters?", 6 | "an example where high cost can easily prevent a generic complex" 7 | ]; 8 | 9 | export const RSS_TEST_STRINGS = [ 10 | "The Official Microsoft Blog", 11 | "In the case of AI, it is absolutely true that the industry is moving incredibly fast" 12 | ]; 13 | 14 | export const WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"; 15 | export const WIKIPEDIA_TEST_STRINGS = [ 16 | "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", 17 | 'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")' 18 | ]; 19 | export const WIKIPEDIA_TEST_EXCLUDES = [ 20 | "You are encouraged to create an account and log in", 21 | "154 languages", 22 | "move to sidebar" 23 | ]; 24 | 25 | export const YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg"; 26 | export const YOUTUBE_TEST_STRINGS = [ 27 | "## AutoGen FULL Tutorial with Python (Step-By-Step)", 28 | "This is an intermediate tutorial for installing and using AutoGen locally", 29 | "PT15M4S", 30 | ]; 31 | 32 | export const IPYNB_TEST_STRINGS = [ 33 | "Test Notebook", 34 | "## Code Cell Below", 35 | "print(42)", 36 | "print('markitdown')" 37 | ]; 38 | 39 | export const SERP_TEST_URL = "https://www.bing.com/search?q=microsoft+wikipedia"; 40 | export const SERP_TEST_STRINGS = [ 41 | "](https://en.wikipedia.org/wiki/Microsoft", 42 | "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond", 43 | "* 1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox" 44 | ]; 45 | export const SERP_TEST_EXCLUDES = [ 46 | "https://www.bing.com/ck/a?!&&p=", 47 | "data:image/svg+xml,%3Csvg%20width%3D" 48 | ]; 49 | 50 | export const PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"; 51 | export const PDF_TEST_STRINGS = [ 52 | "While there is contemporaneous exploration of multi-agent approaches" 53 | ]; 54 | 55 | export const DOCX_TEST_STRINGS = [ 56 | 
"314b0a30-5b04-470b-b9f7-eed2c2bec74a", 57 | "49e168b7-d2ae-407f-a055-2167576f39a1", 58 | "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", 59 | "# Abstract", 60 | "# Introduction", 61 | "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation" 62 | ]; 63 | 64 | export const DOCX_COMMENT_TEST_STRINGS = [ 65 | "314b0a30-5b04-470b-b9f7-eed2c2bec74a", 66 | "49e168b7-d2ae-407f-a055-2167576f39a1", 67 | "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", 68 | "# Abstract", 69 | "# Introduction", 70 | "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 71 | "This is a test comment. 12df-321a", 72 | "Yet another comment in the doc. 55yiyi-asd09" 73 | ]; 74 | 75 | export const XLSX_TEST_STRINGS = [ 76 | "## 09060124-b5e7-4717-9d07-3c046eb", 77 | "6ff4173b-42a5-4784-9b19-f49caff4d93d", 78 | "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0" 79 | ]; 80 | 81 | export const PPTX_TEST_STRINGS = [ 82 | "2cdda5c8-e50e-4db4-b5f0-9722a649f455", 83 | "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", 84 | "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", 85 | "1b92870d-e3b5-4e65-8153-919f4ff45592", 86 | "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 87 | "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", // chart title 88 | "2003" // chart value 89 | ]; 90 | 91 | export const WAV_TEST_STRINGS = ["Duration: 0:00:51", "Audio Transcript:"]; 92 | 93 | export const JPG_TEST_EXIFTOOL = { 94 | Author: "AutoGen Authors", 95 | Title: "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 96 | Description: "AutoGen enables diverse LLM-based applications", 97 | ImageSize: "1615x1967", 98 | DateTimeOriginal: "2024:03:14 22:10:00" 99 | }; 100 | 101 | export const LLM_TEST_STRINGS = ["5bda1dd6"]; 102 | -------------------------------------------------------------------------------- /src/converters/xml-rss-atom.ts: -------------------------------------------------------------------------------- 1 | import { CustomTurnDown } from "../custom-turndown"; 2 | import { 
ConverterOptions, ConverterResult, DocumentConverter } from "../types"; 3 | import { Document, DOMParser, Element } from "@xmldom/xmldom"; 4 | import * as fs from "fs"; 5 | import { JSDOM } from "jsdom"; 6 | 7 | export class RSSConverter implements DocumentConverter { 8 | async convert(source: string | Buffer, options: ConverterOptions = {}): Promise { 9 | const fileExtension = options.file_extension || ""; 10 | if (![".xml", ".rss", ".atom"].includes(fileExtension.toLowerCase())) { 11 | return null; 12 | } 13 | 14 | try { 15 | const xmlString = 16 | typeof source === "string" 17 | ? fs.readFileSync(source, { encoding: "utf-8" }) 18 | : source.toString("utf-8"); 19 | const doc = new DOMParser().parseFromString(xmlString, "text/xml"); 20 | 21 | let result; 22 | 23 | if (doc.getElementsByTagName("rss").length > 0) { 24 | result = this._parseRssType(doc); 25 | } else if (doc.getElementsByTagName("feed").length > 0) { 26 | const root = doc.getElementsByTagName("feed")[0]; 27 | if (root.getElementsByTagName("entry").length > 0) { 28 | result = this._parseAtomType(doc); 29 | } 30 | } 31 | return result; 32 | } catch (error) { 33 | console.error("RSS Parsing Error:", error); 34 | return null; 35 | } 36 | } 37 | private _parseAtomType(doc: Document) { 38 | try { 39 | const root = doc.getElementsByTagName("feed")[0]; 40 | const title = this._getDataByTagName(root, "title"); 41 | const subtitle = this._getDataByTagName(root, "subtitle"); 42 | const entries = root.getElementsByTagName("entry"); 43 | let mdText = `# ${title}\n`; 44 | if (subtitle) { 45 | mdText += `${subtitle}\n`; 46 | } 47 | for (let i = 0; i < entries.length; i++) { 48 | const entry = entries[i]; 49 | const entryTitle = this._getDataByTagName(entry, "title"); 50 | const entrySummary = this._getDataByTagName(entry, "summary"); 51 | const entryUpdated = this._getDataByTagName(entry, "updated"); 52 | const entryContent = this._getDataByTagName(entry, "content"); 53 | if (entryTitle) { 54 | mdText += `\n## 
${entryTitle}\n`; 55 | } 56 | if (entryUpdated) { 57 | mdText += `Updated on: ${entryUpdated}\n`; 58 | } 59 | if (entrySummary) { 60 | mdText += this._parseContent(entrySummary); 61 | } 62 | if (entryContent) { 63 | mdText += this._parseContent(entryContent); 64 | } 65 | } 66 | return { title: title, markdown: mdText, text_content: mdText }; 67 | } catch (error) { 68 | console.error("Atom Parsing Error:", error); 69 | return null; 70 | } 71 | } 72 | 73 | private _parseRssType(doc: Document) { 74 | try { 75 | const root = doc.getElementsByTagName("rss")[0]; 76 | const channel = root.getElementsByTagName("channel"); 77 | if (!channel || channel.length === 0) { 78 | return null; 79 | } 80 | const channelElement = channel[0]; 81 | const channelTitle = this._getDataByTagName(channelElement, "title"); 82 | const channelDescription = this._getDataByTagName(channelElement, "description"); 83 | const items = channelElement.getElementsByTagName("item"); 84 | let mdText = ""; 85 | if (channelTitle) { 86 | mdText = `# ${channelTitle}\n`; 87 | } 88 | if (channelDescription) { 89 | mdText += `${channelDescription}\n`; 90 | } 91 | for (let i = 0; i < items.length; i++) { 92 | const item = items[i]; 93 | const title = this._getDataByTagName(item, "title"); 94 | const description = this._getDataByTagName(item, "description"); 95 | const pubDate = this._getDataByTagName(item, "pubDate"); 96 | const content = this._getDataByTagName(item, "content:encoded"); 97 | if (title) { 98 | mdText += `\n## ${title}\n`; 99 | } 100 | if (pubDate) { 101 | mdText += `Published on: ${pubDate}\n`; 102 | } 103 | if (description) { 104 | mdText += this._parseContent(description); 105 | } 106 | if (content) { 107 | mdText += this._parseContent(content); 108 | } 109 | } 110 | return { title: channelTitle, markdown: mdText, text_content: mdText }; 111 | } catch (error) { 112 | console.error("RSS Parsing Error:", error); 113 | return null; 114 | } 115 | } 116 | 117 | private _parseContent(content: string) 
{ 118 | try { 119 | const dom = new JSDOM(content); 120 | const document = dom.window.document; 121 | return new CustomTurnDown().convert_soup(document); 122 | } catch (error) { 123 | console.warn("Parsing content error", error); 124 | return content; 125 | } 126 | } 127 | 128 | private _getDataByTagName(element: Element, tagName: string) { 129 | const nodes = element.getElementsByTagName(tagName); 130 | if (!nodes || nodes.length === 0) { 131 | return null; 132 | } 133 | const fc = nodes[0].firstChild; 134 | if (fc && fc.nodeValue) { 135 | return fc.nodeValue; 136 | } 137 | return null; 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/converters/youtube.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import { JSDOM } from "jsdom"; 3 | import { URL } from "url"; 4 | import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; 5 | 6 | export class YouTubeConverter implements DocumentConverter { 7 | async convert( 8 | source: string | Buffer, 9 | options: ConverterOptions = {} 10 | ): Promise { 11 | const fileExtension = options.file_extension || ""; 12 | if (![".html", ".htm"].includes(fileExtension.toLowerCase())) { 13 | return null; 14 | } 15 | const url = options.url || ""; 16 | if (!url.startsWith("https://www.youtube.com/watch?")) { 17 | return null; 18 | } 19 | try { 20 | const htmlContent = 21 | typeof source === "string" 22 | ? 
fs.readFileSync(source, { encoding: "utf-8" }) 23 | : source.toString("utf-8"); 24 | return this._convert(htmlContent, url, options); 25 | } catch (error) { 26 | console.error("YouTube Parsing Error:", error); 27 | return null; 28 | } 29 | } 30 | private async _convert( 31 | htmlContent: string, 32 | url: string, 33 | options: ConverterOptions 34 | ): Promise { 35 | const dom = new JSDOM(htmlContent); 36 | const doc = dom.window.document; 37 | 38 | const metadata: Record = { 39 | title: doc.title 40 | }; 41 | 42 | doc.querySelectorAll("meta").forEach((meta) => { 43 | for (const a of meta.attributes) { 44 | const attributeContent = meta.getAttribute("content"); 45 | if (["itemprop", "property", "name"].includes(a.name) && attributeContent) { 46 | // console.log({ 47 | // name: a.name, 48 | // value: a.value, 49 | // textContent: a.textContent, 50 | // attributeContent: meta.getAttribute("content") 51 | // }); 52 | metadata[a.value] = attributeContent; 53 | break; 54 | } 55 | } 56 | }); 57 | 58 | // We can also try to read the full description. 
This is more prone to breaking, since it reaches into the page implementation 59 | try { 60 | for (const script of doc.querySelectorAll("script")) { 61 | const content = script.textContent || ""; 62 | if (content.includes("ytInitialData")) { 63 | const lines = content.split(/\r?\n/); 64 | const objStart = lines[0].indexOf("{"); 65 | const objEnd = lines[0].lastIndexOf("}"); 66 | if (objStart >= 0 && objEnd >= 0) { 67 | const data = JSON.parse(lines[0].substring(objStart, objEnd + 1)); 68 | const attrdesc = this._findKey(data, "attributedDescriptionBodyText"); 69 | if (attrdesc) { 70 | metadata["description"] = attrdesc["content"]; 71 | } 72 | } 73 | break; 74 | } 75 | } 76 | } catch (e) { 77 | console.warn("Error while parsing Youtube description"); 78 | } 79 | let webpageText = "# YouTube\n"; 80 | const title = this._get(metadata, ["title", "og:title", "name"]); 81 | if (title) { 82 | webpageText += `\n## ${title}\n`; 83 | } 84 | let stats = ""; 85 | const views = this._get(metadata, ["interactionCount"]); 86 | if (views) { 87 | stats += `- **Views:** ${views}\n`; 88 | } 89 | const keywords = this._get(metadata, ["keywords"]); 90 | if (keywords) { 91 | stats += `- **Keywords:** ${keywords}\n`; 92 | } 93 | const runtime = this._get(metadata, ["duration"]); 94 | if (runtime) { 95 | stats += `- **Runtime:** ${runtime}\n`; 96 | } 97 | if (stats.length > 0) { 98 | webpageText += `\n### Video Metadata\n${stats}\n`; 99 | } 100 | const description = this._get(metadata, ["description", "og:description"]); 101 | if (description) { 102 | webpageText += `\n### Description\n${description}\n`; 103 | } 104 | if (options.enableYoutubeTranscript) { 105 | let transcriptText = ""; 106 | const parsedUrl = new URL(url); 107 | const params = parsedUrl.searchParams; 108 | const videoId = params.get("v"); 109 | let ytTranscript; 110 | try { 111 | ytTranscript = await import("youtube-transcript").then((mod) => mod.YoutubeTranscript); 112 | } catch (error) { 113 | console.warn( 114 | 
"Optional dependency 'youtube-transcript' is not installed. Run `npm install youtube-transcript` to enable this feature." 115 | ); 116 | return null; 117 | } 118 | if (videoId) { 119 | try { 120 | const youtubeTranscriptLanguage = options.youtubeTranscriptLanguage || "en"; 121 | const transcript = await ytTranscript.fetchTranscript(videoId, { 122 | lang: youtubeTranscriptLanguage 123 | }); 124 | transcriptText = transcript.map((part) => part.text).join(" "); 125 | } catch (error) { 126 | console.warn("Error while extracting the Youtube Transcript", error); 127 | } 128 | } 129 | if (transcriptText) { 130 | webpageText += `\n### Transcript\n${transcriptText}\n`; 131 | } 132 | } 133 | const finalTitle = title ? title : doc.title; 134 | return { title: finalTitle, markdown: webpageText, text_content: webpageText }; 135 | } 136 | private _get( 137 | metadata: Record, 138 | keys: string[], 139 | default_value?: string 140 | ): string | null { 141 | for (const k of keys) { 142 | if (metadata[k]) { 143 | return metadata[k]; 144 | } 145 | } 146 | return default_value || null; 147 | } 148 | private _findKey(json: any, key: string): any { 149 | if (Array.isArray(json)) { 150 | for (const elm of json) { 151 | const ret = this._findKey(elm, key); 152 | if (ret) { 153 | return ret; 154 | } 155 | } 156 | } else if (typeof json === "object" && json !== null) { 157 | for (const k in json) { 158 | if (k === key) { 159 | return json[k]; 160 | } else { 161 | const ret = this._findKey(json[k], key); 162 | if (ret) { 163 | return ret; 164 | } 165 | } 166 | } 167 | } 168 | return null; 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # markitdown-ts 2 | 3 | [![CI](https://github.com/dead8309/markitdown-ts/actions/workflows/ci.yml/badge.svg)](https://github.com/dead8309/markitdown/actions/workflows/ci.yml) 4 | 5 | `markitdown-ts` is a 
TypeScript library designed for converting various file formats to Markdown. It can process files from local paths, URLs, or directly from in-memory buffers, making it ideal for serverless and edge environments like Supabase Functions or Cloudflare Workers. 6 | 7 | It is a TypeScript implementation of the original `markitdown` [Python library](https://github.com/microsoft/markitdown), and is suitable for indexing, text analysis, and other applications that benefit from structured text. 8 | 9 | It supports: 10 | 11 | - [x] PDF 12 | - [x] Word (.docx) 13 | - [x] Excel (.xlsx) 14 | - [x] Images (EXIF metadata extraction and optional LLM-based description) 15 | - [x] Audio (EXIF metadata extraction only) 16 | - [x] HTML 17 | - [x] Text-based formats (plain text, .csv, .xml, .rss, .atom) 18 | - [x] Jupyter Notebooks (.ipynb) 19 | - [x] Bing Search Result Pages (SERP) 20 | - [x] ZIP files (recursively iterates over contents) 21 | - [ ] PowerPoint 22 | 23 | > [!NOTE] 24 | > 25 | > Speech recognition for the audio converter has not been implemented yet. I'm happy to accept contributions for this feature. 26 | 27 | ## Installation 28 | 29 | Install `markitdown-ts` using your preferred package manager: 30 | 31 | ```bash 32 | pnpm add markitdown-ts 33 | ``` 34 | 35 | ## Usage 36 | 37 | ### Basic Usage (from a File Path) 38 | 39 | The simplest way to use the library is by providing a local file path or a URL. 
40 | 41 | ```typescript 42 | import { MarkItDown } from "markitdown-ts"; 43 | 44 | const markitdown = new MarkItDown(); 45 | try { 46 | // Convert a local file 47 | const result = await markitdown.convert("path/to/your/file.pdf"); 48 | 49 | // Or convert from a URL instead: 50 | // const result = await markitdown.convert("https://arxiv.org/pdf/2308.08155v2.pdf"); 51 | 52 | if (result) { 53 | console.log(result.markdown); 54 | } 55 | } catch (error) { 56 | console.error("Conversion failed:", error); 57 | } 58 | ``` 59 | 60 | ### Advanced Usage (from Buffers, Blobs, or Responses) 61 | 62 | For use in serverless environments where you can't rely on a persistent filesystem, you can convert data directly from memory. 63 | 64 | > [!IMPORTANT] 65 | > 66 | > This is the recommended approach for environments like **Supabase Edge Functions**, **Cloudflare Workers**, or **AWS Lambda**. 67 | 68 | #### From a Buffer 69 | 70 | If you have your file content in a `Buffer`, use the `convertBuffer` method. You **must** provide the `file_extension` in the options so the library knows which converter to use. 71 | 72 | ```typescript 73 | import { MarkItDown } from "markitdown-ts"; 74 | import * as fs from "fs"; 75 | 76 | const markitdown = new MarkItDown(); 77 | try { 78 | const buffer = fs.readFileSync("path/to/your/file.docx"); 79 | const result = await markitdown.convertBuffer(buffer, { 80 | file_extension: ".docx" 81 | }); 82 | console.log(result?.markdown); 83 | } catch (error) { 84 | console.error("Conversion failed:", error); 85 | } 86 | ``` 87 | 88 | #### From a Response or Blob 89 | 90 | You can pass a standard `Response` object directly to the `convert` method. This is perfect for handling file uploads from a request body.
91 | 92 | ```typescript 93 | import { MarkItDown } from "markitdown-ts"; 94 | import * as fs from "fs"; 95 | const markitdown = new MarkItDown(); 96 | 97 | // Example: Simulating a file upload by creating a Blob and a Response 98 | const buffer = fs.readFileSync("path/to/archive.zip"); 99 | const blob = new Blob([buffer]); 100 | const response = new Response(blob, { 101 | headers: { "Content-Type": "application/zip" } 102 | }); 103 | 104 | try { 105 | const result = await markitdown.convert(response); 106 | console.log(result?.markdown); 107 | } catch (error) { 108 | console.error("Conversion failed:", error); 109 | } 110 | ``` 111 | 112 | ## YouTube Transcript Support 113 | 114 | When converting YouTube URLs, you can pass the `enableYoutubeTranscript` and `youtubeTranscriptLanguage` options to control transcript extraction. If `youtubeTranscriptLanguage` is not provided, it defaults to `"en"`. 115 | 116 | ```typescript 117 | const markitdown = new MarkItDown(); 118 | const result = await markitdown.convert("https://www.youtube.com/watch?v=V2qZ_lgxTzg", { 119 | enableYoutubeTranscript: true, 120 | youtubeTranscriptLanguage: "en" 121 | }); 122 | ``` 123 | 124 | ## LLM Image Description Support 125 | 126 | To enable LLM functionality, you need to configure a model and client in the `options` for the image converter. You can use `@ai-sdk/openai` to get an LLM client. 127 | 128 | ```typescript 129 | import { openai } from "@ai-sdk/openai"; 130 | 131 | const markitdown = new MarkItDown(); 132 | const result = await markitdown.convert("test.jpg", { 133 | llmModel: openai("gpt-4o-mini"), 134 | llmPrompt: "Write a detailed description of this image" 135 | }); 136 | ``` 137 | 138 | ## API 139 | 140 | The library exposes a `MarkItDown` class with two primary conversion methods. 141 | 142 | ```typescript 143 | class MarkItDown { 144 | /** 145 | * Converts a source from a file path, URL, or Response object.
146 | */ 147 | async convert(source: string | Response, options?: ConverterOptions): Promise<ConverterResult>; 148 | 149 | /** 150 | * Converts a source from an in-memory Buffer. 151 | */ 152 | async convertBuffer( 153 | source: Buffer, 154 | options: ConverterOptions & { file_extension: string } 155 | ): Promise<ConverterResult>; 156 | } 157 | 158 | export type ConverterResult = 159 | | { 160 | title: string | null; 161 | markdown: string; 162 | /** @deprecated Use `markdown` instead. */ 163 | text_content: string; 164 | } 165 | | null 166 | | undefined; 167 | 168 | export type ConverterOptions = { 169 | // Required when using convertBuffer 170 | file_extension?: string; 171 | 172 | // For URL-based converters (e.g., Wikipedia, Bing SERP) 173 | url?: string; 174 | 175 | // Provide a custom fetch implementation 176 | fetch?: typeof fetch; 177 | 178 | // YouTube-specific options 179 | enableYoutubeTranscript?: boolean; // Default: false 180 | youtubeTranscriptLanguage?: string; // Default: "en" 181 | 182 | // Image-specific LLM options 183 | llmModel?: LanguageModel; 184 | llmPrompt?: string; 185 | 186 | // Options for .docx conversion (passed to mammoth.js) 187 | styleMap?: string | Array<string>; 188 | 189 | // Options for .zip conversion 190 | cleanupExtracted?: boolean; // Default: true 191 | }; 192 | ``` 193 | 194 | ## Examples 195 | 196 | Check out the [examples](./examples) folder.
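## Custom Fetch Implementation

The `fetch` option in `ConverterOptions` accepts a custom fetch implementation, for example to attach authentication headers when converting private URLs. A minimal sketch of such a wrapper — the token, env var, and URL below are hypothetical placeholders, not part of the library:

```typescript
// Wrap the global fetch so every request carries an Authorization header.
// The returned function matches the `typeof fetch` shape expected by the
// `fetch` option of `convert`.
const withAuth = (token: string): typeof fetch => {
  return (input, init) =>
    fetch(input, {
      ...init,
      // A plain-object spread is used for brevity; if you pass a `Headers`
      // instance in `init`, merge it explicitly instead.
      headers: {
        ...(init?.headers as Record<string, string> | undefined),
        Authorization: `Bearer ${token}`
      }
    });
};

// Usage sketch (URL and env var are placeholders):
// const markitdown = new MarkItDown();
// const result = await markitdown.convert("https://example.com/private/report.pdf", {
//   fetch: withAuth(process.env.API_TOKEN ?? "")
// });
```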
197 | 198 | ## License 199 | 200 | MIT License © 2024 [Vaibhav Raj](https://github.com/dead8309) 201 | -------------------------------------------------------------------------------- /src/markitdown.ts: -------------------------------------------------------------------------------- 1 | import * as mime from "mime-types"; 2 | import path from "path"; 3 | import * as fs from "fs"; 4 | import { ConverterOptions, DocumentConverter, ConverterResult } from "./types"; 5 | import { PlainTextConverter } from "./converters/plain-text"; 6 | import { HtmlConverter } from "./converters/html"; 7 | import { RSSConverter } from "./converters/xml-rss-atom"; 8 | import { WikipediaConverter } from "./converters/wikipedia"; 9 | import { YouTubeConverter } from "./converters/youtube"; 10 | import { IpynbConverter } from "./converters/ipynb"; 11 | import { BingSerpConverter } from "./converters/bingserp"; 12 | import { PdfConverter } from "./converters/pdf"; 13 | import { DocxConverter } from "./converters/docx"; 14 | import { XlsxConverter } from "./converters/xlsx"; 15 | import { WavConverter } from "./converters/wav"; 16 | import { Mp3Converter } from "./converters/mp3"; 17 | import { ImageConverter } from "./converters/image"; 18 | import { ZipConverter } from "./converters/zip"; 19 | 20 | export class MarkItDown { 21 | private readonly converters: Array<DocumentConverter> = []; 22 | 23 | constructor() { 24 | this.register_converter(new PlainTextConverter()); 25 | this.register_converter(new HtmlConverter()); 26 | this.register_converter(new RSSConverter()); 27 | this.register_converter(new WikipediaConverter()); 28 | this.register_converter(new YouTubeConverter()); 29 | this.register_converter(new BingSerpConverter()); 30 | this.register_converter(new DocxConverter()); 31 | this.register_converter(new XlsxConverter()); 32 | this.register_converter(new WavConverter()); 33 | this.register_converter(new Mp3Converter()); 34 | this.register_converter(new ImageConverter()); 35 |
this.register_converter(new IpynbConverter()); 36 | this.register_converter(new PdfConverter()); 37 | this.register_converter(new ZipConverter()); 38 | } 39 | 40 | /** 41 | * Converts a source from a file path, URL, or Response object. 42 | */ 43 | async convert( 44 | source: string | Response, 45 | options: ConverterOptions = {} 46 | ): Promise<ConverterResult> { 47 | if (source instanceof Response) { 48 | return await this.convert_response(source, options); 49 | } else { 50 | if ( 51 | source.startsWith("http://") || 52 | source.startsWith("https://") || 53 | source.startsWith("file://") 54 | ) { 55 | return await this.convert_url(source, options); 56 | } else { 57 | return this.convert_local(source, options); 58 | } 59 | } 60 | } 61 | 62 | /** 63 | * Converts a source from an in-memory Buffer. 64 | */ 65 | async convertBuffer( 66 | source: Buffer, 67 | options: ConverterOptions & { file_extension: string } 68 | ): Promise<ConverterResult> { 69 | const extensions = new Set([options.file_extension]); 70 | return this._convert(source, extensions, options); 71 | } 72 | 73 | private async convert_url( 74 | source: string, 75 | { fetch = globalThis.fetch, ...options }: ConverterOptions 76 | ): Promise<ConverterResult> { 77 | const response = await fetch(source); 78 | if (!response.ok) { 79 | throw new Error(`Failed to fetch URL: ${source}, status: ${response.status}`); 80 | } 81 | 82 | return await this.convert_response(response, options); 83 | } 84 | 85 | private async convert_response( 86 | response: Response, 87 | options: ConverterOptions 88 | ): Promise<ConverterResult> { 89 | const ext = options.file_extension; 90 | const extensions = ext ? new Set([ext]) : new Set(); 91 | const contentType = response.headers?.get("content-type")?.split(";")[0]; 92 | if (!contentType) { 93 | throw new Error("Response Content-Type header is missing"); 94 | } 95 | 96 | const mimeExtension = mime.extension(contentType); 97 | if (mimeExtension) { 98 | //NOTE: .
was missing from the start of the string, which led the YouTube 99 | // test to fail as it was not able to find the correct extension, i.e. `.html` 100 | extensions.add(`.${mimeExtension}`); 101 | } 102 | 103 | const content_disposition = response.headers?.get("content-disposition") || ""; 104 | const fname = content_disposition.match(/filename="([^;]+)"/); 105 | if (fname) { 106 | extensions.add(path.extname(fname[1])); 107 | } 108 | 109 | if (response.url) { 110 | const url_ext = path.extname(new URL(response.url).pathname); 111 | extensions.add(url_ext); 112 | } 113 | 114 | if (extensions.size === 0) { 115 | throw new Error( 116 | "Could not determine file type. Please provide a `file_extension` in the options." 117 | ); 118 | } 119 | 120 | if (response.body == null) { 121 | throw new Error("Response body is empty"); 122 | } 123 | 124 | const buffer = Buffer.from(await response.arrayBuffer()); 125 | return await this._convert(buffer, extensions, { 126 | ...options, 127 | url: response.url 128 | }); 129 | } 130 | 131 | private async convert_local(source: string, options: ConverterOptions): Promise<ConverterResult> { 132 | const ext = options.file_extension; 133 | const extensions = ext ?
new Set([ext]) : new Set(); 134 | if (!fs.existsSync(source)) { 135 | throw new Error(`File not found: ${source}`); 136 | } 137 | 138 | const extname = path.extname(source); 139 | if (extname === "") { 140 | throw new Error(`File extension not found: ${source}`); 141 | } 142 | 143 | if (!extensions.has(extname)) { 144 | extensions.add(extname); 145 | } 146 | 147 | return await this._convert(source, extensions, options); 148 | } 149 | 150 | private async _convert( 151 | source: string | Buffer, 152 | extensions: Set<string>, 153 | options: any = {} 154 | ): Promise<ConverterResult> { 155 | let error; 156 | 157 | for (const ext of extensions) { 158 | for (const converter of this.converters) { 159 | let res; 160 | try { 161 | const op: ConverterOptions = { 162 | ...options, 163 | file_extension: ext, 164 | _parent_converters: this.converters 165 | }; 166 | res = await converter.convert(source, op); 167 | } catch (e) { 168 | error = e; 169 | } 170 | 171 | if (res != null) { 172 | res.markdown = res.markdown.replace(/(?:\r\n|\r|\n)/g, "\n").trim(); 173 | res.markdown = res.markdown.replace(/\n{3,}/g, "\n\n"); 174 | 175 | return res; 176 | } 177 | } 178 | } 179 | 180 | if (error) { 181 | throw new Error( 182 | `Could not convert ${source} to markdown. While converting the following error occurred: ${error}` 183 | ); 184 | } 185 | throw new Error( 186 | `Could not convert ${source} to markdown format.
The ${Array.from(extensions).join( 187 | ", " 188 | )} are not supported.` 189 | ); 190 | } 191 | 192 | // NOTE: Inserts the converter at the beginning of the list 193 | private register_converter(converter: DocumentConverter) { 194 | this.converters.unshift(converter); 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /test/index.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, it, expect } from "vitest"; 2 | import { MarkItDown } from "../src/markitdown"; 3 | import * as path from "path"; 4 | import * as fs from "fs"; 5 | import isCi from "is-ci"; 6 | import { openai } from "@ai-sdk/openai"; 7 | import { 8 | PLAIN_TEST, 9 | BLOG_TEST_URL, 10 | BLOG_TEST_STRINGS, 11 | RSS_TEST_STRINGS, 12 | WIKIPEDIA_TEST_URL, 13 | WIKIPEDIA_TEST_STRINGS, 14 | WIKIPEDIA_TEST_EXCLUDES, 15 | YOUTUBE_TEST_URL, 16 | YOUTUBE_TEST_STRINGS, 17 | IPYNB_TEST_STRINGS, 18 | SERP_TEST_URL, 19 | SERP_TEST_STRINGS, 20 | SERP_TEST_EXCLUDES, 21 | PDF_TEST_URL, 22 | PDF_TEST_STRINGS, 23 | DOCX_TEST_STRINGS, 24 | DOCX_COMMENT_TEST_STRINGS, 25 | XLSX_TEST_STRINGS, 26 | WAV_TEST_STRINGS, 27 | JPG_TEST_EXIFTOOL, 28 | LLM_TEST_STRINGS 29 | } from "./test.data"; 30 | 31 | describe("MarkItDown Tests", () => { 32 | describe("Plain Text Converter", () => { 33 | it("should convert plain text", async () => { 34 | const markitdown = new MarkItDown(); 35 | const result = await markitdown.convert(path.join(__dirname, "__files/test.txt")); 36 | expect(result).toBeTruthy(); 37 | const textContent = result?.markdown.replace("\\", ""); 38 | expect(result?.title).toBeNull(); 39 | for (const testStr of PLAIN_TEST) { 40 | expect(textContent).toContain(testStr); 41 | } 42 | }); 43 | }); 44 | describe("HTML Converter", () => { 45 | it("should convert HTML to markdown", async () => { 46 | const markitdown = new MarkItDown(); 47 | const result = await markitdown.convert(path.join(__dirname, 
"__files/test_blog.html"), { 48 | url: BLOG_TEST_URL 49 | }); 50 | expect(result).not.toBeNull(); 51 | expect(result).not.toBeUndefined(); 52 | const textContent = result?.markdown.replace("\\", ""); 53 | for (const testString of BLOG_TEST_STRINGS) { 54 | expect(textContent).toContain(testString); 55 | } 56 | }); 57 | }); 58 | 59 | describe("RSS Converter", () => { 60 | it("should convert RSS to markdown", async () => { 61 | const markitdown = new MarkItDown(); 62 | const result = await markitdown.convert(path.join(__dirname, "__files/test_rss.xml")); 63 | expect(result).not.toBeNull(); 64 | expect(result).not.toBeUndefined(); 65 | const textContent = result?.markdown.replace("\\", ""); 66 | for (const testString of RSS_TEST_STRINGS) { 67 | expect(textContent).toContain(testString); 68 | } 69 | }); 70 | }); 71 | 72 | describe("Wikipedia Converter", () => { 73 | it("should convert Wikipedia to markdown", async () => { 74 | const markitdown = new MarkItDown(); 75 | const result = await markitdown.convert(path.join(__dirname, "__files/test_wikipedia.html"), { 76 | url: WIKIPEDIA_TEST_URL 77 | }); 78 | expect(result).not.toBeNull(); 79 | expect(result).not.toBeUndefined(); 80 | const textContent = result?.markdown.replace("\\", ""); 81 | for (const testString of WIKIPEDIA_TEST_EXCLUDES) { 82 | expect(textContent).not.toContain(testString); 83 | } 84 | for (const testString of WIKIPEDIA_TEST_STRINGS) { 85 | expect(textContent).toContain(testString); 86 | } 87 | }); 88 | }); 89 | 90 | if (!isCi) { 91 | describe("Youtube Converter", () => { 92 | it("should convert YouTube to markdown with transcript", async () => { 93 | const markitdown = new MarkItDown(); 94 | const result = await markitdown.convert(YOUTUBE_TEST_URL, { 95 | enableYoutubeTranscript: true 96 | }); 97 | expect(result).not.toBeNull(); 98 | expect(result).not.toBeUndefined(); 99 | const textContent = result?.markdown.replace("\\", ""); 100 | for (const testString of YOUTUBE_TEST_STRINGS) { 101 | 
expect(textContent).toContain(testString); 102 | } 103 | }, 30000); 104 | }); 105 | } 106 | 107 | describe("IPYNB Converter", () => { 108 | it("should convert .ipynb to markdown", async () => { 109 | const markitdown = new MarkItDown(); 110 | const result = await markitdown.convert(path.join(__dirname, "__files/test_notebook.ipynb")); 111 | expect(result).not.toBeNull(); 112 | expect(result).not.toBeUndefined(); 113 | const textContent = result?.markdown.replace("\\", ""); 114 | for (const testString of IPYNB_TEST_STRINGS) { 115 | expect(textContent).toContain(testString); 116 | } 117 | }); 118 | }); 119 | describe("BingSerp Converter", () => { 120 | it("should convert Bing SERP to markdown", async () => { 121 | const markitdown = new MarkItDown(); 122 | const result = await markitdown.convert(path.join(__dirname, "__files/test_serp.html"), { 123 | url: SERP_TEST_URL 124 | }); 125 | expect(result).not.toBeNull(); 126 | expect(result).not.toBeUndefined(); 127 | const textContent = result?.markdown.replace("\\", ""); 128 | for (const testString of SERP_TEST_EXCLUDES) { 129 | expect(textContent).not.toContain(testString); 130 | } 131 | for (const testString of SERP_TEST_STRINGS) { 132 | expect(textContent).toContain(testString); 133 | } 134 | }); 135 | }); 136 | describe("PDF Converter", () => { 137 | it("should convert PDF to text", async () => { 138 | const markitdown = new MarkItDown(); 139 | const result = await markitdown.convert(path.join(__dirname, "__files/test.pdf"), { 140 | url: PDF_TEST_URL 141 | }); 142 | expect(result).not.toBeNull(); 143 | expect(result).not.toBeUndefined(); 144 | const textContent = result?.markdown.replace("\\", ""); 145 | for (const testString of PDF_TEST_STRINGS) { 146 | expect(textContent).toContain(testString); 147 | } 148 | }); 149 | }); 150 | describe("DOCX Converter", () => { 151 | it("should convert .docx to markdown", async () => { 152 | const markitdown = new MarkItDown(); 153 | const result = await 
markitdown.convert(path.join(__dirname, "__files/test.docx")); 154 | expect(result).not.toBeNull(); 155 | expect(result).not.toBeUndefined(); 156 | const textContent = result?.markdown.replace("\\", ""); 157 | for (const testString of DOCX_TEST_STRINGS) { 158 | expect(textContent).toContain(testString); 159 | } 160 | }); 161 | it("should convert .docx to markdown with comments", async () => { 162 | const markitdown = new MarkItDown(); 163 | const result = await markitdown.convert( 164 | path.join(__dirname, "__files/test_with_comment.docx"), 165 | { styleMap: "comment-reference => " } 166 | ); 167 | expect(result).not.toBeNull(); 168 | expect(result).not.toBeUndefined(); 169 | const textContent = result?.markdown.replace("\\", ""); 170 | for (const testString of DOCX_COMMENT_TEST_STRINGS) { 171 | expect(textContent).toContain(testString); 172 | } 173 | }); 174 | }); 175 | describe("XLSX Converter", () => { 176 | it("should convert .xlsx to markdown", async () => { 177 | const markitdown = new MarkItDown(); 178 | const result = await markitdown.convert(path.join(__dirname, "__files/test.xlsx")); 179 | expect(result).not.toBeNull(); 180 | expect(result).not.toBeUndefined(); 181 | const textContent = result?.markdown.replace("\\", ""); 182 | for (const testString of XLSX_TEST_STRINGS) { 183 | expect(textContent).toContain(testString); 184 | } 185 | }); 186 | }); 187 | 188 | describe("WAV Converter", () => { 189 | it("should convert .wav metadata to markdown", async () => { 190 | const markitdown = new MarkItDown(); 191 | const result = await markitdown.convert(path.join(__dirname, "__files/test.wav")); 192 | expect(result).not.toBeNull(); 193 | expect(result).not.toBeUndefined(); 194 | const textContent = result?.markdown.replace("\\", ""); 195 | for (const testString of WAV_TEST_STRINGS) { 196 | expect(textContent).toContain(testString); 197 | } 198 | }); 199 | }); 200 | 201 | describe("Image Converter", () => { 202 | it("should process .jpg metadata", async () => { 
203 | const markitdown = new MarkItDown(); 204 | const result = await markitdown.convert(path.join(__dirname, "__files/test.jpg")); 205 | expect(result).not.toBeNull(); 206 | expect(result).not.toBeUndefined(); 207 | const textContent = result?.markdown.replace("\\", ""); 208 | Object.entries(JPG_TEST_EXIFTOOL).forEach(([key, value]) => { 209 | const target = `${key}: ${value}`; 210 | expect(textContent).toContain(target); 211 | }); 212 | }); 213 | 214 | if (process.env.OPENAI_API_KEY && !isCi) { 215 | it("should process .jpg metadata with ai", { timeout: 30000 }, async () => { 216 | const markitdown = new MarkItDown(); 217 | const result = await markitdown.convert(path.join(__dirname, "__files/test.jpg"), { 218 | llmModel: openai("gpt-4o-mini") 219 | }); 220 | expect(result).not.toBeNull(); 221 | expect(result).not.toBeUndefined(); 222 | const textContent = result?.markdown.replace("\\", ""); 223 | Object.entries(JPG_TEST_EXIFTOOL).forEach(([key, value]) => { 224 | const target = `${key}: ${value}`; 225 | expect(textContent).toContain(target); 226 | }); 227 | }); 228 | } 229 | 230 | if (process.env.OPENAI_API_KEY && !isCi) { 231 | it("should process colors, texts in images with llm", { timeout: 30000 }, async () => { 232 | const markitdown = new MarkItDown(); 233 | const result = await markitdown.convert(path.join(__dirname, "__files/test_llm.jpg"), { 234 | llmModel: openai("gpt-4o-mini") 235 | }); 236 | expect(result).not.toBeNull(); 237 | expect(result).not.toBeUndefined(); 238 | const textContent = result?.markdown.replace("\\", ""); 239 | for (const testString of LLM_TEST_STRINGS) { 240 | expect(textContent).toContain(testString); 241 | } 242 | for (const testString of ["red", "circle", "blue", "square"]) { 243 | expect(textContent?.toLowerCase()).toContain(testString.toLowerCase()); 244 | } 245 | }); 246 | } 247 | }); 248 | 249 | describe("Zip Converter", () => { 250 | it("should convert .zip file contents to markdown", async () => { 251 | const markitdown = 
new MarkItDown(); 252 | const result = await markitdown.convert(path.join(__dirname, "__files/test_files.zip")); 253 | expect(result).not.toBeNull(); 254 | expect(result).not.toBeUndefined(); 255 | const textContent = result?.markdown.replace("\\", ""); 256 | for (const testString of DOCX_TEST_STRINGS) { 257 | expect(textContent).toContain(testString); 258 | } 259 | }); 260 | }); 261 | 262 | describe("Buffer Conversion", () => { 263 | it("should correctly convert a .zip file passed as a buffer", async () => { 264 | const zipFilePath = path.join(__dirname, "__files/test_files.zip"); 265 | const buffer = fs.readFileSync(zipFilePath); 266 | const markitdown = new MarkItDown(); 267 | const result = await markitdown.convertBuffer(buffer, { 268 | file_extension: ".zip" // NOTE: this is required for buffer conversions 269 | }); 270 | 271 | expect(result).not.toBeNull(); 272 | expect(result).not.toBeUndefined(); 273 | const textContent = result?.markdown.replace("\\", ""); 274 | 275 | expect(textContent).toContain("File: test.docx"); 276 | for (const testString of DOCX_TEST_STRINGS) { 277 | expect(textContent).toContain(testString); 278 | } 279 | }); 280 | }); 281 | 282 | describe("Blob Conversion", () => { 283 | it("should correctly convert a file passed as a Blob via a Response", async () => { 284 | const zipFilePath = path.join(__dirname, "__files/test_files.zip"); 285 | const buffer = fs.readFileSync(zipFilePath); 286 | const blob = new Blob([buffer]); 287 | 288 | const response = new Response(blob, { 289 | headers: { 290 | "Content-Type": "application/zip" 291 | } 292 | }); 293 | 294 | const markitdown = new MarkItDown(); 295 | const result = await markitdown.convert(response); 296 | 297 | expect(result).not.toBeNull(); 298 | expect(result).not.toBeUndefined(); 299 | const textContent = result?.markdown.replace("\\", ""); 300 | 301 | for (const testString of DOCX_TEST_STRINGS) { 302 | expect(textContent).toContain(testString); 303 | } 304 | }); 305 | }); 306 | }); 307 
| -------------------------------------------------------------------------------- /test/__files/test_blog.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |

Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH

· 6 min read
Chi Wang

level 2 algebra

21 |

TL;DR:

22 |
    23 |
  • Just by tuning the inference parameters like model, number of responses, temperature etc. without changing any model weights or prompt, the baseline accuracy of untuned gpt-4 can be improved by 20% in high school math competition problems.
  • 24 |
  • For easy problems, the tuned gpt-3.5-turbo model vastly outperformed untuned gpt-4 in accuracy (e.g., 90% vs. 70%) and cost efficiency. For hard problems, the tuned gpt-4 is much more accurate (e.g., 35% vs. 20%) and less expensive than untuned gpt-4.
  • 25 |
  • AutoGen can help with model selection, parameter tuning, and cost-saving in LLM applications.
  • 26 |
27 |

Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?

28 |

In this blog post, we will explore how model and inference parameter matter in LLM applications, using a case study for MATH, a benchmark for evaluating LLMs on advanced mathematical problem solving. MATH consists of 12K math competition problems from AMC-10, AMC-12 and AIME. Each problem is accompanied by a step-by-step solution.

29 |

We will use AutoGen to automatically find the best model and inference parameter for LLMs on a given task and dataset given an inference budget, using a novel low-cost search & pruning strategy. AutoGen currently supports all the LLMs from OpenAI, such as GPT-3.5 and GPT-4.

30 |

We will use AutoGen to perform model selection and inference parameter tuning. Then we compare the performance and inference cost on solving algebra problems with the untuned gpt-4. We will also analyze how different difficulty levels affect the results.

31 |

Experiment Setup

32 |

We use AutoGen to select between the following models with a target inference budget $0.02 per instance:

33 |
    34 |
  • gpt-3.5-turbo, a relatively cheap model that powers the popular ChatGPT app
  • 35 |
  • gpt-4, the state of the art LLM that costs more than 10 times of gpt-3.5-turbo
  • 36 |
37 |

We adapt the models using 20 examples in the train set, using the problem statement as the input and generating the solution as the output. We use the following inference parameters:

38 |
    39 |
  • temperature: The parameter that controls the randomness of the output text. A higher temperature means more diversity but less coherence. We search for the optimal temperature in the range of [0, 1].
  • 40 |
  • top_p: The parameter that controls the probability mass of the output tokens. Only tokens with a cumulative probability less than or equal to top-p are considered. A lower top-p means more diversity but less coherence. We search for the optimal top-p in the range of [0, 1].
  • 41 |
  • max_tokens: The maximum number of tokens that can be generated for each output. We search for the optimal max length in the range of [50, 1000].
  • 42 |
  • n: The number of responses to generate. We search for the optimal n in the range of [1, 100].
  • 43 |
  • prompt: We use the template: "{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \boxed{{}}." where {problem} will be replaced by the math problem instance.
  • 44 |
45 |

In this experiment, when n > 1, we find the answer with highest votes among all the responses and then select it as the final answer to compare with the ground truth. For example, if n = 5 and 3 of the responses contain a final answer 301 while 2 of the responses contain a final answer 159, we choose 301 as the final answer. This can help with resolving potential errors due to randomness. We use the average accuracy and average inference cost as the metric to evaluate the performance over a dataset. The inference cost of a particular instance is measured by the price per 1K tokens and the number of tokens consumed.

46 |

Experiment Results

47 |

The first figure in this blog post shows the average accuracy and average inference cost of each configuration on the level 2 Algebra test set.

48 |

Surprisingly, the tuned gpt-3.5-turbo model is selected as a better model and it vastly outperforms untuned gpt-4 in accuracy (92% vs. 70%) with equal or 2.5 times higher inference budget. 49 | The same observation can be obtained on the level 3 Algebra test set.

50 |

level 3 algebra

51 |

However, the selected model changes on level 4 Algebra.

52 |

level 4 algebra

53 |

This time gpt-4 is selected as the best model. The tuned gpt-4 achieves much higher accuracy (56% vs. 44%) and lower cost than the untuned gpt-4. 54 | On level 5 the result is similar.

55 |

level 5 algebra

56 |

We can see that AutoGen has found different optimal model and inference parameters for each subset of a particular level, which shows that these parameters matter in cost-sensitive LLM applications and need to be carefully tuned or adapted.

57 |

An example notebook to run these experiments can be found at: https://github.com/microsoft/FLAML/blob/v1.2.1/notebook/autogen_chatgpt.ipynb. The experiments were run when AutoGen was a subpackage in FLAML.

58 |

Analysis and Discussion

59 |

While gpt-3.5-turbo demonstrates competitive accuracy with voted answers in relatively easy algebra problems under the same inference budget, gpt-4 is a better choice for the most difficult problems. In general, through parameter tuning and model selection, we can identify the opportunity to save the expensive model for more challenging tasks, and improve the overall effectiveness of a budget-constrained system.

60 |

There are many other alternative ways of solving math problems, which we have not covered in this blog post. When there are choices beyond the inference parameters, they can be generally tuned via flaml.tune.

61 |

The need for model selection, parameter tuning and cost saving is not specific to the math problems. The Auto-GPT project is an example where high cost can easily prevent a generic complex task to be accomplished as it needs many LLM inference calls.

62 |

For Further Reading

63 | 67 |

Do you have any experience to share about LLM applications? Do you like to see more support or research of LLM optimization or automation? Please join our Discord server for discussion.

68 | 69 | 70 | -------------------------------------------------------------------------------- /examples/html-conversion/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |

Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH

· 6 min read
Chi Wang

level 2 algebra

21 |

TL;DR:

22 |
    23 |
  • Just by tuning the inference parameters like model, number of responses, temperature etc. without changing any model weights or prompt, the baseline accuracy of untuned gpt-4 can be improved by 20% in high school math competition problems.
  • 24 |
  • For easy problems, the tuned gpt-3.5-turbo model vastly outperformed untuned gpt-4 in accuracy (e.g., 90% vs. 70%) and cost efficiency. For hard problems, the tuned gpt-4 is much more accurate (e.g., 35% vs. 20%) and less expensive than untuned gpt-4.
  • 25 |
  • AutoGen can help with model selection, parameter tuning, and cost-saving in LLM applications.
  • 26 |
27 |

Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?

28 |

In this blog post, we will explore how model and inference parameter matter in LLM applications, using a case study for MATH, a benchmark for evaluating LLMs on advanced mathematical problem solving. MATH consists of 12K math competition problems from AMC-10, AMC-12 and AIME. Each problem is accompanied by a step-by-step solution.

We will use AutoGen to automatically find the best model and inference parameters for LLMs on a given task and dataset under an inference budget, using a novel low-cost search and pruning strategy. AutoGen currently supports all the LLMs from OpenAI, such as GPT-3.5 and GPT-4.

We will use AutoGen to perform model selection and inference parameter tuning, then compare the performance and inference cost of the tuned configurations against untuned gpt-4 on solving algebra problems. We will also analyze how different difficulty levels affect the results.
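
The tuning loop can be caricatured as random search with budget-based pruning. This is a deliberately simplified sketch: `tune_config` and `evaluate` are illustrative names, not AutoGen's API, and AutoGen's actual search strategy is much smarter than plain random search.

```python
import random

def tune_config(search_space, evaluate, inference_budget, num_samples=24, seed=41):
    """Random-search caricature of budget-constrained tuning.

    `evaluate` stands in for running a configuration on validation
    problems; it returns (accuracy, cost-per-instance in dollars).
    """
    rng = random.Random(seed)
    best = None
    for _ in range(num_samples):
        # Sample one configuration: pick from lists, draw floats from ranges.
        config = {
            k: rng.choice(v) if isinstance(v, list) else rng.uniform(*v)
            for k, v in search_space.items()
        }
        accuracy, cost = evaluate(config)
        if cost > inference_budget:  # prune configurations over the budget
            continue
        if best is None or accuracy > best[0]:
            best = (accuracy, config)
    return best  # (accuracy, config) of the best affordable configuration

# Toy usage with a stub evaluator: gpt-4 scores higher but busts the budget,
# so the cheaper model wins under an inference budget of $0.02 per instance.
stub = lambda c: (0.9, 0.06) if c["model"] == "gpt-4" else (0.7, 0.01)
print(tune_config({"model": ["gpt-3.5-turbo", "gpt-4"]}, stub, inference_budget=0.02))
```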

Experiment Setup

We use AutoGen to select between the following models, with a target inference budget of $0.02 per instance:

  • gpt-3.5-turbo, a relatively cheap model that powers the popular ChatGPT app
  • gpt-4, the state-of-the-art LLM that costs more than 10 times as much as gpt-3.5-turbo

We adapt the models using 20 examples in the train set, with the problem statement as the input and the generated solution as the output. We search over the following inference parameters:

  • temperature: Controls the randomness of the output text. A higher temperature means more diversity but less coherence. We search for the optimal temperature in the range [0, 1].
  • top_p: Controls the probability mass of the output tokens. Only tokens with a cumulative probability less than or equal to top_p are considered. A higher top_p means more diversity but less coherence. We search for the optimal top_p in the range [0, 1].
  • max_tokens: The maximum number of tokens that can be generated for each output. We search for the optimal max length in the range [50, 1000].
  • n: The number of responses to generate. We search for the optimal n in the range [1, 100].
  • prompt: We use the template "{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \boxed{{}}.", where {problem} will be replaced by the math problem instance.
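
Written out as plain Python, the search space above might look like the following. This is a sketch only: the ranges are exactly those listed above, but the dict format itself is illustrative, not AutoGen's actual configuration schema (AutoGen/FLAML represent these with their own tuning domain objects).

```python
# Illustrative search-space description; ranges taken from the list above.
search_space = {
    "model": ["gpt-3.5-turbo", "gpt-4"],
    "temperature": (0.0, 1.0),   # randomness of the output
    "top_p": (0.0, 1.0),         # probability mass of candidate tokens
    "max_tokens": (50, 1000),    # maximum output length
    "n": (1, 100),               # number of responses to vote over
    "prompt": (
        "{problem} Solve the problem carefully. "
        "Simplify your answer as much as possible. "
        "Put the final answer in \\boxed{{}}."
    ),
}
print(sorted(search_space))
# → ['max_tokens', 'model', 'n', 'prompt', 'temperature', 'top_p']
```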

In this experiment, when n > 1, we find the answer with the highest number of votes among all the responses and select it as the final answer to compare with the ground truth. For example, if n = 5 and 3 of the responses contain the final answer 301 while 2 of the responses contain the final answer 159, we choose 301 as the final answer. This helps resolve potential errors due to randomness. We use the average accuracy and average inference cost as the metrics to evaluate performance over a dataset. The inference cost of a particular instance is measured by the price per 1K tokens and the number of tokens consumed.
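
The voting scheme and cost metric just described can be sketched in a few lines. `majority_vote` and `inference_cost` are illustrative helpers, not part of AutoGen:

```python
from collections import Counter

def majority_vote(final_answers):
    """Pick the final answer that appears most often among the n responses."""
    return Counter(final_answers).most_common(1)[0][0]

def inference_cost(tokens_consumed, price_per_1k_tokens):
    """Cost of one instance: price per 1K tokens times tokens consumed."""
    return price_per_1k_tokens * tokens_consumed / 1000

# The example from the text: n = 5, three votes for 301 and two for 159.
print(majority_vote(["301", "301", "159", "301", "159"]))  # → 301
```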

Experiment Results

The first figure in this blog post shows the average accuracy and average inference cost of each configuration on the level 2 Algebra test set.

Surprisingly, the tuned gpt-3.5-turbo model is selected as the better model and it vastly outperforms untuned gpt-4 in accuracy (92% vs. 70%) with an equal or 2.5 times higher inference budget. The same observation can be made on the level 3 Algebra test set.

[Figure: level 3 algebra results]

However, the selected model changes on level 4 Algebra.

[Figure: level 4 algebra results]

This time gpt-4 is selected as the best model. The tuned gpt-4 achieves much higher accuracy (56% vs. 44%) and lower cost than the untuned gpt-4. On level 5 the result is similar.

[Figure: level 5 algebra results]

We can see that AutoGen has found a different optimal model and inference parameters for each difficulty-level subset, which shows that these choices matter in cost-sensitive LLM applications and need to be carefully tuned or adapted.

An example notebook to run these experiments can be found at: https://github.com/microsoft/FLAML/blob/v1.2.1/notebook/autogen_chatgpt.ipynb. The experiments were run when AutoGen was a subpackage in FLAML.

Analysis and Discussion

While gpt-3.5-turbo demonstrates competitive accuracy with voted answers in relatively easy algebra problems under the same inference budget, gpt-4 is a better choice for the most difficult problems. In general, through parameter tuning and model selection, we can identify the opportunity to save the expensive model for more challenging tasks, and improve the overall effectiveness of a budget-constrained system.
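
One way to act on this observation is a simple difficulty-based router that reserves the expensive model for the hard levels. The thresholds and prices below are illustrative, chosen only to mirror the per-level results in this post:

```python
# Illustrative per-1K-token prices; real prices vary by model and version.
PRICE_PER_1K = {"gpt-3.5-turbo": 0.002, "gpt-4": 0.06}

def pick_model(difficulty_level):
    """Route levels 1-3 to the cheap model and levels 4-5 to gpt-4,
    mirroring which model was selected per level in the experiments above."""
    return "gpt-3.5-turbo" if difficulty_level <= 3 else "gpt-4"

print(pick_model(2), pick_model(5))  # → gpt-3.5-turbo gpt-4
```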

There are many other alternative ways of solving math problems that we have not covered in this blog post. When there are choices beyond the inference parameters, they can generally be tuned via flaml.tune.

The need for model selection, parameter tuning, and cost saving is not specific to math problems. The Auto-GPT project is an example where high cost can easily prevent a generic complex task from being accomplished, since it requires many LLM inference calls.

For Further Reading

Do you have any experience to share about LLM applications? Would you like to see more support for or research on LLM optimization and automation? Please join our Discord server for discussion.
