├── .env.example
├── .gitignore
├── package.json
├── LICENSE
├── train.js
├── combine.js
├── splitXML.js
├── generate-training.js
└── README.md

/.env.example:
--------------------------------------------------------------------------------
OPENAI_API_KEY=
REPLICATE_API_TOKEN=
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
logs
*.log
node_modules/
.env
.env.test
*.7z
*.xml
pages/
training_data/
*.DS_Store
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "wiki-xml-parser",
  "version": "1.0.0",
  "description": "Split out a MediaWiki XML file into files",
  "main": "index.js",
  "type": "module",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/fofr/wiki-xml-parser.git"
  },
  "author": "fofr",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/fofr/wiki-xml-parser/issues"
  },
  "homepage": "https://github.com/fofr/wiki-xml-parser#readme",
  "dependencies": {
    "chatgpt": "^5.2.2",
    "dotenv": "^16.0.3",
    "fs-extra": "^11.1.1",
    "replicate": "^0.9.1",
    "xml2js": "^0.4.23"
  }
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 fofr

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/train.js:
--------------------------------------------------------------------------------
import Replicate from 'replicate'
import * as dotenv from 'dotenv'
dotenv.config()

// Available LLMs: [model name, version id]
const LLMs = {
  llama: ['llama-7b', '455d66312a66299fba685548fe24f66880f093007b927abd19f4356295f8577c'],
  gpt: ['gpt-j-6b', 'b3546aeec6c9891f0dd9929c2d3bedbf013c12e02e7dd0346af09c37e008c827'],
  flan: ['flan-t5-xl', '3ae0799123a1fe11f8c89fd99632f843fc5f7a761630160521c4253149754523']
}

// Your Replicate username/model to save the fine-tuning to,
// for example 'fofr/flan'
const destination = ''

// URL to your training data
const train_data = ''

// Choose your LLM (llama, gpt, flan)
const [llm, version] = LLMs.llama

const replicate = new Replicate({
  auth: process.env.REPLICATE_API_TOKEN,
})

// Create a new fine-tuning job on Replicate
async function main() {
  const training = await replicate.trainings.create('replicate', llm, version, {
    destination,
    input: { train_data }
  })

  console.log(training)
  console.log(`URL: https://replicate.com/p/${training.id}`)
}

// ID of an existing training job, used by get() and cancel()
const training_id = ''

// Retrieve the details of an existing training job
async function get() {
  const training = await replicate.trainings.get(training_id)
  console.log(training)
}

// Cancel an in-progress training job
async function cancel() {
  const training = await replicate.trainings.cancel(training_id)
  console.log(training)
}

main()
--------------------------------------------------------------------------------
/combine.js:
--------------------------------------------------------------------------------
import fs from 'fs/promises'
import path from 'path'

const trainingDir = 'training_data'
const combinedFileName = process.argv[2]

// If no output filename is provided, exit the program
if (!combinedFileName) {
  console.error('Please provide a filename')
  process.exit(1)
}

async function main() {
  try {
    // Read all files in the training directory
    const files = await fs.readdir(trainingDir)

    // Filter the .jsonl files
    const jsonlFiles = files.filter(file => file.endsWith('.jsonl'))

    // Initialize an empty array to store the content of each .jsonl file
    const jsonlContents = []

    // Iterate through each .jsonl file
    for (const jsonlFile of jsonlFiles) {
      // Read the content of the .jsonl file
      const content = await fs.readFile(path.join(trainingDir, jsonlFile), 'utf-8')

      // Keep only non-empty lines that look like prompt/completion pairs,
      // dropping stray JSON array brackets
      const filteredLines = content
        .split('\n')
        .filter(line => {
          const trimmedLine = line.trim()
          return (
            trimmedLine.length > 0 &&
            !trimmedLine.startsWith('[') &&
            !trimmedLine.endsWith(']') &&
            trimmedLine.includes('{"prompt":"') &&
            trimmedLine.includes(',"completion":"')
          )
        })

      // Join the filtered lines and add the content to the array
      jsonlContents.push(filteredLines.join('\n'))
    }

    // Combine all contents into a single string with newline separation
    const combinedContent = jsonlContents.join('\n')

    // Split the combined content into lines and filter out the empty lines
    const finalContent = combinedContent
      .split('\n')
      .filter(line => line.trim().length > 0)
      .join('\n')

    // Write the final content to the output file
    await fs.writeFile(combinedFileName, finalContent)

    console.log(`Combined .jsonl files into ${combinedFileName}`)
  } catch (err) {
    console.error('An error occurred:', err)
  }
}

main()
--------------------------------------------------------------------------------
/splitXML.js:
--------------------------------------------------------------------------------
import fs from 'fs-extra'
import xml2js from 'xml2js'
const parser = new xml2js.Parser()
const args = process.argv.slice(2)

if (args.length < 1) {
  console.error('Usage: node splitXML.js <input-file.xml>')
  process.exit(1)
}

const inputFile = args[0]
const outputFolder = 'pages'
const excludedPrefixes = [
  'User:',
  'User talk:',
  'User blog:',
  'User blog comment:',
  'MediaWiki:',
  'File:',
  'File talk:',
  'Talk:',
  'Template:',
  'Template talk:',
  'Forum:',
  'Forum talk:',
  'Board:',
  'Category:',
  'Category talk:',
  'Module:',
  'Thread:',

  // Wiki-specific
  'Memory Alpha:'
]

const excludedTitles = [
  'Wiki:',
  'Wiki talk:'
]

// Keep only content pages: drop excluded namespaces, titles and redirects
const filterPages = (page) => {
  return (
    !excludedPrefixes.some(prefix => page.title[0].startsWith(prefix)) &&
    !excludedTitles.some(title => page.title[0].includes(title)) &&
    !page.hasOwnProperty('redirect')
  )
}

// Remove interwiki links, category/file links and template-only lines
const excludeLinesWithoutContent = (text) => {
  const startsWith = [
    '[[en:',
    '[[da:',
    '[[de:',
    '[[fr:',
    '[[fi:',
    '[[es:',
    '[[it:',
    '[[no:',
    '[[nl:',
    '[[pl:',
    '[[pt:',
    '[[pt-br:',
    '[[ru:',
    '[[sv:',
    '[[zh:',
    '[[bg:',
    '[[cs:',
    '[[ja:',
    '[[Category:',
    '[[File:',
    '*{{startrek.com',
    '*{{Wikipedia}}',
    '*{{mbeta}}',
    ';{{visible'
  ]

  const lines = text.split('\n')
  const filteredLines = lines.filter(line => {
    const lineWithoutSpaces = line.replace(/\s/g, '')
    return !startsWith.some(prefix => lineWithoutSpaces.startsWith(prefix))
  })
  return filteredLines.join('\n')
}

// Split text into chunks of at most maxChunkSize characters, breaking on newlines
function breakTextIntoChunks (text, maxChunkSize = 10000) {
  const lines = text.split('\n')
  const chunks = []
  let chunk = ''
  let chunkSize = 0

  for (const line of lines) {
    const lineSize = line.length + 1 // Adding 1 for the newline character
    if (chunkSize + lineSize <= maxChunkSize) {
      chunk += line + '\n'
      chunkSize += lineSize
    } else {
      chunks.push(chunk)
      chunk = line + '\n'
      chunkSize = lineSize
    }
  }

  if (chunk) {
    chunks.push(chunk)
  }

  return chunks
}

fs.readFile(inputFile, 'utf8', (err, data) => {
  if (err) {
    console.error(`Error reading XML file: ${err}`)
    return
  }

  parser.parseString(data, (err, result) => {
    if (err) {
      console.error(`Error parsing XML: ${err}`)
      return
    }

    const pages = result.mediawiki.page.filter(filterPages)

    fs.ensureDir(outputFolder)
      .then(() => {
        pages.forEach(page => {
          const title = page.title[0].replace(/\//g, '-')
          const timestamp = page.revision[0].timestamp[0].split('T')[0]
          const filename = `${title}_${timestamp}`.replace(/[^\w\/]|_/g, '-').toLowerCase()
          let text = page.revision[0].text[0]._
          // Strip HTML-style tags, then drop lines without real content
          text = excludeLinesWithoutContent(text.replace(/<[^>]*>/g, ''))
          const chunks = breakTextIntoChunks(text)

          // Each chunk keeps the page title as its first line for context
          for (const [index, chunk] of chunks.entries()) {
            const outputData = `
${title}

${chunk}`.trim()

            const outputFile = `${outputFolder}/${filename}_${index}.txt`
            fs.writeFile(outputFile, outputData)
              .then(() => console.log(`Created: ${outputFile}`))
              .catch(err => console.error(`Error writing ${outputFile}: ${err}`))
          }
        })
      })
      .catch(err => console.error(`Error creating output folder: ${err}`))
  })
})
--------------------------------------------------------------------------------
/generate-training.js:
--------------------------------------------------------------------------------
import fs from 'fs'
import path from 'path'
import * as dotenv from 'dotenv'
import { ChatGPTAPI } from 'chatgpt'
dotenv.config()

const outputDir = 'pages'
const trainingDir = 'training_data'
const concurrentRequests = 10

const getSystemMessage = (questionCount) => {
  return `
Act as a JavaScript API, you only return JSON. Output will be parsed as JSONL, non-JSONL output will be ignored.

You will be given a text block. Based on that text you must craft ${questionCount} questions and answers.
Return those in JSON format. You do not need to access any external content.

In the text block, the first line is the title of the content. Incorporate the title into your question.

Return your JSON using JSONL in the format:
{ "prompt": [the first question], "completion": [the answer] }
{ "prompt": [the second question], "completion": [the answer] }
{ "prompt": [the third question], "completion": [the answer] }
...`
}

// A line is valid if it is empty, or parses as JSON with usable prompt and
// completion values (strings, or arrays that can be joined into strings)
const isValidJSONL = (jsonl) => {
  const lines = jsonl.split('\n')
  return lines.every((line) => {
    try {
      if (line.trim() === '') return true
      const obj = JSON.parse(line)
      const isUsable = (value) => typeof value === 'string' || Array.isArray(value)
      return isUsable(obj.prompt) && isUsable(obj.completion)
    } catch (e) {
      return false
    }
  })
}

// Re-serialise each line, normalising array prompts/completions into plain
// strings so the written output matches what validation accepted
const parseJSONL = (jsonl) => {
  const lines = jsonl.split('\n')
  return lines
    .filter(line => line.trim() !== '')
    .map(line => {
      const obj = JSON.parse(line)
      if (Array.isArray(obj.prompt)) obj.prompt = obj.prompt.join(' ')
      if (Array.isArray(obj.completion)) obj.completion = obj.completion.join(' ')
      return JSON.stringify(obj)
    })
    .join('\n')
}

async function main() {
  const apiKey = process.env.OPENAI_API_KEY
  if (!apiKey) {
    console.error('Please set the OPENAI_API_KEY environment variable')
    process.exit(1)
  }

  // Make the training directory if it doesn't exist
  fs.mkdirSync(trainingDir, { recursive: true })

  const files = fs.readdirSync(outputDir)
  const fileChunks = []

  // Batch the files so only concurrentRequests are in flight at a time
  for (let i = 0; i < files.length; i += concurrentRequests) {
    fileChunks.push(files.slice(i, i + concurrentRequests))
  }

  for (const chunk of fileChunks) {
    const chunkPromises = chunk.map(async (file) => {
      const trainingPath = path.join(trainingDir, path.basename(file, '.txt') + '.jsonl')
      if (fs.existsSync(trainingPath)) {
        console.log(`Skipping ${file} as a .jsonl file already exists`)
        return
      }

      console.log(`Processing ${file}`)
      const content = fs.readFileSync(path.join(outputDir, file), 'utf-8')
      // Ask for roughly one question per 500 characters, capped at 10
      const questionCount = content.length > 5000 ? 10 : Math.ceil(content.length / 500)

      console.log(`${content.length} characters, ${questionCount} questions`)
      const chatAgent = new ChatGPTAPI({
        apiKey,
        systemMessage: getSystemMessage(questionCount)
      })

      try {
        const res = await chatAgent.sendMessage(content)

        if (isValidJSONL(res.text)) {
          const parsedContent = parseJSONL(res.text)
          fs.writeFileSync(trainingPath, parsedContent)
          console.log(`Wrote parsed JSONL to ${trainingPath}`)
        } else {
          console.error(`Invalid JSONL in:\n\n${res.text}`)
        }
      } catch (err) {
        console.error(`An error occurred while processing ${file}:`, err)
      }
    })

    await Promise.all(chunkPromises)
  }
}

main().catch((err) => {
  console.error('An error occurred:', err)
  process.exit(1)
})
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Generate training data and fine-tune an LLM from a MediaWiki XML file

Creates a `.jsonl` file in the format:

```json
{"prompt":"Question","completion":"Answer"}
```

This data can be used to fine-tune a large language model.

This process uses GPT-3.5 to generate training data. The data can only be used to create models that do not compete with OpenAI, as per the OpenAI terms of service.

## Usage

Clone the repo and install dependencies:

```sh
git clone https://github.com/fofr/mediawiki-to-training-data
cd mediawiki-to-training-data
npm install
```

Then copy your [OpenAI API key](https://platform.openai.com/account/api-keys) and [Replicate API token](https://replicate.com/docs/get-started/nodejs) and add them to a `.env` file:

```sh
echo "OPENAI_API_KEY=" > .env
echo "REPLICATE_API_TOKEN=" >> .env
```

### Download MediaWiki content

You can download MediaWiki XML files from any MediaWiki site, Wikipedia being the most well-known of these.

To download an individual article from Wikipedia, use URLs in this format: `https://en.wikipedia.org/wiki/Special:Export/Title_of_the_article`

Examples:

- https://en.wikipedia.org/wiki/Special:Export/Natural_language_processing
- https://en.wikipedia.org/wiki/Special:Export/Super_Mario_Bros.
- https://en.wikipedia.org/wiki/Special:Export/Neo_(The_Matrix)

### Parse XML

Split a MediaWiki XML file into individual text files.

Usage:

```sh
node splitXML.js your-media-wiki-file.xml
```

This writes a text file for each wiki page into the `pages/` directory.

Long pages are chunked into separate files of at most 10,000 characters each. Chunks break on newlines, and each chunk keeps the title for context.

MediaWiki redirects, and pages such as user pages, talk pages and files, are all ignored.

### Generate training data

Use the OpenAI API to pass MediaWiki content to GPT-3.5 and ask it to create questions and answers based on the information given. The WikiText formatting is passed directly to GPT, as it can interpret it.

```sh
node generate-training.js
```

A maximum of 10 questions is requested per file; shorter content gets fewer questions.

JSONL output is requested. If GPT returns invalid JSONL, the output is not saved.

A `.jsonl` file is created for each text file in `training_data/`.

You can pause and resume the generation of training data. Pages with a corresponding `.jsonl` file are skipped.
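Each line of a generated file is a standalone prompt/completion pair. For illustration only (these are hypothetical values, not real model output), a file generated from the Natural_language_processing export above might contain lines like:

```json
{"prompt":"What is the Natural language processing article about?","completion":"It describes how computers process and analyse natural language."}
{"prompt":"Which fields does natural language processing draw on?","completion":"It draws on linguistics, computer science and artificial intelligence."}
```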
### Combine training data

When you have enough training data, combine it into a single `.jsonl` file for training:

```sh
node combine.js your-output-file.jsonl
```

### Fine-tune an LLM using Replicate

Now that you have your training data, you can fine-tune an LLM.

`train.js` is a script for [fine-tuning pre-trained language models using the Replicate API](https://replicate.com/docs/guides/fine-tune-a-language-model). You can fine-tune a language model with your own training data and choose from one of the available models: Llama, GPT, and Flan.

You need:

- An account on Replicate
- Your Replicate API token
- A destination model to save your fine-tuned model to ([Create a model](https://replicate.com/create))
- A URL pointing to your generated training data

Set the `destination` variable to the username and model name you want to save the fine-tuned model to. For example, if your username is `myusername` and you want to save the model as `mymodel`, set `destination` to `'myusername/mymodel'`.

Set the `train_data` variable to the URL of your training data.

Choose the pre-trained model you want to fine-tune. Use one of the following:

```js
const [llm, version] = LLMs.llama // For Llama model
const [llm, version] = LLMs.gpt // For GPT model
const [llm, version] = LLMs.flan // For Flan model
```

After setting the variables, save the file and run the script with `node train.js`. The script will create a new fine-tuning job using the Replicate API and print the details of the job, including the URL for tracking the progress of your fine-tuning.

### Track progress or cancel a job

`train.js` also includes two functions:

- `get()`
- `cancel()`

They can be used to retrieve the details of an existing training job or to cancel one. You can also see progress or cancel the job from the web UI, as shown in the sketch below.
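As a minimal sketch of the code path (assuming you paste in the ID printed when the job was created; `'your-training-id'` below is a placeholder), edit the bottom of `train.js` to set `training_id`, comment out `main()`, and call `get()` or `cancel()` instead, then rerun `node train.js`:

```js
// At the bottom of train.js — the ID comes from the
// https://replicate.com/p/<id> URL printed when the job was created
const training_id = 'your-training-id' // placeholder

// main()
get() // logs the training job's current details and status
// cancel() // or stop the job entirely
```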