├── .env.example
├── .gitignore
├── package.json
├── LICENSE
├── train.js
├── combine.js
├── splitXML.js
├── generate-training.js
└── README.md

/.env.example:
--------------------------------------------------------------------------------
OPENAI_API_KEY=
REPLICATE_API_TOKEN=
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
logs
*.log
node_modules/
.env
.env.test
*.7z
*.xml
pages/
training_data/
*.DS_Store
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "wiki-xml-parser",
  "version": "1.0.0",
  "description": "Split out a MediaWiki XML file into files",
  "main": "index.js",
  "type": "module",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/fofr/wiki-xml-parser.git"
  },
  "author": "fofr",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/fofr/wiki-xml-parser/issues"
  },
  "homepage": "https://github.com/fofr/wiki-xml-parser#readme",
  "dependencies": {
    "chatgpt": "^5.2.2",
    "dotenv": "^16.0.3",
    "fs-extra": "^11.1.1",
    "replicate": "^0.9.1",
    "xml2js": "^0.4.23"
  }
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 fofr

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/train.js:
--------------------------------------------------------------------------------
import Replicate from 'replicate'
import * as dotenv from 'dotenv'
dotenv.config()

// Available LLMs: [model name, version id]
const LLMs = {
  llama: ['llama-7b', '455d66312a66299fba685548fe24f66880f093007b927abd19f4356295f8577c'],
  gpt: ['gpt-j-6b', 'b3546aeec6c9891f0dd9929c2d3bedbf013c12e02e7dd0346af09c37e008c827'],
  flan: ['flan-t5-xl', '3ae0799123a1fe11f8c89fd99632f843fc5f7a761630160521c4253149754523']
}

// Your Replicate username/model to save the fine-tuning to,
// for example 'fofr/flan'
const destination = ''

// URL to your training data
const train_data = ''

// Choose your LLM (llama, gpt, flan)
const [llm, version] = LLMs.llama

const replicate = new Replicate({
  auth: process.env.REPLICATE_API_TOKEN,
})

// Create a new fine-tuning job on Replicate
async function main() {
  const training = await replicate.trainings.create('replicate', llm, version, {
    destination,
    input: { train_data }
  })

  console.log(training)
  console.log(`URL: https://replicate.com/p/${training.id}`)
}

// ID of an existing training job, used by get() and cancel()
const training_id = ''

// Retrieve the details of an existing training job
async function get() {
  const training = await replicate.trainings.get(training_id)
  console.log(training)
}

// Cancel an in-progress training job
async function cancel() {
  const training = await replicate.trainings.cancel(training_id)
  console.log(training)
}

main()
--------------------------------------------------------------------------------
/combine.js:
--------------------------------------------------------------------------------
import fs from 'fs/promises'
import path from 'path'

const trainingDir = 'training_data'
const combinedFileName = process.argv[2]

// If no output filename is provided, exit the program
if (!combinedFileName) {
  console.error('Please provide a filename')
  process.exit(1)
}

async function main() {
  try {
    // Read all files in the training directory
    const files = await fs.readdir(trainingDir)

    // Filter the .jsonl files
    const jsonlFiles = files.filter(file => file.endsWith('.jsonl'))

    // Initialize an empty array to store the content of each .jsonl file
    const jsonlContents = []

    // Iterate through each .jsonl file
    for (const jsonlFile of jsonlFiles) {
      // Read the content of the .jsonl file
      const content = await fs.readFile(path.join(trainingDir, jsonlFile), 'utf-8')

      // Keep only non-empty lines that look like prompt/completion pairs,
      // dropping stray JSON array brackets
      const filteredLines = content
        .split('\n')
        .filter(line => {
          const trimmedLine = line.trim()
          return (
            trimmedLine.length > 0 &&
            !trimmedLine.startsWith('[') &&
            !trimmedLine.endsWith(']') &&
            trimmedLine.includes('{"prompt":"') &&
            trimmedLine.includes(',"completion":"')
          )
        })

      // Join the filtered lines and add the content to the array
      jsonlContents.push(filteredLines.join('\n'))
    }

    // Combine all contents into a single string with newline separation
    const combinedContent = jsonlContents.join('\n')

    // Split the combined content into lines and filter out the empty lines
    const finalContent = combinedContent
      .split('\n')
      .filter(line => line.trim().length > 0)
      .join('\n')

    // Write the final content to the output file
    await fs.writeFile(combinedFileName, finalContent)

    console.log(`Combined .jsonl files into ${combinedFileName}`)
  } catch (err) {
    console.error('An error occurred:', err)
  }
}

main()
--------------------------------------------------------------------------------
/splitXML.js:
--------------------------------------------------------------------------------
import fs from 'fs-extra'
import xml2js from 'xml2js'
const parser = new xml2js.Parser()
const args = process.argv.slice(2)

if (args.length < 1) {
  console.error('Usage: node splitXML.js <input-file.xml>')
  process.exit(1)
}

const inputFile = args[0]
const outputFolder = 'pages'
const excludedPrefixes = [
  'User:',
  'User talk:',
  'User blog:',
  'User blog comment:',
  'MediaWiki:',
  'File:',
  'File talk:',
  'Talk:',
  'Template:',
  'Template talk:',
  'Forum:',
  'Forum talk:',
  'Board:',
  'Category:',
  'Category talk:',
  'Module:',
  'Thread:',

  // Wiki-specific
  'Memory Alpha:'
]

const excludedTitles = [
  'Wiki:',
  'Wiki talk:'
]

// Keep only content pages: drop excluded namespaces, titles and redirects
const filterPages = (page) => {
  return (
    !excludedPrefixes.some(prefix => page.title[0].startsWith(prefix)) &&
    !excludedTitles.some(title => page.title[0].includes(title)) &&
    !page.hasOwnProperty('redirect')
  )
}

// Remove interwiki links, category/file links and template-only lines
const excludeLinesWithoutContent = (text) => {
  const startsWith = [
    '[[en:',
    '[[da:',
    '[[de:',
    '[[fr:',
    '[[fi:',
    '[[es:',
    '[[it:',
    '[[no:',
    '[[nl:',
    '[[pl:',
    '[[pt:',
    '[[pt-br:',
    '[[ru:',
    '[[sv:',
    '[[zh:',
    '[[bg:',
    '[[cs:',
    '[[ja:',
    '[[Category:',
    '[[File:',
    '*{{startrek.com',
    '*{{Wikipedia}}',
    '*{{mbeta}}',
    ';{{visible'
  ]

  const lines = text.split('\n')
  const filteredLines = lines.filter(line => {
    const lineWithoutSpaces = line.replace(/\s/g, '')
    return !startsWith.some(prefix => lineWithoutSpaces.startsWith(prefix))
  })
  return filteredLines.join('\n')
}

// Split text into chunks of at most maxChunkSize characters, breaking on newlines
function breakTextIntoChunks (text, maxChunkSize = 10000) {
  const lines = text.split('\n')
  const chunks = []
  let chunk = ''
  let chunkSize = 0

  for (const line of lines) {
    const lineSize = line.length + 1 // Adding 1 for the newline character
    if (chunkSize + lineSize <= maxChunkSize) {
      chunk += line + '\n'
      chunkSize += lineSize
    } else {
      chunks.push(chunk)
      chunk = line + '\n'
      chunkSize = lineSize
    }
  }

  if (chunk) {
    chunks.push(chunk)
  }

  return chunks
}

fs.readFile(inputFile, 'utf8', (err, data) => {
  if (err) {
    console.error(`Error reading XML file: ${err}`)
    return
  }

  parser.parseString(data, (err, result) => {
    if (err) {
      console.error(`Error parsing XML: ${err}`)
      return
    }

    const pages = result.mediawiki.page.filter(filterPages)

    fs.ensureDir(outputFolder)
      .then(() => {
        pages.forEach(page => {
          const title = page.title[0].replace(/\//g, '-')
          const timestamp = page.revision[0].timestamp[0].split('T')[0]
          const filename = `${title}_${timestamp}`.replace(/[^\w\/]|_/g, '-').toLowerCase()
          let text = page.revision[0].text[0]._
          // Strip HTML-style tags, then drop lines without real content
          text = excludeLinesWithoutContent(text.replace(/<[^>]*>/g, ''))
          const chunks = breakTextIntoChunks(text)

          // Each chunk keeps the page title as its first line for context
          for (const [index, chunk] of chunks.entries()) {
            const outputData = `
${title}

${chunk}`.trim()

            const outputFile = `${outputFolder}/${filename}_${index}.txt`
            fs.writeFile(outputFile, outputData)
              .then(() => console.log(`Created: ${outputFile}`))
              .catch(err => console.error(`Error writing ${outputFile}: ${err}`))
          }
        })
      })
      .catch(err => console.error(`Error creating output folder: ${err}`))
  })
})
--------------------------------------------------------------------------------
/generate-training.js:
--------------------------------------------------------------------------------
import fs from 'fs'
import path from 'path'
import * as dotenv from 'dotenv'
import { ChatGPTAPI } from 'chatgpt'
dotenv.config()

const outputDir = 'pages'
const trainingDir = 'training_data'
const concurrentRequests = 10

const getSystemMessage = (questionCount) => {
  return `
Act as a JavaScript API, you only return JSON. Output will be parsed as JSONL, non-JSONL output will be ignored.

You will be given a text block. Based on that text you must craft ${questionCount} questions and answers.
Return those in JSON format. You do not need to access any external content.

In the text block, the first line is the title of the content. Incorporate the title into your question.

Return your JSON using JSONL in the format:
{ "prompt": [the first question], "completion": [the answer] }
{ "prompt": [the second question], "completion": [the answer] }
{ "prompt": [the third question], "completion": [the answer] }
...`
}

// A line is valid if it is empty, or parses as JSON with usable prompt and
// completion values (strings, or arrays that can be joined into strings)
const isValidJSONL = (jsonl) => {
  const lines = jsonl.split('\n')
  return lines.every((line) => {
    try {
      if (line.trim() === '') return true
      const obj = JSON.parse(line)
      const isUsable = (value) => typeof value === 'string' || Array.isArray(value)
      return isUsable(obj.prompt) && isUsable(obj.completion)
    } catch (e) {
      return false
    }
  })
}

// Re-serialise each line, normalising array prompts/completions into plain
// strings so the written output matches what validation accepted
const parseJSONL = (jsonl) => {
  const lines = jsonl.split('\n')
  return lines
    .filter(line => line.trim() !== '')
    .map(line => {
      const obj = JSON.parse(line)
      if (Array.isArray(obj.prompt)) obj.prompt = obj.prompt.join(' ')
      if (Array.isArray(obj.completion)) obj.completion = obj.completion.join(' ')
      return JSON.stringify(obj)
    })
    .join('\n')
}

async function main() {
  const apiKey = process.env.OPENAI_API_KEY
  if (!apiKey) {
    console.error('Please set the OPENAI_API_KEY environment variable')
    process.exit(1)
  }

  // Make the training directory if it doesn't exist
  fs.mkdirSync(trainingDir, { recursive: true })

  const files = fs.readdirSync(outputDir)
  const fileChunks = []

  // Batch the files so only concurrentRequests are in flight at a time
  for (let i = 0; i < files.length; i += concurrentRequests) {
    fileChunks.push(files.slice(i, i + concurrentRequests))
  }

  for (const chunk of fileChunks) {
    const chunkPromises = chunk.map(async (file) => {
      const trainingPath = path.join(trainingDir, path.basename(file, '.txt') + '.jsonl')
      if (fs.existsSync(trainingPath)) {
        console.log(`Skipping ${file} as a .jsonl file already exists`)
        return
      }

      console.log(`Processing ${file}`)
      const content = fs.readFileSync(path.join(outputDir, file), 'utf-8')
      // Ask for roughly one question per 500 characters, capped at 10
      const questionCount = content.length > 5000 ? 10 : Math.ceil(content.length / 500)

      console.log(`${content.length} characters, ${questionCount} questions`)
      const chatAgent = new ChatGPTAPI({
        apiKey,
        systemMessage: getSystemMessage(questionCount)
      })

      try {
        const res = await chatAgent.sendMessage(content)

        if (isValidJSONL(res.text)) {
          const parsedContent = parseJSONL(res.text)
          fs.writeFileSync(trainingPath, parsedContent)
          console.log(`Wrote parsed JSONL to ${trainingPath}`)
        } else {
          console.error(`Invalid JSONL in:\n\n${res.text}`)
        }
      } catch (err) {
        console.error(`An error occurred while processing ${file}:`, err)
      }
    })

    await Promise.all(chunkPromises)
  }
}

main().catch((err) => {
  console.error('An error occurred:', err)
  process.exit(1)
})
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Generate training data and fine-tune an LLM from a MediaWiki XML file

Creates a `.jsonl` file in the format:

```json
{"prompt":"Question","completion":"Answer"}
```

This data can be used to fine-tune a large language model.

This process uses GPT-3.5 to generate training data. The data can only be used to create models that do not compete with OpenAI, as per the OpenAI terms of service.

## Usage

Clone the repo and install dependencies:

```sh
git clone https://github.com/fofr/mediawiki-to-training-data
cd mediawiki-to-training-data
npm install
```

Then copy your [OpenAI API key](https://platform.openai.com/account/api-keys) and [Replicate API token](https://replicate.com/docs/get-started/nodejs) and add them to a `.env` file:

```sh
echo "OPENAI_API_KEY=" > .env
echo "REPLICATE_API_TOKEN=" >> .env
```

### Download MediaWiki content

You can download MediaWiki XML files from any MediaWiki site, Wikipedia being the most well-known of these.

To download an individual article from Wikipedia, use URLs in this format: `https://en.wikipedia.org/wiki/Special:Export/Title_of_the_article`

Examples:

- https://en.wikipedia.org/wiki/Special:Export/Natural_language_processing
- https://en.wikipedia.org/wiki/Special:Export/Super_Mario_Bros.
- https://en.wikipedia.org/wiki/Special:Export/Neo_(The_Matrix)

### Parse XML

Split a MediaWiki XML file into individual text files.

Usage:

```sh
node splitXML.js your-media-wiki-file.xml
```

This writes a text file for each wiki page into the `pages/` directory.

Long pages are chunked into separate files of at most 10,000 characters each. Chunks break on newlines, and each chunk keeps the title for context.

MediaWiki redirects, and pages such as user pages, talk pages and files, are all ignored.

### Generate training data

Use the OpenAI API to pass MediaWiki content to GPT-3.5 and ask it to create questions and answers based on the information given. The WikiText formatting is passed directly to GPT, as it can interpret it.

```sh
node generate-training.js
```

A maximum of 10 questions is requested per file; shorter content gets fewer questions.

JSONL output is requested. If GPT returns invalid JSONL, the output is not saved.

A `.jsonl` file is created for each text file in `training_data/`.

You can pause and resume the generation of training data. Pages with a corresponding `.jsonl` file are skipped.
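Each line of a generated file is a standalone prompt/completion pair. For illustration only (these are hypothetical values, not real model output), a file generated from the Natural_language_processing export above might contain lines like:

```json
{"prompt":"What is the Natural language processing article about?","completion":"It describes how computers process and analyse natural language."}
{"prompt":"Which fields does natural language processing draw on?","completion":"It draws on linguistics, computer science and artificial intelligence."}
```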
### Combine training data

When you have enough training data, combine it into a single `.jsonl` file for training:

```sh
node combine.js your-output-file.jsonl
```

### Fine-tune an LLM using Replicate

Now that you have your training data, you can fine-tune an LLM.

`train.js` is a script for [fine-tuning pre-trained language models using the Replicate API](https://replicate.com/docs/guides/fine-tune-a-language-model). You can fine-tune a language model with your own training data and choose from one of the available models: Llama, GPT, and Flan.

You need:

- An account on Replicate
- Your Replicate API token
- A destination model to save your fine-tuned model to ([Create a model](https://replicate.com/create))
- A URL pointing to your generated training data

Set the `destination` variable to the username and model name you want to save the fine-tuned model to. For example, if your username is `myusername` and you want to save the model as `mymodel`, set `destination` to `'myusername/mymodel'`.

Set the `train_data` variable to the URL of your training data.

Choose the pre-trained model you want to fine-tune. Use one of the following:

```js
const [llm, version] = LLMs.llama // For Llama model
const [llm, version] = LLMs.gpt // For GPT model
const [llm, version] = LLMs.flan // For Flan model
```

After setting the variables, save the file and run the script with `node train.js`. The script will create a new fine-tuning job using the Replicate API and print the details of the job, including the URL for tracking the progress of your fine-tuning.

### Track progress or cancel a job

`train.js` also includes two functions:

- `get()`
- `cancel()`

They can be used to retrieve the details of an existing training job or to cancel one. You can also see progress or cancel the job from the web UI, as shown in the sketch below.
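As a minimal sketch of the code path (assuming you paste in the ID printed when the job was created; `'your-training-id'` below is a placeholder), edit the bottom of `train.js` to set `training_id`, comment out `main()`, and call `get()` or `cancel()` instead, then rerun `node train.js`:

```js
// At the bottom of train.js — the ID comes from the
// https://replicate.com/p/<id> URL printed when the job was created
const training_id = 'your-training-id' // placeholder

// main()
get() // logs the training job's current details and status
// cancel() // or stop the job entirely
```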