import OpenAI from "openai";
import fs from "fs";
import { pipeline } from "stream/promises";
import { parse, stringify } from "csv";
import { systemTemplate } from "./system_template.js";
import dotenv from 'dotenv';
dotenv.config();

const model = "gpt-4-1106-preview"; // set model here - set to GPT-4 turbo as default - recommended
const batchSize = 10; // set amount of batch for each api call
const csv_path = 'yourcsvpath.csv'; // set name of csv file that should get processed
const input_name = "text"; // set input name here (i.e. the column in the csv file that will be used)

/**
 * Reads a comma-delimited CSV file into memory, one object per record keyed
 * by the header row.
 *
 * @param {string} filePath - Path of the CSV file to read.
 * @returns {Promise<object[]>} Resolves with all parsed rows; rejects on a
 *   file-system error (e.g. missing file) or a parse error.
 */
function readCsv(filePath) {
  return new Promise((resolve, reject) => {
    const rows = [];
    fs.createReadStream(filePath)
      // pipe() does not forward source errors, so a missing/unreadable file
      // must be handled on the read stream itself or it crashes the process.
      .on('error', reject)
      .pipe(parse({ delimiter: ',', columns: true }))
      .on('data', (row) => rows.push(row))
      .on('end', () => resolve(rows))
      .on('error', reject);
  });
}

/**
 * Backslash-escapes double quotes and replaces line breaks with a literal
 * "\n" so that a value always occupies a single line of the user message.
 *
 * @param {unknown} str - Value to escape; null/undefined become "".
 * @returns {string} The escaped, single-line text.
 */
function escapeQuotes(str) {
  return String(str ?? "").replace(/"/g, '\\"').replace(/\n/g, "\\n");
}

/**
 * Sends one batch of rows to the chat completions endpoint and returns the
 * parsed JSON object produced by the model.
 *
 * @param {object[]} rows - Batch of CSV rows; the `input_name` column of each is sent.
 * @param {OpenAI} openai - Configured OpenAI client.
 * @returns {Promise<object|null>} Parsed response object, or null when the
 *   request or the JSON parsing failed (the error is logged).
 */
async function runOpenAIChat(rows, openai) {
  try {
    // will create a row of input_name: input_value \n x batch size
    const textInputs = rows
      .map((row) => `${input_name}: ${escapeQuotes(row[input_name])}`)
      .join(' \n ');
    const response = await openai.chat.completions.create({
      model,
      messages: [
        {
          role: "system",
          // Sent verbatim: the client JSON-encodes the request body, so manual
          // escaping here would put literal backslashes into the prompt.
          content: systemTemplate,
        },
        {
          role: "user",
          content: textInputs,
        },
      ],
      temperature: 0,
      max_tokens: 1000,
      top_p: 1,
      frequency_penalty: 0,
      presence_penalty: 0,
      response_format: { "type": "json_object" }, // set to return a json object
    });
    const [choice] = response.choices;
    if (choice.finish_reason === "length") {
      // A truncated completion is never valid JSON; fail with an actionable message.
      throw new Error("Response was cut off at max_tokens; lower batchSize or raise max_tokens");
    }
    const content = JSON.parse(choice.message.content);
    console.log(content);
    return content;
  } catch (error) {
    console.error("Error occurred:", error);
    return null;
  }
}

/**
 * Reads the CSV at `filePath`, enriches its rows batch by batch with the
 * fields returned by the model, and writes the enriched rows to a new
 * `processedData_<timestamp>.csv` file in the working directory.
 * Rows of a batch that produced no usable response are left out of the output.
 *
 * @param {string} filePath - Path of the input CSV file.
 * @returns {Promise<void>} Resolves once the output file is fully written.
 * @throws {Error} If the input column is missing or the output cannot be written.
 */
async function processCsv(filePath) {
  const rows = await readCsv(filePath);
  if (rows.length > 0 && !(input_name in rows[0])) {
    throw new Error(`Column '${input_name}' not found in '${filePath}'`);
  }

  const allResults = [];
  const openai = new OpenAI({
    apiKey: process.env.OPENAI_API_KEY,
  });

  for (let i = 0; i < rows.length; i += batchSize) {
    const batchRows = rows.slice(i, i + batchSize);
    const batchResults = await runOpenAIChat(batchRows, openai);

    if (!batchResults) {
      console.warn(`Skipping rows ${i + 1}-${i + batchRows.length}: no usable response for this batch`);
      continue;
    }

    batchRows.forEach((row, index) => {
      // The system template keys results as text1, text2, ...; also accept
      // keys derived from a customised input name.
      const result = batchResults[`${input_name}${index + 1}`] ?? batchResults[`text${index + 1}`];
      if (result && typeof result === "object") {
        // dynamically add fields from the response
        Object.assign(row, result);
      }
    });
    allResults.push(...batchRows);
  }

  const outputFile = `processedData_${Date.now()}.csv`;
  // The stringifier discovers columns from the first record only, so pass the
  // union of all keys; otherwise a first row without a result drops every
  // generated column from the file.
  const columns = [...new Set(allResults.flatMap((row) => Object.keys(row)))];
  const csvStream = stringify(
    allResults,
    columns.length > 0 ? { header: true, columns } : { header: true },
  );

  try {
    await pipeline(csvStream, fs.createWriteStream(outputFile));
  } catch (err) {
    throw new Error("Error writing CSV file", { cause: err });
  }
  console.log(`CSV updated and saved as '${outputFile}'`);
}

processCsv(csv_path).catch((error) => {
  console.error("Failed to process CSV:", error);
  process.exitCode = 1;
});
11 | "dependencies": { 12 | "csv": "^6.3.5", 13 | "dotenv": "^16.3.1", 14 | "openai": "^4.19.1" 15 | } 16 | }, 17 | "node_modules/@types/node": { 18 | "version": "18.18.12", 19 | "resolved": "https://registry.npmjs.org/@types/node/-/node-18.18.12.tgz", 20 | "integrity": "sha512-G7slVfkwOm7g8VqcEF1/5SXiMjP3Tbt+pXDU3r/qhlM2KkGm786DUD4xyMA2QzEElFrv/KZV9gjygv4LnkpbMQ==", 21 | "dependencies": { 22 | "undici-types": "~5.26.4" 23 | } 24 | }, 25 | "node_modules/@types/node-fetch": { 26 | "version": "2.6.9", 27 | "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.9.tgz", 28 | "integrity": "sha512-bQVlnMLFJ2d35DkPNjEPmd9ueO/rh5EiaZt2bhqiSarPjZIuIV6bPQVqcrEyvNo+AfTrRGVazle1tl597w3gfA==", 29 | "dependencies": { 30 | "@types/node": "*", 31 | "form-data": "^4.0.0" 32 | } 33 | }, 34 | "node_modules/abort-controller": { 35 | "version": "3.0.0", 36 | "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", 37 | "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", 38 | "dependencies": { 39 | "event-target-shim": "^5.0.0" 40 | }, 41 | "engines": { 42 | "node": ">=6.5" 43 | } 44 | }, 45 | "node_modules/agentkeepalive": { 46 | "version": "4.5.0", 47 | "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz", 48 | "integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==", 49 | "dependencies": { 50 | "humanize-ms": "^1.2.1" 51 | }, 52 | "engines": { 53 | "node": ">= 8.0.0" 54 | } 55 | }, 56 | "node_modules/asynckit": { 57 | "version": "0.4.0", 58 | "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", 59 | "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" 60 | }, 61 | "node_modules/base-64": { 62 | "version": "0.1.0", 63 | "resolved": "https://registry.npmjs.org/base-64/-/base-64-0.1.0.tgz", 64 | "integrity": 
"sha512-Y5gU45svrR5tI2Vt/X9GPd3L0HNIKzGu202EjxrXMpuc2V2CiKgemAbUUsqYmZJvPtCXoUKjNZwBJzsNScUbXA==" 65 | }, 66 | "node_modules/charenc": { 67 | "version": "0.0.2", 68 | "resolved": "https://registry.npmjs.org/charenc/-/charenc-0.0.2.tgz", 69 | "integrity": "sha512-yrLQ/yVUFXkzg7EDQsPieE/53+0RlaWTs+wBrvW36cyilJ2SaDWfl4Yj7MtLTXleV9uEKefbAGUPv2/iWSooRA==", 70 | "engines": { 71 | "node": "*" 72 | } 73 | }, 74 | "node_modules/combined-stream": { 75 | "version": "1.0.8", 76 | "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", 77 | "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", 78 | "dependencies": { 79 | "delayed-stream": "~1.0.0" 80 | }, 81 | "engines": { 82 | "node": ">= 0.8" 83 | } 84 | }, 85 | "node_modules/crypt": { 86 | "version": "0.0.2", 87 | "resolved": "https://registry.npmjs.org/crypt/-/crypt-0.0.2.tgz", 88 | "integrity": "sha512-mCxBlsHFYh9C+HVpiEacem8FEBnMXgU9gy4zmNC+SXAZNB/1idgp/aulFJ4FgCi7GPEVbfyng092GqL2k2rmow==", 89 | "engines": { 90 | "node": "*" 91 | } 92 | }, 93 | "node_modules/csv": { 94 | "version": "6.3.5", 95 | "resolved": "https://registry.npmjs.org/csv/-/csv-6.3.5.tgz", 96 | "integrity": "sha512-Y+KTCAUljtq2JaGP42ZL1bymqlU5BkfnFpZhxRczGFDZox2VXhlRHnG5DRshyUrwQzmCdEiLjSqNldCfm1OVCA==", 97 | "dependencies": { 98 | "csv-generate": "^4.3.0", 99 | "csv-parse": "^5.5.2", 100 | "csv-stringify": "^6.4.4", 101 | "stream-transform": "^3.2.10" 102 | }, 103 | "engines": { 104 | "node": ">= 0.1.90" 105 | } 106 | }, 107 | "node_modules/csv-generate": { 108 | "version": "4.3.0", 109 | "resolved": "https://registry.npmjs.org/csv-generate/-/csv-generate-4.3.0.tgz", 110 | "integrity": "sha512-7KdVId/2RgwPIKfWHaHtjBq7I9mgdi8ICzsUyIhP8is6UwpwVGGSC/aPnrZ8/SkgBcCP20lXrdPuP64Irs1VBg==" 111 | }, 112 | "node_modules/csv-parse": { 113 | "version": "5.5.2", 114 | "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-5.5.2.tgz", 115 | "integrity": 
"sha512-YRVtvdtUNXZCMyK5zd5Wty1W6dNTpGKdqQd4EQ8tl/c6KW1aMBB1Kg1ppky5FONKmEqGJ/8WjLlTNLPne4ioVA==" 116 | }, 117 | "node_modules/csv-stringify": { 118 | "version": "6.4.4", 119 | "resolved": "https://registry.npmjs.org/csv-stringify/-/csv-stringify-6.4.4.tgz", 120 | "integrity": "sha512-NDshLupGa7gp4UG4sSNIqwYJqgSwvds0SvENntxoVoVvTzXcrHvd5gG2MWpbRpSNvk59dlmIe1IwNvSxN4IVmg==" 121 | }, 122 | "node_modules/delayed-stream": { 123 | "version": "1.0.0", 124 | "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", 125 | "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", 126 | "engines": { 127 | "node": ">=0.4.0" 128 | } 129 | }, 130 | "node_modules/digest-fetch": { 131 | "version": "1.3.0", 132 | "resolved": "https://registry.npmjs.org/digest-fetch/-/digest-fetch-1.3.0.tgz", 133 | "integrity": "sha512-CGJuv6iKNM7QyZlM2T3sPAdZWd/p9zQiRNS9G+9COUCwzWFTs0Xp8NF5iePx7wtvhDykReiRRrSeNb4oMmB8lA==", 134 | "dependencies": { 135 | "base-64": "^0.1.0", 136 | "md5": "^2.3.0" 137 | } 138 | }, 139 | "node_modules/dotenv": { 140 | "version": "16.3.1", 141 | "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.3.1.tgz", 142 | "integrity": "sha512-IPzF4w4/Rd94bA9imS68tZBaYyBWSCE47V1RGuMrB94iyTOIEwRmVL2x/4An+6mETpLrKJ5hQkB8W4kFAadeIQ==", 143 | "engines": { 144 | "node": ">=12" 145 | }, 146 | "funding": { 147 | "url": "https://github.com/motdotla/dotenv?sponsor=1" 148 | } 149 | }, 150 | "node_modules/event-target-shim": { 151 | "version": "5.0.1", 152 | "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", 153 | "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", 154 | "engines": { 155 | "node": ">=6" 156 | } 157 | }, 158 | "node_modules/form-data": { 159 | "version": "4.0.0", 160 | "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", 161 | "integrity": 
"sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", 162 | "dependencies": { 163 | "asynckit": "^0.4.0", 164 | "combined-stream": "^1.0.8", 165 | "mime-types": "^2.1.12" 166 | }, 167 | "engines": { 168 | "node": ">= 6" 169 | } 170 | }, 171 | "node_modules/form-data-encoder": { 172 | "version": "1.7.2", 173 | "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", 174 | "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==" 175 | }, 176 | "node_modules/formdata-node": { 177 | "version": "4.4.1", 178 | "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", 179 | "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", 180 | "dependencies": { 181 | "node-domexception": "1.0.0", 182 | "web-streams-polyfill": "4.0.0-beta.3" 183 | }, 184 | "engines": { 185 | "node": ">= 12.20" 186 | } 187 | }, 188 | "node_modules/formdata-node/node_modules/web-streams-polyfill": { 189 | "version": "4.0.0-beta.3", 190 | "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", 191 | "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", 192 | "engines": { 193 | "node": ">= 14" 194 | } 195 | }, 196 | "node_modules/humanize-ms": { 197 | "version": "1.2.1", 198 | "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", 199 | "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", 200 | "dependencies": { 201 | "ms": "^2.0.0" 202 | } 203 | }, 204 | "node_modules/is-buffer": { 205 | "version": "1.1.6", 206 | "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", 207 | "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==" 208 | }, 
209 | "node_modules/md5": { 210 | "version": "2.3.0", 211 | "resolved": "https://registry.npmjs.org/md5/-/md5-2.3.0.tgz", 212 | "integrity": "sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==", 213 | "dependencies": { 214 | "charenc": "0.0.2", 215 | "crypt": "0.0.2", 216 | "is-buffer": "~1.1.6" 217 | } 218 | }, 219 | "node_modules/mime-db": { 220 | "version": "1.52.0", 221 | "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", 222 | "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", 223 | "engines": { 224 | "node": ">= 0.6" 225 | } 226 | }, 227 | "node_modules/mime-types": { 228 | "version": "2.1.35", 229 | "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", 230 | "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", 231 | "dependencies": { 232 | "mime-db": "1.52.0" 233 | }, 234 | "engines": { 235 | "node": ">= 0.6" 236 | } 237 | }, 238 | "node_modules/ms": { 239 | "version": "2.1.3", 240 | "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", 241 | "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" 242 | }, 243 | "node_modules/node-domexception": { 244 | "version": "1.0.0", 245 | "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", 246 | "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", 247 | "funding": [ 248 | { 249 | "type": "github", 250 | "url": "https://github.com/sponsors/jimmywarting" 251 | }, 252 | { 253 | "type": "github", 254 | "url": "https://paypal.me/jimmywarting" 255 | } 256 | ], 257 | "engines": { 258 | "node": ">=10.5.0" 259 | } 260 | }, 261 | "node_modules/node-fetch": { 262 | "version": "2.7.0", 263 | "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", 264 
| "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", 265 | "dependencies": { 266 | "whatwg-url": "^5.0.0" 267 | }, 268 | "engines": { 269 | "node": "4.x || >=6.0.0" 270 | }, 271 | "peerDependencies": { 272 | "encoding": "^0.1.0" 273 | }, 274 | "peerDependenciesMeta": { 275 | "encoding": { 276 | "optional": true 277 | } 278 | } 279 | }, 280 | "node_modules/openai": { 281 | "version": "4.19.1", 282 | "resolved": "https://registry.npmjs.org/openai/-/openai-4.19.1.tgz", 283 | "integrity": "sha512-9TddzuZBn2xxhghGGTHLZ4EeNBGTLs3xVzh266NiSJvtUsCsZQ5yVV6H5NhnhyAkKK8uUiZOUUlUAk3HdV+4xg==", 284 | "dependencies": { 285 | "@types/node": "^18.11.18", 286 | "@types/node-fetch": "^2.6.4", 287 | "abort-controller": "^3.0.0", 288 | "agentkeepalive": "^4.2.1", 289 | "digest-fetch": "^1.3.0", 290 | "form-data-encoder": "1.7.2", 291 | "formdata-node": "^4.3.2", 292 | "node-fetch": "^2.6.7", 293 | "web-streams-polyfill": "^3.2.1" 294 | }, 295 | "bin": { 296 | "openai": "bin/cli" 297 | } 298 | }, 299 | "node_modules/stream-transform": { 300 | "version": "3.2.10", 301 | "resolved": "https://registry.npmjs.org/stream-transform/-/stream-transform-3.2.10.tgz", 302 | "integrity": "sha512-Yu+x7zcWbWdyB0Td8dFzHt2JEyD6694CNq2lqh1rbuEBVxPtjb/GZ7xDnZcdYiU5E/RtufM54ClSEOzZDeWguA==" 303 | }, 304 | "node_modules/tr46": { 305 | "version": "0.0.3", 306 | "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", 307 | "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" 308 | }, 309 | "node_modules/undici-types": { 310 | "version": "5.26.5", 311 | "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", 312 | "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" 313 | }, 314 | "node_modules/web-streams-polyfill": { 315 | "version": "3.2.1", 316 | "resolved": 
"https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.2.1.tgz", 317 | "integrity": "sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==", 318 | "engines": { 319 | "node": ">= 8" 320 | } 321 | }, 322 | "node_modules/webidl-conversions": { 323 | "version": "3.0.1", 324 | "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", 325 | "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" 326 | }, 327 | "node_modules/whatwg-url": { 328 | "version": "5.0.0", 329 | "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", 330 | "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", 331 | "dependencies": { 332 | "tr46": "~0.0.3", 333 | "webidl-conversions": "^3.0.0" 334 | } 335 | } 336 | } 337 | } 338 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dataset_creator", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "scripts": { 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "author": "", 11 | "license": "ISC", 12 | "dependencies": { 13 | "csv": "^6.3.5", 14 | "dotenv": "^16.3.1", 15 | "openai": "^4.19.1" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Create Dataset with GPT-4 Turbo 2 | 3 | **Notes:** the code needs to be tweaked if you're using something else than a 'text' field. 4 | 5 | ## Overview 6 | This script processes a CSV file containing any field(s) that you set and uses the OpenAI GPT-4 API to create new fields. It is useful to generate a dataset for fine-tuning. I.e. 
using a text field to generate summaries, keywords, or other derived values in another field with a custom system template. 7 | 8 | The script reads the CSV, processes the specified text fields, and appends the extracted information as new fields in the CSV with the help of GPT-4. The final output is a new CSV file with these enriched data fields that you can use to fine-tune a model. 9 | 10 | ### Example Result 11 | 12 | A CSV file like this 13 | 14 | ```markdown 15 | | Text | 16 | |----------| 17 | | "text 1" | 18 | | "text 2" | 19 | ``` 20 | 21 | The script will loop through the text fields and make API calls (batched by 10) to GPT-4 Turbo that return keywords (or whatever field you want it to return). It collects them and, at the end, generates a new CSV file that looks like this and saves it in your root folder. 22 | 23 | ```markdown 24 | | Text | Keywords | 25 | |----------|----------| 26 | | "text 1" | "key 1" | 27 | | "text 2" | "key 2" | 28 | ``` 29 | 30 | This is based on what you ask GPT-4 to do with the fields. See the system template to make sure it generates the correct fields. See the example test.csv file and the output CSV file processedData_1700838002264.csv. 31 | 32 | ## Requirements 33 | - Node.js 34 | - OpenAI API key 35 | - CSV file with at least one text field 36 | 37 | ## Setup 38 | 1. **Node.js**: Ensure Node.js is installed. 39 | 2. **Clone the Repository:** If you haven't already, clone the repository to your local machine: 40 | ``` 41 | git clone https://github.com/ilsilfverskiold/gpt-create-dataset.git 42 | ``` 43 | 3. **Navigate to the Directory:** Once cloned, navigate to the project directory: 44 | ``` 45 | cd gpt-create-dataset 46 | ``` 47 | 4. **Dependencies**: Run `npm install` to install required packages. 48 | 5. **API Key**: Place your OpenAI API key in a `.env` file as `OPENAI_API_KEY=your_api_key`. 49 | 6. **CSV File**: Add your CSV file to the root folder and make sure the index.js script has the correct path. 50 | 7. 
/**
 * System prompt sent with every batch of texts.
 *
 * It asks the model for one JSON entry per input text, keyed text1, text2, ...
 * in input order, each holding the string fields "keywords", "topic" and
 * "summary". The example output at the end is itself valid JSON so that it
 * matches the format the prompt asks for.
 */
export const systemTemplate = `

I need you to extract keywords, topics and summaries from texts scraped from various software development forums and blogs.

Further Instructions for Filtering Technical Terms:

General Principle: When filtering a list for technical relevance, lean towards inclusion rather than exclusion. The cost of erroneously excluding a relevant term is likely higher than including a potentially irrelevant one.

Tools & Platforms: Any term that appears to be a tool, software, platform, language or service (e.g., "Bubble", "AWS", "Docker", "Node.js", "Mongoose", "JavaScript") should be retained. Even if the term might seem generic in other contexts, consider its potential technical relevance. Here there might be a new tool trending so we need to be on the look out for these.

Exclude General Development Terms: Avoid general development terms unless they are unique identifiers of a technology or methodology. Terms like "user model", "controller", "module", "model", "services", "deploy to" should be excluded.

Extract the keyword: Extract the tool, platform, company name so it is easily identified. I.e. "amazon bedrock playground" would be "Amazon, Amazon Bedrock" as Amazon is a company and Amazon Bedrock is a tool so the response is Amazon AND Amazon Bedrock. Same with "sign in with Apple" would be "Apple, Authentication"

Specific Terminology: Retain words or phrases that indicate specific methodologies, concepts, or paradigms, even if they might be two or more words long (e.g., "low code", "no code", "neural network", "quantum computing", "finetuning", "fine tuning").

Unique Feature Identification: Determine if the combination of terms points to a unique feature or characteristic of the tool/platform. For example, "GitHub Actions" is a specific feature of GitHub and should be included, whereas "Clone from GitHub" is a general action not unique so only Github would be included.

Simplified Topic Extraction: Define the topic in simple terms that capture the main technology or methodology trend. Topics should be separated by commas for texts covering multiple areas. Example: "Backend Web Development, Database Modeling".

Contextual Importance: Include keywords that are crucial in the context, even if they are not traditional technical terms, but are significant for understanding the technology or methodology.

Single Mention: Each keyword should be mentioned only once, irrespective of its frequency in the text.

Exclude Non-important Terms: Omit words that are known to be non-important or too generic in technology discussions.

Industry Terms and Acronyms: Include relevant industry terms and acronyms that have specific meanings in a tech context.

Spelling and grammar: make sure you fix spelling mistakes and add the right acronyms when applicable. natural language processing would be "Natural Language Processing, NLP". aws would be "AWS".

Output Format: Deliver your response in valid json format as a single object with one entry per input text, keyed text1, text2, text3 and so on in input order. Each entry has the following keys: keywords, topic, summary. For keywords, list them in a comma-separated string within the "keywords" field. For topics, list them in a comma-separated string within the "topic" field. For summaries, create concise summaries of 3-5 words max.

Example Input:

text: "PartyRock With Leftovers! - As an early Beta tester for the PartyRock, an Amazon Bedrock Playground."

text: "The Smoking Gun: Adam DAngelo Has Side Deal Business Interest for POE And Now Wont Leave - The Irony of it All - TLDR Adam is the ringleader and was mad that Dev day caused POE to become instantly Obsolete."

text: "In the right place - On thriving as a woman in a male-dominated environment"

Example STRUCTURED Output object with the keywords, topic and summary for all texts provided:

{
  "text1": {
    "keywords": "PartyRock, Amazon Bedrock, Amazon, Beta Testing",
    "topic": "AI Application Development, Cloud Services",
    "summary": "Beta testing PartyRock"
  },
  "text2": {
    "keywords": "Adam D’Angelo, POE, Dev day, OpenAI, Sam Altman",
    "topic": "Tech Industry News, Artificial Intelligence, Corporate Politics",
    "summary": "Adam DAngelo's POE conflict"
  },
  "text3": {
    "keywords": "Women in Tech, Male Dominated",
    "topic": "Women in Tech, Male Dominated Environments",
    "summary": "Coping in tech as female"
  }
}`;

// Use ChatGPT or the GPT-4 playground to generate your system template. Cheaper to use ChatGPT to play around with it.