├── .env.example
├── .eslintrc.cjs
├── .gitignore
├── .vscode
    └── settings.json
├── README.md
├── package-lock.json
├── package.json
├── src
    ├── embeddings.ts
    ├── index.ts
    ├── recommend.ts
    ├── types.ts
    └── utils
    │   ├── chunkedUpsert.ts
    │   ├── csvLoader.ts
    │   ├── dataLoader.ts
    │   ├── fileSplitter.ts
    │   └── util.ts
├── tsconfig.json
├── tsup.config.js
└── turbo.json


/.env.example:
--------------------------------------------------------------------------------
1 | PINECONE_API_KEY=
2 | PINECONE_INDEX=
3 | PINECONE_CLOUD="aws"
4 | PINECONE_REGION="us-west-2"


--------------------------------------------------------------------------------
/.eslintrc.cjs:
--------------------------------------------------------------------------------
 1 | module.exports = {
 2 |   extends: [
 3 |     "airbnb-base",
 4 |     "eslint:recommended",
 5 |     "prettier",
 6 |     "plugin:@typescript-eslint/recommended",
 7 |   ],
 8 |   parserOptions: {
 9 |     ecmaVersion: 12,
10 |     parser: "@typescript-eslint/parser",
11 |     sourceType: "module",
12 |   },
13 |   plugins: ["@typescript-eslint"],
14 |   rules: {
15 |     "@typescript-eslint/explicit-module-boundary-types": 0,
16 |     "@typescript-eslint/no-empty-function": 0,
17 |     "@typescript-eslint/no-empty-any": 0,
18 |     "@typescript-eslint/no-shadow": 0,
19 |     "@typescript-eslint/no-use-before-define": ["error", "nofunc"],
20 |     "@typescript-eslint/no-unused-vars": ["warn", { args: "none" }],
21 |     "@typescript-eslint/no-explicit-any": 0,
22 |     camelcase: 0,
23 |     "import/no-extraneous-dependencies": 0,
24 |     "class-methods-use-this": 0,
25 |     "import/extensions": 0,
26 |     "import/no-unresolved": 0,
27 |     "import/prefer-default-export": 0,
28 |     "keyword-spacing": "error",
29 |     "max-classes-per-file": 0,
30 |     "max-len": 0,
31 |     "no-await-in-loop": 0,
32 |     "no-bitwise": 0,
33 |     "no-console": 0,
34 |     "no-restricted-syntax": 0,
35 |     "no-shadow": 0,
36 |     "no-underscore-dangle": 0,
37 |     "no-use-before-define": 0,
38 |     "no-useless-constructor": 0,
39 |     semi: ["error", "always"],
40 |   },
41 | };
42 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .env
3 | dist
4 | .turbo
5 | data/
6 | all-the-news-2-1.zip
7 | all-the-news-2-1.csv
8 | .DS_Store


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |   "dotenv.enableAutocloaking": false
3 | }
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Article Recommender
  2 | 
  3 | This tutorial demonstrates how to use Pinecone's similarity search to create a simple personalized article or content recommender.
  4 | 
  5 | The goal is to create a recommendation engine that retrieves the best article recommendations for each user. When making recommendations with content-based filtering, we evaluate the user’s past behavior and the content items themselves. So in this example, users will be recommended articles that are similar to those they've already read.
  6 | 
  7 | ```bash
  8 | npm install
  9 | ```
 10 | 
 11 | ## Required configuration
 12 | 
 13 | In order to run this example, you have to supply the Pinecone credentials needed to interact with the Pinecone API. You can find these credentials in the [Pinecone web console](https://app.pinecone.io) under **API Keys**. This project uses `dotenv` to easily load values from the `.env` file into the environment when executing.
 14 | 
 15 | Copy the template file:
 16 | 
 17 | ```sh
 18 | cp .env.example .env
 19 | ```
 20 | 
 21 | And fill in your API key and index name:
 22 | 
 23 | ```sh
 24 | PINECONE_API_KEY=<your-api-key>
 25 | PINECONE_INDEX="article-recommendations"
 26 | PINECONE_CLOUD="aws"
 27 | PINECONE_REGION="us-west-2"
 28 | ```
 29 | 
 30 | `PINECONE_INDEX` is the name of the index where this demo will store and query embeddings. You can change `PINECONE_INDEX` to any name you like, but make sure the name not going to collide with any indexes you are already using.
 31 | 
 32 | `PINECONE_CLOUD` and `PINECONE_REGION` define where the index should be deployed. Currently, this is the only available cloud and region combination (`aws` and `us-west-2`), so it's recommended to leave them defaulted.
 33 | 
 34 | ## Data preparation
 35 | 
 36 | Next, we will prepare data for the Pinecone vector index, and insert it in batches.
 37 | 
 38 | The [dataset](https://components.one/datasets/all-the-news-2-news-articles-dataset/) used throughout this example contains 2.7 million news articles and essays from 27 American publications.
 39 | 
 40 | Let's download the dataset.
 41 | 
 42 | ```bash
 43 | wget https://www.dropbox.com/s/cn2utnr5ipathhh/all-the-news-2-1.zip -q --show-progress
 44 | unzip -q all-the-news-2-1.zip
 45 | mkdir data
 46 | mv all-the-news-2-1.csv data/.
 47 | ```
 48 | 
 49 | ## Create Vector embeddings
 50 | 
 51 | To load data into our index, we need to create embeddings and upsert records into Pinecone. Run `npm run index` to do that in this project.
 52 | 
 53 | Since the dataset could be pretty big, this project uses a generator function that will yield chunks of data to be processed.
 54 | 
 55 | ```typescript
 56 | async function* processInChunks<T, M extends keyof T, P extends keyof T>(
 57 |   dataFrame: dfd.DataFrame,
 58 |   chunkSize: number,
 59 |   metadataFields: M[],
 60 |   pageContentField: P
 61 | ): AsyncGenerator<Document[]> {
 62 |   for (let i = 0; i < dataFrame.shape[0]; i += chunkSize) {
 63 |     const chunk = await getChunk(dataFrame, i, chunkSize);
 64 |     const records = dfd.toJSON(chunk) as T[];
 65 |     yield records.map((record: T) => {
 66 |       const metadata: Partial<Record<M, T[M]>> = {};
 67 |       for (const field of metadataFields) {
 68 |         metadata[field] = record[field];
 69 |       }
 70 |       return new Document({
 71 |         pageContent: record[pageContentField] as string,
 72 |         metadata,
 73 |       });
 74 |     });
 75 |   }
 76 | }
 77 | ```
 78 | 
 79 | For each chunk, the function generates an array of `Document` objects. The function is defined with three type parameters: `T`, `M`, and `P`.
 80 | 
 81 | Here are the parameters the function accepts:
 82 | 
 83 | - `dataFrame`: This is the DataFrame that the function will process.
 84 | - `chunkSize`: This is the number of records that will be processed in each chunk.
 85 | - `metadataFields`: This is an array of field names (which are keys of `T`) to be included in the metadata of each `Document`.
 86 | - `pageContentField`: This is the field name (which is a key of `T`) to be used for the page content of each `Document`.
 87 | 
 88 | Here's what the function does:
 89 | 
 90 | 1. It loops over the DataFrame in chunks of size `chunkSize`.
 91 | 2. For each chunk, it converts the chunk to JSON to get an array of records (of type `T`).
 92 | 3. Then, for each record in the chunk, it:
 93 |    - Creates a `metadata` object that includes the specified metadata fields from the record.
 94 |    - Creates a new `Document` with the `pageContent` from the specified field in the record, and the `metadata` object.
 95 | 4. It then yields an array of the created `Document` objects for the chunk.
 96 | 
 97 | The `yield` keyword is used here to produce a value from the generator function. This allows the function to produce a sequence of values over time, rather than computing them all at once and returning them in a single array.
 98 | 
 99 | Next we'll create a function that will generate the embeddings and upsert them into Pinecone. We'll use the `processInChunks` generator function to process the data in chunks. We'll also use the `chunkedUpsert` method to insert the embeddings into Pinecone in batches.
100 | 
101 | ```typescript
102 | async function embedAndUpsert(dataFrame: dfd.DataFrame, chunkSize: number) {
103 |   const chunkGenerator = processInChunks<
104 |     ArticleRecord,
105 |     "section" | "url" | "title" | "publication" | "author" | "article",
106 |     "article"
107 |   >(
108 |     dataFrame,
109 |     100,
110 |     ["section", "url", "title", "publication", "author", "article"],
111 |     "article"
112 |   );
113 |   const index = pinecone.index(indexName);
114 | 
115 |   for await (const documents of chunkGenerator) {
116 |     await embedder.embedBatch(
117 |       documents,
118 |       chunkSize,
119 |       async (embeddings: PineconeRecord[]) => {
120 |         await chunkedUpsert(index, embeddings, "default");
121 |         progressBar.increment(embeddings.length);
122 |       }
123 |     );
124 |   }
125 | }
126 | ```
127 | 
128 | We'll use the `splitFile` utility function to split the CSV file we downloaded into chunks of 100k parts each. For the purposes of this example, we'll only use the first 100k records.
129 | 
130 | ```typescript
131 | const fileParts = await splitFile("./data/all-the-news-2-1.csv", 100000);
132 | const firstFile = fileParts[0];
133 | ```
134 | 
135 | Next, we'll load the data into a DataFrame using `loadCSVFile` and to simplify things, we'll also drop all rows which include a null value.
136 | 
137 | ```typescript
138 | const data = await loadCSVFile(firstFile);
139 | const clean = data.dropNa() as dfd.DataFrame;
140 | ```
141 | 
142 | Now we'll create the Pinecone index and kick off the embedding and upserting process.
143 | 
144 | ```typescript
145 | // Create the index if it doesn't already exist
146 | const indexList = await pinecone.listIndexes();
147 | if (!indexList.indexes?.some((index) => index.name === indexName)) {
148 |   await pinecone.createIndex({
149 |     name: indexName,
150 |     dimension: 384,
151 |     spec: { serverless: { region: indexRegion, cloud: indexCloud } },
152 |     waitUntilReady: true,
153 |   });
154 | }
155 | 
156 | progressBar.start(clean.shape[0], 0);
157 | await embedder.init("Xenova/all-MiniLM-L6-v2");
158 | await embedAndUpsert(clean, 1);
159 | progressBar.stop();
160 | ```
161 | 
162 | ## Query the Pinecone Index
163 | 
164 | We will query the index for the an imagined user. We'll simulate a set of the articles that the user previously read. Based on the article embeddings, we will define a unique embedding for the user.
165 | 
166 | ```typescript
167 | const indexName = getEnv("PINECONE_INDEX");
168 | const pinecone = new Pinecone();
169 | 
170 | // Ensure the index exists
171 | try {
172 |   const description = await pinecone.describeIndex(indexName);
173 |   if (!description.status?.ready) {
174 |     throw new Error(
175 |       `Index not ready, description was ${JSON.stringify(description)}`
176 |     );
177 |   }
178 | } catch (e) {
179 |   console.log(
180 |     'An error occurred. Run "npm run index" to load data into the index before querying.'
181 |   );
182 |   throw e;
183 | }
184 | 
185 | const index = pinecone.index<ArticleRecord>(indexName).namespace("default");
186 | 
187 | await embedder.init("Xenova/all-MiniLM-L6-v2");
188 | 
189 | const { query, section } = getQueryingCommandLineArguments();
190 | 
191 | // We create a simulated user with an interest given a query and a specific section
192 | const queryEmbedding = await embedder.embed(query);
193 | const queryResult = await index.query({
194 |   vector: queryEmbedding.values,
195 |   includeMetadata: true,
196 |   includeValues: true,
197 |   filter: {
198 |     section: { $eq: section },
199 |   },
200 |   topK: 10,
201 | });
202 | ```
203 | 
204 | We'll calculate the **mean** vector given the results of the query. The mean vector represents the user's interests based on the articles they've read.
205 | 
206 | ```typescript
207 | // We extract the vectors of the results
208 | const userVectors = queryResult?.matches?.map(
209 |   (result: ScoredPineconeRecord<ArticleRecord>) => result.values
210 | );
211 | 
212 | // A couple of functions to calculate mean vector
213 | const mean = (arr: number[]): number =>
214 |   arr.reduce((a, b) => a + b, 0) / arr.length;
215 | const meanVector = (vectors: number[][]): number[] => {
216 |   const { length } = vectors[0];
217 | 
218 |   return Array.from({ length }).map((_, i) =>
219 |     mean(vectors.map((vec) => vec[i]))
220 |   );
221 | };
222 | 
223 | // We calculate the mean vector of the results
224 | // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
225 | const meanVec = meanVector(userVectors!);
226 | ```
227 | 
228 | To resolve the recommendations, we'll query the index with the mean vector and filter out the articles that the user has already read.
229 | 
230 | ```typescript
231 | const recommendations = await index.query({
232 |   vector: meanVec,
233 |   includeMetadata: true,
234 |   includeValues: true,
235 |   topK: 10,
236 | });
237 | ```
238 | 
239 | Finally, we'll use `console-table-printer` to print out the recommendations.
240 | 
241 | ```typescript
242 | const userPreferences = new Table({
243 |   columns: [
244 |     { name: "title", alignment: "left" },
245 |     { name: "author", alignment: "left" },
246 |     { name: "section", alignment: "left" },
247 |   ],
248 | });
249 | 
250 | const userRecommendations = new Table({
251 |   columns: [
252 |     { name: "title", alignment: "left" },
253 |     { name: "author", alignment: "left" },
254 |     { name: "section", alignment: "left" },
255 |   ],
256 | });
257 | 
258 | queryResult?.matches?.slice(0, 10).forEach((result: any) => {
259 |   const { title, article, publication, section } = result.metadata;
260 |   userPreferences.addRow({
261 |     title,
262 |     article: `${article.slice(0, 70)}...`,
263 |     publication,
264 |     section,
265 |   });
266 | });
267 | 
268 | console.log("========== User Preferences ==========");
269 | userPreferences.printTable();
270 | 
271 | recommendations?.matches?.slice(0, 10).forEach((result: any) => {
272 |   const { title, article, publication, section } = result.metadata;
273 |   userRecommendations.addRow({
274 |     title,
275 |     article: `${article.slice(0, 70)}...`,
276 |     publication,
277 |     section,
278 |   });
279 | });
280 | console.log("=========== Recommendations ==========");
281 | userRecommendations.printTable();
282 | ```
283 | 
284 | ### Query Sports user
285 | 
286 | To get the result for a simulated user with an interest in Sports and specifically Tennis, we'll run:
287 | 
288 | ```bash
289 | npm run recommend -- --query="tennis" --section="Sports"
290 | ```
291 | 
292 | | Index | Title                                                                                                  | Article                                                                   | Section | Publication |
293 | | :---- | :----------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------ | :------ | :---------- |
294 | | 0     | "Just Ask Anybody": Searching for Match-Fixing in Minor League Tennis                                  | Uladzimir Ignatik, a six-foot-tall 25-year-old who, as a junior, was t... | Sports  | Vice        |
295 | | 1     | MaliVai Washington on Men's Tennis Today and His Historic Wimbledon Run 20 Years Ago                   | Welcome to VICE Sports Q&A, where we talk to authors, directors, and o... | Sports  | Vice        |
296 | | 2     | Venus, Serena, and the Power of Believing                                                              | On Saturday, another chapter in the best story in the history of sport... | Sports  | Vice        |
297 | | 3     | John McEnroe Says Serena is "Best Female Player Ever," Reheats Dumb Debate                             | Here we ago again. Almost two decades after saying that any male colle... | Sports  | Vice        |
298 | | 4     | Tennis Legend Margaret Court Went Off the Rails in Anti-LGBTQ Tirade                                   | Margaret Court, the most decorated tennis player in history, has had a... | Sports  | Vice        |
299 | | 5     | Here is Wimbledon Darling Marcus Willis with a Mid-Match Soda and Candy Bar Back in His "Tubster" Days | Despite being currently ranked 772nd in the world, Englishman Marcus W... | Sports  | Vice        |
300 | | 6     | This Backhand by Kiki Mladenovic Is So Good, It Just Doesn't Make Sense                                | French No. 1 Kiki Mladenovic is having a helluva week. Yesterday, she ... | Sports  | Vice        |
301 | | 7     | “A Rebel From The Wrong Side of the Tennis Tramlines” – The Legacy of Fred Perry                       | This article originally appeared on VICE Sports UK. When spectators en... | Sports  | Vice        |
302 | | 8     | This Phoenix Suns Fast Break Is Better Than Sex                                                        | People sure like to call sports a "dance." There even was a long New Y... | Sports  | Vice        |
303 | | 9     | Human Slimeball Chris Christie Oozes Creepily About Women's Tennis                                     | Former Republican presidential candidate Chris Christie is pretty good... | Sports  | Vice        |
304 | 
305 | And here are the recommended articles for this user:
306 | 
307 | | Index | Title                                                                                | Article                                                                   | Section     | Publication |
308 | | :---- | :----------------------------------------------------------------------------------- | :------------------------------------------------------------------------ | :---------- | :---------- |
309 | | 0     | MaliVai Washington on Men's Tennis Today and His Historic Wimbledon Run 20 Years Ago | Welcome to VICE Sports Q&A, where we talk to authors, directors, and o... | Sports      | Vice        |
310 | | 1     | Wimbledon crowd go loco for Coco as dream continues                                  | LONDON (Reuters) - Did she win? Did she win?... Men, women, girls and...  | Sports News | Reuters     |
311 | | 2     | "Just Ask Anybody": Searching for Match-Fixing in Minor League Tennis                | Uladzimir Ignatik, a six-foot-tall 25-year-old who, as a junior, was t... | Sports      | Vice        |
312 | | 3     | Williams' U.S. Open treatment divides tennis world                                   | NEW YORK (Reuters) - Serena Williams’ behavior in Saturday’s U.S. Open... | Sports News | Reuters     |
313 | | 4     | John McEnroe Says Serena is "Best Female Player Ever," Reheats Dumb Debate           | Here we ago again. Almost two decades after saying that any male colle... | Sports      | Vice        |
314 | | 5     | Venus, Serena, and the Power of Believing                                            | On Saturday, another chapter in the best story in the history of sport... | Sports      | Vice        |
315 | | 6     | Serena survives Fourth of July test by Slovenian student                             | LONDON (Reuters) - For the second time in four days a Wimbledon champi... | Sports News | Reuters     |
316 | | 7     | Nadal rebuffs interrogator who dared to question Centre Court status                 | LONDON (Reuters) - It was a case of “Don’t you know who I am?” for Raf... | Sports News | Reuters     |
317 | | 8     | School girl Gauff turfs idol Venus out of Wimbledon                                  | LONDON (Reuters) - Cori Gauff served up the perfect excuse for playing... | Sports News | Reuters     |
318 | | 9     | As women wilt at Wimbledon, men's Big Three march on                                 | LONDON (Reuters) - As the Wimbledon women’s quarter-finals take place ... | Sports News | Reuters     |
319 | 
320 | ### Query Games user
321 | 
322 | To get the result for a simulated user with an interest in Games and specifically Xbox, we'll run:
323 | 
324 | ```bash
325 | npm run recommend -- --query="Xbox" --section="Games"
326 | ```
327 | 
328 | As expected, we can see that the recommendations are similar to the user's preferences, with a focus on Tennis.
329 | 
330 | Here are the user's preferences:
331 | 
332 | | Index | Title                                                                              | Article                                                                   | Section | Publication |
333 | | :---- | :--------------------------------------------------------------------------------- | :------------------------------------------------------------------------ | :------ | :---------- |
334 | | 0     | Everything We Learned About the Next Gen Xbox at Microsoft's Press Conference      | Microsoft revealed the next Xbox on Sunday during its E3 press confer...  | Games   | Vice        |
335 | | 1     | Sony Is Finally Adding PlayStation 4 External Hard Drive Support                   | Above: An upright, slim-model PlayStation 4. Photography courtesy of ...  | Games   | Vice        |
336 | | 2     | A Title Card vs Six Teraflops: How Metroid Stole Microsoft’s Thunder               | As I've said already, E3 shouldn't be about which company "won" and w...  | Games   | Vice        |
337 | | 3     | A Canadian Man Is Pissed That His Son Ran Up an $8,800 Xbox Bill in FIFA Purchases | A Pembroke, Ontario, gun shop owner is "mad as a hatter" over his son'... | Games   | Vice        |
338 | | 4     | 'Phantom Dust' Was Ambitious, Beautiful, And Messy                                 | Guide to Games is Waypoint's weekly short video series diving into a ...  | Games   | Vice        |
339 | | 5     | 'Forza Horizon 4' Is a Living Impressionist Landscape                              | On Waypoint Radio 193, Austin, Danielle, Rob, and Natalie discuss Rob'... | Games   | Vice        |
340 | | 6     | An Overdue Apology to the Outgoing PlayStation Vita                                | I'm writing to you from New York. Just down the road from my hotel, in... | Games   | Vice        |
341 | | 7     | The Switch and I: Early Adopters of Nintendo’s Console on its Joys and Cons        | It's easy enough for people who cover games for a living to dig into ...  | Games   | Vice        |
342 | | 8     | The Nintendo 3DS Is the Greatest Handheld Console of All Time                      | Illustration by Stephen Maurice Graham Five years and the best part of... | Games   | Vice        |
343 | | 9     | How the Game Genie Helped Me Understand My Brain Injury                            | Illustration by Stephen Maurice Graham There's a wonderful point in ch... | Games   | Vice        |
344 | 
345 | And here are the recommended articles for this user:
346 | 
347 | | Index | Title                                                                                   | Article                                                                   | Section      | Publication |
348 | | :---- | :-------------------------------------------------------------------------------------- | :------------------------------------------------------------------------ | :----------- | :---------- |
349 | | 0     | An Overdue Apology to the Outgoing PlayStation Vita                                     | I'm writing to you from New York. Just down the road from my hotel, in... | Games        | Vice        |
350 | | 1     | The Nintendo 3DS Is the Greatest Handheld Console of All Time                           | Illustration by Stephen Maurice Graham Five years and the best part of... | Games        | Vice        |
351 | | 2     | Sony Is Finally Adding PlayStation 4 External Hard Drive Support                        | Above: An upright, slim-model PlayStation 4. Photography courtesy of ...  | Games        | Vice        |
352 | | 3     | The Switch and I: Early Adopters of Nintendo’s Console on its Joys and Cons             | It's easy enough for people who cover games for a living to dig into ...  | Games        | Vice        |
353 | | 4     | Sony Is Putting the PlayStation's Back Catalog to Work for PC Gamers                    | Sony announced this week that PlayStation Now, the game-streaming subs... | Tech by VICE | Vice        |
354 | | 5     | A Title Card vs Six Teraflops: How Metroid Stole Microsoft’s Thunder                    | As I've said already, E3 shouldn't be about which company "won" and w...  | Games        | Vice        |
355 | | 6     | Like 'Breath of the Wild,' Switch's Success is Tied to Taking Big Risks                 | People have been predicting Nintendo's doom for longer than I can reme... | Games        | Vice        |
356 | | 7     | UPDATE 2-Microsoft unveils next-gen "Project Scarlett" Xbox console for release in 2020 | (Adds quote from director of Xbox platform marketing) By Arjun Pancha...  | Company News | Reuters     |
357 | | 8     | There’s Only One Reason to Own Nintendo’s New 2DS XL                                    | I already called the 3DS the greatest handheld console of all time. A...  | Games        | Vice        |
358 | | 9     | 'Battlefield' Developer's Unreleased Sega Mega Drive Game Is Coming Out After 25 Years  | The history of video games is filled with unreleased games: Prey 2, ...   | Tech by VICE | Vice        |
359 | 
360 | ### Query Business user
361 | 
362 | To get the result for a simulated user with an interest in Business and specifically Wall Street, we'll run:
363 | 
364 | ```bash
365 | npm run recommend -- --query="Wall Street" --section="Business News"
366 | ```
367 | 
368 | Here are the user's preferences:
369 | 
370 | | Index | Title                                                                                    | Article                                                                   | Section       | Publication |
371 | | :---- | :--------------------------------------------------------------------------------------- | :------------------------------------------------------------------------ | :------------ | :---------- |
372 | | 0     | Hexagon CEO Rollen found not guilty of insider trading in appeals case                   | OSLO (Reuters) - The chief executive of Swedish industrial technology ... | Business News | Reuters     |
373 | | 1     | Hexagon CEO Rollen found not guilty of insider trading in appeals case                   | OSLO (Reuters) - The chief executive of Swedish industrial technology ... | Business News | Reuters     |
374 | | 2     | Snapchat launches redesign as growth disappoints Wall Street                             | (Reuters) - Snap Inc (SNAP.N) is redesigning its disappearing-message ... | Business News | Reuters     |
375 | | 3     | 'Don't ask my age': Ageing South Koreans begin a new chapter on the catwalk, YouTube     | SEOUL (Reuters) - Boasting an overgrown beard and grey wavy hair, 65-y... | Business News | Reuters     |
376 | | 4     | Twitter warns fake account purge to keep erasing users, shares drop 19 percent           | NEW YORK (Reuters) - Twitter Inc (TWTR.N) on Friday said it lost 1 mil... | Business News | Reuters     |
377 | | 5     | Reckitt picks PepsiCo executive as CEO, going outside for first time                     | (Reuters) - Consumer goods group Reckitt Benckiser has picked PepsiCo ... | Business News | Reuters     |
378 | | 6     | Loyalists unhappy as Coach becomes Tapestry Inc                                          | (Reuters) - Iconic luxury handbag maker Coach Inc risked Wall Street a... | Business News | Reuters     |
379 | | 7     | Cufflinks and the Caribbean: How Virgin Galactic kept space tourists' interest and money | COLORADO SPRINGS, Colo. (Reuters) - Virgin Galactic’s goal to fly tour... | Business News | Reuters     |
380 | | 8     | Exclusive: Amazon scraps bundled video service - sources                                 | NEW YORK/LOS ANGELES (Reuters) - Amazon.com Inc (AMZN.O) has scrapped ... | Business News | Reuters     |
381 | | 9     | With new CEO, Telecom Italia 'opera' edges towards finale                                | MILAN (Reuters) - Barely a year into the job as the boss of Italy’s bi... | Business News | Reuters     |
382 | 
383 | And here are the recommended articles for this user:
384 | 
385 | | Index | Title                                                                 | Article                                                                   | Section       | Publication |
386 | | :---- | :-------------------------------------------------------------------- | :------------------------------------------------------------------------ | :------------ | :---------- |
387 | | 0     | Snapchat launches redesign as growth disappoints Wall Street          | (Reuters) - Snap Inc (SNAP.N) is redesigning its disappearing-message ... | Business News | Reuters     |
388 | | 1     | With new CEO, Telecom Italia 'opera' edges towards finale             | MILAN (Reuters) - Barely a year into the job as the boss of Italy’s bi... | Business News | Reuters     |
389 | | 2     | Online lender SoFi nabs Twitter executive Noto as CEO                 | (Reuters) - Wall Street banker turned Silicon Valley executive Anthony... | Fintech       | Reuters     |
390 | | 3     | End of Sorrell's reign heralds change for big ad empires              | LONDON (Reuters) - Martin Sorrell’s departure from the world’s biggest... | Business News | Reuters     |
391 | | 4     | Loeb's Third Point takes new approach in battle with Nestle           | BOSTON (Reuters) - For a year, billionaire hedge fund manager Daniel L... | Wealth        | Reuters     |
392 | | 5     | Greenlight comment adds to Tesla losses from Musk mocking SEC         | (Reuters) - Shares of Tesla Inc (TSLA.O) fell 7 percent on Friday, as ... | Business News | Reuters     |
393 | | 6     | Out of Sorrell's shadow, Mark Read poised for top job at ad giant WPP | (Note strong language in final paragraph) By Kate Holton LONDON (Re...    | Business News | Reuters     |
394 | | 7     | Gucci parent Kering moves to tighten grip on e-commerce               | PARIS (Reuters) - Kering, owner of brands including Gucci, will tighte... | Business News | Reuters     |
395 | | 8     | Gucci parent Kering moves to tighten grip on e-commerce               | PARIS (Reuters) - Kering, owner of brands including Gucci, will tighte... | Business News | Reuters     |
396 | | 9     | China's Tencent takes 12 percent stake in Snap as shares plunge       | (Reuters) - Snap Inc said on Wednesday that Chinese tech and media inv... | Business News | Reuters     |
397 | 
398 | ## Query Results
399 | 
400 | We can see that each user's recommendations have a high similarity to what the user actually reads. A user who likes Tennis news has plenty of Tennis news recommendations. A user who likes to read about Xbox has that kind of news. And a business user has plenty of Wall Street news that they enjoy.
401 | 
402 | Since we used only the title and the content of the article to define the embeddings, and we did not take publications and sections into account, a user may get recommendations from a publication/section that they does not regularly read. You may try adding this information when creating embeddings as well and check your query results then!
403 | 
404 | Also, you may notice that some articles appear in the recommendations, although the user has already read them. These articles could be removed as part of postprocessing the query results, in case you prefer not to see them in the recommendations.
405 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "langchain-ts-starter",
 3 |   "version": "1.0.0",
 4 |   "description": "",
 5 |   "type": "module",
 6 |   "main": "./dist/index.js",
 7 |   "files": [
 8 |     "dist"
 9 |   ],
10 |   "scripts": {
11 |     "tsup": "tsup",
12 |     "build": "tsup",
13 |     "index": "tsx -r dotenv/config src/index.ts",
14 |     "recommend": "tsx -r dotenv/config src/recommend.ts",
15 |     "lint": "eslint src",
16 |     "lint:fix": "npm run lint --fix",
17 |     "format": "prettier --write \"**/*.ts\"",
18 |     "format:check": "prettier --list-different \"**/*.ts\""
19 |   },
20 |   "keywords": [
21 |     "langchain",
22 |     "starter",
23 |     "template",
24 |     "node",
25 |     "typescript",
26 |     "llm"
27 |   ],
28 |   "author": "",
29 |   "license": "MIT",
30 |   "dependencies": {
31 |     "@pinecone-database/pinecone": "^2.0.0",
32 |     "@xenova/transformers": "^2.2.0",
33 |     "chalk": "^5.2.0",
34 |     "cli-progress": "^3.12.0",
35 |     "console-table-printer": "^2.11.1",
36 |     "cross-fetch": "^3.1.6",
37 |     "danfojs-node": "^1.1.2",
38 |     "dotenv": "^16.1.3",
39 |     "json2csv": "^6.0.0-alpha.2",
40 |     "langchain": "^0.0.85",
41 |     "onnxruntime-node": "^1.15.0",
42 |     "tablemark": "^3.0.0",
43 |     "tsup": "^6.7.0",
44 |     "yargs": "^17.7.2"
45 |   },
46 |   "devDependencies": {
47 |     "@tsconfig/recommended": "^1.0.2",
48 |     "@types/cli-progress": "^3.11.0",
49 |     "@types/js-yaml": "^4",
50 |     "@types/json2csv": "^5.0.3",
51 |     "@types/node": "^18.15.5",
52 |     "@types/yargs": "^17.0.24",
53 |     "@typescript-eslint/eslint-plugin": "^5.51.0",
54 |     "@typescript-eslint/parser": "^5.51.0",
55 |     "eslint": "^8.33.0",
56 |     "eslint-config-airbnb-base": "^15.0.0",
57 |     "eslint-config-prettier": "^8.6.0",
58 |     "eslint-plugin-import": "^2.27.5",
59 |     "eslint-plugin-prettier": "^4.2.1",
60 |     "prettier": "^2.8.3",
61 |     "tsx": "^3.12.3",
62 |     "typescript": "^5.0.4"
63 |   }
64 | }
65 | 


--------------------------------------------------------------------------------
/src/embeddings.ts:
--------------------------------------------------------------------------------
  1 | import { randomUUID } from "crypto";
  2 | import { Pipeline, pipeline, AutoConfig } from "@xenova/transformers";
  3 | import type {
  4 |   PineconeRecord,
  5 |   RecordMetadata,
  6 | } from "@pinecone-database/pinecone";
  7 | import type { Document } from "langchain/document";
  8 | import { EmbeddingsParams, Embeddings } from "langchain/embeddings/base";
  9 | import { sliceIntoChunks } from "./utils/util.js";
 10 | 
 11 | type DocumentOrString = Document | string;
 12 | 
 13 | // eslint-disable-next-line @typescript-eslint/no-explicit-any
 14 | function isString(test: any): test is string {
 15 |   return typeof test === "string";
 16 | }
 17 | 
 18 | class Embedder {
 19 |   private pipe: Pipeline;
 20 | 
 21 |   async init(modelName: string) {
 22 |     const config = await AutoConfig.from_pretrained(modelName);
 23 |     this.pipe = await pipeline("embeddings", modelName, {
 24 |       quantized: false,
 25 |       config,
 26 |     });
 27 |   }
 28 | 
 29 |   // Embeds a text and returns the embedding
 30 |   async embed(
 31 |     text: string,
 32 |     metadata?: RecordMetadata
 33 |   ): Promise<PineconeRecord> {
 34 |     try {
 35 |       const result = await this.pipe(text, {
 36 |         pooling: "mean",
 37 |         normalize: true,
 38 |       });
 39 |       const id = (metadata?.id as string) || randomUUID();
 40 |       return {
 41 |         id,
 42 |         metadata: metadata || {
 43 |           text,
 44 |         },
 45 |         values: Array.from(result.data) as number[],
 46 |       };
 47 |     } catch (e) {
 48 |       console.log(`Error embedding text: ${text}, ${e}`);
 49 |       throw e;
 50 |     }
 51 |   }
 52 | 
 53 |   // Embeds a batch of documents and calls onDoneBatch with the embeddings
 54 |   async embedBatch(
 55 |     documents: DocumentOrString[],
 56 |     batchSize: number,
 57 |     onDoneBatch: (embeddings: PineconeRecord[]) => void
 58 |   ) {
 59 |     const batches = sliceIntoChunks<DocumentOrString>(documents, batchSize);
 60 |     for (const batch of batches) {
 61 |       const embeddings = await Promise.all(
 62 |         batch.map((documentOrString) =>
 63 |           isString(documentOrString)
 64 |             ? this.embed(documentOrString)
 65 |             : this.embed(
 66 |                 documentOrString.pageContent,
 67 |                 documentOrString.metadata
 68 |               )
 69 |         )
 70 |       );
 71 |       await onDoneBatch(embeddings);
 72 |     }
 73 |   }
 74 | }
 75 | 
 76 | interface TransformersJSEmbeddingParams extends EmbeddingsParams {
 77 |   modelName: string;
 78 |   onEmbeddingDone?: (embeddings: PineconeRecord[]) => void;
 79 | }
 80 | 
 81 | class TransformersJSEmbedding
 82 |   extends Embeddings
 83 |   implements TransformersJSEmbeddingParams
 84 | {
 85 |   modelName: string;
 86 | 
 87 |   pipe: Pipeline | null = null;
 88 | 
 89 |   constructor(params: TransformersJSEmbeddingParams) {
 90 |     super(params);
 91 |     this.modelName = params.modelName;
 92 |   }
 93 | 
 94 |   async embedDocuments(texts: string[]): Promise<number[][]> {
 95 |     this.pipe = this.pipe || (await pipeline("embeddings", this.modelName));
 96 | 
 97 |     const embeddings = await Promise.all(
 98 |       texts.map(async (text) => this.embedQuery(text))
 99 |     );
100 |     return embeddings;
101 |   }
102 | 
103 |   async embedQuery(text: string): Promise<number[]> {
104 |     this.pipe = this.pipe || (await pipeline("embeddings", this.modelName));
105 | 
106 |     const result = await this.pipe(text);
107 |     return Array.from(result.data) as number[];
108 |   }
109 | }
110 | 
111 | const embedder = new Embedder();
112 | export { embedder, TransformersJSEmbedding };
113 | 


--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
  1 | /* eslint-disable import/no-extraneous-dependencies */
  2 | /* eslint-disable dot-notation */
  3 | import * as dotenv from "dotenv";
  4 | import {
  5 |   Pinecone,
  6 |   type PineconeRecord,
  7 |   type ServerlessSpecCloudEnum,
  8 | } from "@pinecone-database/pinecone";
  9 | import { getEnv, validateEnvironmentVariables } from "utils/util.ts";
 10 | import cliProgress from "cli-progress";
 11 | import { Document } from "langchain/document";
 12 | import * as dfd from "danfojs-node";
 13 | import { embedder } from "embeddings.ts";
 14 | import loadCSVFile from "utils/csvLoader.ts";
 15 | import splitFile from "utils/fileSplitter.ts";
 16 | import type { ArticleRecord } from "types.ts";
 17 | import { chunkedUpsert } from "./utils/chunkedUpsert.ts";
 18 | 
 19 | dotenv.config();
 20 | validateEnvironmentVariables();
 21 | 
 22 | const progressBar = new cliProgress.SingleBar(
 23 |   {},
 24 |   cliProgress.Presets.shades_classic
 25 | );
 26 | 
 27 | // Index setup
 28 | const indexName = getEnv("PINECONE_INDEX");
 29 | const indexCloud = getEnv("PINECONE_CLOUD") as ServerlessSpecCloudEnum;
 30 | const indexRegion = getEnv("PINECONE_REGION");
 31 | const pinecone = new Pinecone();
 32 | 
 33 | async function getChunk(
 34 |   df: dfd.DataFrame,
 35 |   start: number,
 36 |   size: number
 37 | ): Promise<dfd.DataFrame> {
 38 |   // eslint-disable-next-line no-return-await
 39 |   return await df.head(start + size).tail(size);
 40 | }
 41 | 
 42 | async function* processInChunks<T, M extends keyof T, P extends keyof T>(
 43 |   dataFrame: dfd.DataFrame,
 44 |   chunkSize: number,
 45 |   metadataFields: M[],
 46 |   pageContentField: P
 47 | ): AsyncGenerator<Document[]> {
 48 |   for (let i = 0; i < dataFrame.shape[0]; i += chunkSize) {
 49 |     const chunk = await getChunk(dataFrame, i, chunkSize);
 50 |     const records = dfd.toJSON(chunk) as T[];
 51 |     yield records.map((record: T) => {
 52 |       const metadata: Partial<Record<M, T[M]>> = {};
 53 |       for (const field of metadataFields) {
 54 |         metadata[field] = record[field];
 55 |       }
 56 |       return new Document({
 57 |         pageContent: record[pageContentField] as string,
 58 |         metadata,
 59 |       });
 60 |     });
 61 |   }
 62 | }
 63 | 
 64 | async function embedAndUpsert(dataFrame: dfd.DataFrame, chunkSize: number) {
 65 |   const chunkGenerator = processInChunks<
 66 |     ArticleRecord,
 67 |     "section" | "url" | "title" | "publication" | "author" | "article",
 68 |     "article"
 69 |   >(
 70 |     dataFrame,
 71 |     100,
 72 |     ["section", "url", "title", "publication", "author", "article"],
 73 |     "article"
 74 |   );
 75 |   const index = pinecone.index(indexName);
 76 | 
 77 |   for await (const documents of chunkGenerator) {
 78 |     await embedder.embedBatch(
 79 |       documents,
 80 |       chunkSize,
 81 |       async (embeddings: PineconeRecord[]) => {
 82 |         await chunkedUpsert(index, embeddings, "default");
 83 |         progressBar.increment(embeddings.length);
 84 |       }
 85 |     );
 86 |   }
 87 | }
 88 | 
 89 | try {
 90 |   const fileParts = await splitFile("./data/all-the-news-2-1.csv", 100000);
 91 |   const firstFile = fileParts[0];
 92 | 
 93 |   // For this example, we will use the first file part to create the index
 94 |   const data = await loadCSVFile(firstFile);
 95 |   const clean = data.dropNa() as dfd.DataFrame;
 96 |   clean.head().print();
 97 | 
 98 |   // Create the index if it doesn't already exist
 99 |   const indexList = await pinecone.listIndexes();
100 |   if (!indexList.indexes?.some((index) => index.name === indexName)) {
101 |     await pinecone.createIndex({
102 |       name: indexName,
103 |       dimension: 384,
104 |       spec: { serverless: { region: indexRegion, cloud: indexCloud } },
105 |       waitUntilReady: true,
106 |     });
107 |   }
108 | 
109 |   progressBar.start(clean.shape[0], 0);
110 |   await embedder.init("Xenova/all-MiniLM-L6-v2");
111 |   await embedAndUpsert(clean, 1);
112 |   progressBar.stop();
113 |   console.log(
114 |     `Inserted ${progressBar.getTotal()} documents into index ${indexName}`
115 |   );
116 | } catch (error) {
117 |   console.error(error);
118 | }
119 | 


--------------------------------------------------------------------------------
/src/recommend.ts:
--------------------------------------------------------------------------------
  1 | /* eslint-disable import/no-extraneous-dependencies */
  2 | import {
  3 |   getEnv,
  4 |   getQueryingCommandLineArguments,
  5 |   validateEnvironmentVariables,
  6 | } from "utils/util.ts";
  7 | import { embedder } from "embeddings.ts";
  8 | import { Table } from "console-table-printer";
  9 | import { Pinecone } from "@pinecone-database/pinecone";
 10 | import type { ScoredPineconeRecord } from "@pinecone-database/pinecone";
 11 | import type { ArticleRecord } from "types.ts";
 12 | 
 13 | validateEnvironmentVariables();
 14 | 
 15 | const indexName = getEnv("PINECONE_INDEX");
 16 | const pinecone = new Pinecone();
 17 | 
 18 | // Ensure the index exists
 19 | try {
 20 |   const description = await pinecone.describeIndex(indexName);
 21 |   if (!description.status?.ready) {
 22 |     throw new Error(
 23 |       `Index not ready, description was ${JSON.stringify(description)}`
 24 |     );
 25 |   }
 26 | } catch (e) {
 27 |   console.log(
 28 |     'An error occurred. Run "npm run index" to load data into the index before querying.'
 29 |   );
 30 |   throw e;
 31 | }
 32 | 
 33 | const index = pinecone.index<ArticleRecord>(indexName).namespace("default");
 34 | 
 35 | await embedder.init("Xenova/all-MiniLM-L6-v2");
 36 | 
 37 | const { query, section } = getQueryingCommandLineArguments();
 38 | 
 39 | // We create a simulated user with an interest given a query and a specific section
 40 | const queryEmbedding = await embedder.embed(query);
 41 | const queryResult = await index.query({
 42 |   vector: queryEmbedding.values,
 43 |   includeMetadata: true,
 44 |   includeValues: true,
 45 |   filter: {
 46 |     section: { $eq: section },
 47 |   },
 48 |   topK: 10,
 49 | });
 50 | 
 51 | // We extract the vectors of the results
 52 | const userVectors = queryResult?.matches?.map(
 53 |   (result: ScoredPineconeRecord<ArticleRecord>) => result.values
 54 | );
 55 | 
 56 | // A couple of functions to calculate mean vector
 57 | const mean = (arr: number[]): number =>
 58 |   arr.reduce((a, b) => a + b, 0) / arr.length;
 59 | const meanVector = (vectors: number[][]): number[] => {
 60 |   const { length } = vectors[0];
 61 | 
 62 |   return Array.from({ length }).map((_, i) =>
 63 |     mean(vectors.map((vec) => vec[i]))
 64 |   );
 65 | };
 66 | 
 67 | // We calculate the mean vector of the results
 68 | // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
 69 | const meanVec = meanVector(userVectors!);
 70 | 
 71 | // We query the index with the mean vector to get recommendations for the user
 72 | const recommendations = await index.query({
 73 |   vector: meanVec,
 74 |   includeMetadata: true,
 75 |   includeValues: true,
 76 |   topK: 10,
 77 | });
 78 | 
 79 | const userPreferences = new Table({
 80 |   columns: [
 81 |     { name: "title", alignment: "left" },
 82 |     { name: "author", alignment: "left" },
 83 |     { name: "section", alignment: "left" },
 84 |   ],
 85 | });
 86 | 
 87 | const userRecommendations = new Table({
 88 |   columns: [
 89 |     { name: "title", alignment: "left" },
 90 |     { name: "author", alignment: "left" },
 91 |     { name: "section", alignment: "left" },
 92 |   ],
 93 | });
 94 | 
 95 | queryResult?.matches?.slice(0, 10).forEach((result: any) => {
 96 |   const { title, article, publication, section } = result.metadata;
 97 |   userPreferences.addRow({
 98 |     title,
 99 |     article: `${article.slice(0, 70)}...`,
100 |     publication,
101 |     section,
102 |   });
103 | });
104 | 
105 | console.log("========== User Preferences ==========");
106 | userPreferences.printTable();
107 | 
108 | recommendations?.matches?.slice(0, 10).forEach((result: any) => {
109 |   const { title, article, publication, section } = result.metadata;
110 |   userRecommendations.addRow({
111 |     title,
112 |     article: `${article.slice(0, 70)}...`,
113 |     publication,
114 |     section,
115 |   });
116 | });
117 | console.log("=========== Recommendations ==========");
118 | userRecommendations.printTable();
119 | 


--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
 1 | export type ArticleRecord = {
 2 |   index: number;
 3 |   title: string;
 4 |   article: string;
 5 |   publication: string;
 6 |   url: string;
 7 |   author: string;
 8 |   section: string;
 9 | };
10 | 


--------------------------------------------------------------------------------
/src/utils/chunkedUpsert.ts:
--------------------------------------------------------------------------------
 1 | import type { Index, PineconeRecord } from "@pinecone-database/pinecone";
 2 | import { sliceIntoChunks } from "./util.ts";
 3 | 
 4 | export const chunkedUpsert = async (
 5 |   index: Index,
 6 |   vectors: Array<PineconeRecord>,
 7 |   namespace: string,
 8 |   chunkSize = 10
 9 | ) => {
10 |   // Split the vectors into chunks
11 |   const chunks = sliceIntoChunks<PineconeRecord>(vectors, chunkSize);
12 | 
13 |   try {
14 |     // Upsert each chunk of vectors into the index
15 |     await Promise.allSettled(
16 |       chunks.map(async (chunk) => {
17 |         try {
18 |           await index.namespace(namespace).upsert(chunk);
19 |         } catch (e) {
20 |           console.log("Error upserting chunk", e);
21 |         }
22 |       })
23 |     );
24 | 
25 |     return true;
26 |   } catch (e) {
27 |     throw new Error(`Error upserting vectors into index: ${e}`);
28 |   }
29 | };
30 | 


--------------------------------------------------------------------------------
/src/utils/csvLoader.ts:
--------------------------------------------------------------------------------
 1 | import fs from "fs/promises";
 2 | import * as dfd from "danfojs-node";
 3 | 
 4 | async function loadCSVFile(filePath: string): Promise<dfd.DataFrame> {
 5 |   try {
 6 |     // Get csv file absolute path
 7 |     const csvAbsolutePath = await fs.realpath(filePath);
 8 |     const df: dfd.DataFrame = (await dfd.readCSV(
 9 |       csvAbsolutePath
10 |     )) as dfd.DataFrame;
11 | 
12 |     // Create a readable stream from the CSV file
13 |     return df;
14 |   } catch (err) {
15 |     console.error(err);
16 |     throw err;
17 |   }
18 | }
19 | 
20 | export default loadCSVFile;
21 | 


--------------------------------------------------------------------------------
/src/utils/dataLoader.ts:
--------------------------------------------------------------------------------
 1 | /* eslint-disable import/no-extraneous-dependencies */
 2 | import * as dfd from "danfojs-node";
 3 | import { Parser, transforms } from "json2csv";
 4 | import fs from "fs";
 5 | import fetch from "cross-fetch";
 6 | 
 7 | const { unwind, flatten } = transforms;
 8 | 
 9 | const jsonToCSV = async (
10 |   url: string,
11 |   fields: string[],
12 |   unwindFieldsPaths: string[]
13 | ): Promise<string> => {
14 |   const response = await fetch(url);
15 |   const { data } = await response.json();
16 | 
17 |   const topLevelData = data;
18 |   const transforms = [
19 |     unwind({ paths: [...unwindFieldsPaths] }),
20 |     flatten({ objects: true, arrays: true }),
21 |   ];
22 | 
23 |   const json2csvParser = new Parser({ fields, transforms });
24 |   const csv = json2csvParser.parse(topLevelData);
25 | 
26 |   return csv;
27 | };
28 | 
29 | const dataFrameFromURL = async (
30 |   url: string,
31 |   fields: string[],
32 |   unwindFieldsPaths: string[]
33 | ): Promise<dfd.DataFrame> => {
34 |   const csv = await jsonToCSV(url, fields, unwindFieldsPaths);
35 |   // generate random file name
36 |   const name = Math.random().toString(36).substring(7);
37 | 
38 |   const filePath = `./${name}.csv`;
39 |   try {
40 |     fs.writeFile(filePath, csv, (err) => {
41 |       if (err) throw err;
42 |     });
43 |   } catch (err) {
44 |     console.log(err);
45 |   }
46 | 
47 |   const df: dfd.DataFrame = (await dfd.readCSV(filePath)) as dfd.DataFrame;
48 | 
49 |   // delete the file in try catch, asynchronously
50 |   try {
51 |     fs.unlinkSync(filePath);
52 |   } catch (err) {
53 |     console.error(err);
54 |   }
55 |   return df;
56 | };
57 | 
58 | const dropDuplicates = (
59 |   df: dfd.DataFrame,
60 |   columnName: string
61 | ): dfd.DataFrame => {
62 |   // Get the column as a series
63 |   const series = df[columnName];
64 | 
65 |   // Drop duplicates from the series
66 |   const cleanedSeries = series.dropDuplicates();
67 | 
68 |   // Create a new array filled with NaN for the length of the original DataFrame
69 |   const filledValues = new Array(df.shape[0]).fill(NaN);
70 | 
71 |   // Replace the beginning of the filledValues array with the cleaned values
72 |   // eslint-disable-next-line no-plusplus
73 |   for (let i = 0; i < cleanedSeries.values.length; i++) {
74 |     filledValues[i] = cleanedSeries.values[i];
75 |   }
76 | 
77 |   // Create a new dataframe with the filled series
78 |   const newDfData: Record<string, (string | number | boolean)[]> = {};
79 |   for (const colName of df.columns) {
80 |     if (colName === columnName) {
81 |       newDfData[colName] = filledValues;
82 |     } else {
83 |       // For all the other columns, just copy the data over
84 |       newDfData[colName] = df[colName].values;
85 |     }
86 |   }
87 | 
88 |   // Create a new DataFrame
89 |   const newDf = new dfd.DataFrame(newDfData);
90 | 
91 |   // Drop rows containing NaN values
92 |   newDf.dropNa({ axis: 1, inplace: true });
93 | 
94 |   return newDf;
95 | };
96 | 
97 | export { dataFrameFromURL, dropDuplicates };
98 | 


--------------------------------------------------------------------------------
/src/utils/fileSplitter.ts:
--------------------------------------------------------------------------------
 1 | /* eslint-disable no-plusplus */
 2 | import fs from "fs";
 3 | import readline from "readline";
 4 | 
 5 | async function splitFile(
 6 |   filePath: string,
 7 |   numLinesPerPart: number
 8 | ): Promise<string[]> {
 9 |   const fileStream = fs.createReadStream(filePath);
10 |   const rl = readline.createInterface({
11 |     input: fileStream,
12 |     crlfDelay: Infinity,
13 |   });
14 | 
15 |   let partIndex = 1;
16 |   let lineIndex = 0;
17 |   let writeStream = fs.createWriteStream(`${filePath}.${partIndex}`);
18 |   const createdFiles: string[] = [`${filePath}.${partIndex}`];
19 | 
20 |   for await (const line of rl) {
21 |     if (lineIndex === numLinesPerPart) {
22 |       writeStream.end();
23 |       lineIndex = 0;
24 |       partIndex++;
25 |       writeStream = fs.createWriteStream(`${filePath}.${partIndex}`);
26 |       createdFiles.push(`${filePath}.${partIndex}`);
27 |     }
28 |     writeStream.write(`${line}\n`);
29 |     lineIndex++;
30 |   }
31 | 
32 |   if (!writeStream.closed) {
33 |     writeStream.end();
34 |   }
35 | 
36 |   return createdFiles;
37 | }
38 | 
39 | export default splitFile;
40 | 


--------------------------------------------------------------------------------
/src/utils/util.ts:
--------------------------------------------------------------------------------
 1 | import yargs from "yargs";
 2 | import { hideBin } from "yargs/helpers";
 3 | 
 4 | const sliceIntoChunks = <T>(arr: T[], chunkSize: number) =>
 5 |   Array.from({ length: Math.ceil(arr.length / chunkSize) }, (_, i) =>
 6 |     arr.slice(i * chunkSize, (i + 1) * chunkSize)
 7 |   );
 8 | 
 9 | const getQueryingCommandLineArguments = () => {
10 |   const argv = yargs(hideBin(process.argv))
11 |     .option("query", {
12 |       alias: "q",
13 |       type: "string",
14 |       description: "The query to search for",
15 |       demandOption: true,
16 |     })
17 |     .option("section", {
18 |       alias: "s",
19 |       type: "string",
20 |       description: "The section of the article",
21 |       demandOption: true,
22 |     })
23 | 
24 |     .parseSync();
25 | 
26 |   const { query, section } = argv;
27 |   if (!query) {
28 |     console.error("Please provide a query");
29 |     process.exit(1);
30 |   }
31 | 
32 |   return { query, section };
33 | };
34 | 
35 | export const getEnv = (key: string): string => {
36 |   const value = process.env[key];
37 |   if (!value) {
38 |     throw new Error(`${key} environment variable not set`);
39 |   }
40 |   return value;
41 | };
42 | 
43 | const validateEnvironmentVariables = () => {
44 |   getEnv("PINECONE_API_KEY");
45 |   getEnv("PINECONE_INDEX");
46 |   getEnv("PINECONE_CLOUD");
47 |   getEnv("PINECONE_REGION");
48 | };
49 | 
50 | export {
51 |   getQueryingCommandLineArguments,
52 |   sliceIntoChunks,
53 |   validateEnvironmentVariables,
54 | };
55 | 


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "allowImportingTsExtensions": true,
 4 |     "strict": true,
 5 |     "esModuleInterop": true,
 6 |     "skipLibCheck": true,
 7 |     "forceConsistentCasingInFileNames": true,
 8 |     "outDir": "dist",
 9 |     "lib": ["ES2021", "ES2022.Object", "DOM"],
10 |     "target": "es2017",
11 |     "module": "nodenext",
12 |     "sourceMap": true,
13 |     "allowSyntheticDefaultImports": true,
14 |     "baseUrl": "./src",
15 |     "declaration": true,
16 |     "experimentalDecorators": true,
17 |     "noImplicitReturns": true,
18 |     "noFallthroughCasesInSwitch": true,
19 |     "noUnusedLocals": true,
20 |     "noUnusedParameters": true,
21 |     "useDefineForClassFields": true,
22 |     "strictPropertyInitialization": false,
23 |     "noEmit": true
24 |   },
25 |   "exclude": ["node_modules/", "dist/", "tests/"],
26 |   "include": ["./src"]
27 | }
28 | 


--------------------------------------------------------------------------------
/tsup.config.js:
--------------------------------------------------------------------------------
 1 | import { defineConfig } from "tsup";
 2 | 
 3 | export default defineConfig({
 4 |   entry: ["src/index.ts"],
 5 |   format: ["cjs", "esm"],
 6 |   splitting: false,
 7 |   sourcemap: true,
 8 |   clean: true,
 9 |   bundle: true,
10 |   dts: true,
11 | });
12 | 


--------------------------------------------------------------------------------
/turbo.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "$schema": "https://turbo.build/schema.json",
 3 |   "pipeline": {
 4 |     "build": {
 5 |       "outputs": [".next/**", "!.next/cache/**"]
 6 |     },
 7 |     "lint": {},
 8 |     "format": {}
 9 |   }
10 | }
11 | 


--------------------------------------------------------------------------------