├── .github ├── scripts │ └── aura.py └── workflows │ └── run-notebooks.yml ├── .gitignore ├── README.md ├── customers-and-products ├── README.md ├── data-load.ipynb ├── data-prep.ipynb ├── genai-example-app-only.ipynb ├── genai-workshop-w-outputs.ipynb ├── genai-workshop.ipynb ├── hm-bloom-perspective.json ├── img │ ├── CUSTOMERS_ALSO_LIKE.png │ ├── data-model.png │ ├── hm-banner.png │ ├── purchase-history.png │ ├── related-products.png │ ├── sample-query.png │ └── search_results.png ├── workshop-slides.pdf └── ws.env.template └── talent ├── README.md ├── data ├── create_skill_embeddings.ipynb ├── expanded_skills.csv └── skills_embeddings.csv ├── module_01_graph_basics.ipynb ├── module_02_unstructured_data.ipynb ├── module_03_graphrag_agent.ipynb ├── workshop_slides.pdf └── ws_temp.txt /.github/scripts/aura.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import json 4 | import time 5 | import logging 6 | 7 | import requests 8 | 9 | logger = logging.getLogger(__name__) 10 | logging.basicConfig(level='INFO') 11 | 12 | 13 | class AuraAPI: 14 | def __init__(self, url, tenant_id, token=None, **kwargs): 15 | self.url = url 16 | self.token = token 17 | self.tenant_id = tenant_id 18 | self.config = kwargs 19 | 20 | def status(self, instance_id): 21 | headers = {"Content-Type": "application/json", "Authorization": self.token} 22 | _url = os.path.join(self.url, instance_id) 23 | response = requests.get(_url, headers=headers) 24 | res = json.loads(response.content) 25 | if not res.get('data'): 26 | logger.info("Unable to retrieve instance Status : {}".format(instance_id)) 27 | return 'Unknown' 28 | status = res.get('data').get('status') 29 | return status 30 | 31 | def create(self, params): 32 | headers = {"Content-Type": "application/json", "Authorization": self.token} 33 | params.update({ 34 | 'tenant_id': self.tenant_id 35 | }) 36 | response = requests.post(self.url, headers=headers, json=params) 37 | res = json.loads(response.content) 38 | instance_details = res.get('data', {}) 39 | errors = res.get('errors', {}) 40 | if not instance_details: 41 | logger.info("Instance creation not successful: {}".format(errors)) 42 | return instance_details 43 | 44 | def delete(self, instance_id): 45 | _url = os.path.join(self.url, instance_id) 46 | headers = {"Content-Type": "application/json", "Authorization": self.token} 47 | response = requests.delete(_url, headers=headers) 48 | res = json.loads(response.content) 49 | instance_details = res.get('data', {}) 50 | errors = res.get('errors', {}) 51 | if not instance_details: 52 | logger.info("Instance not found or unable to delete: {}".format(errors)) 53 | return dict() 54 | return instance_details 55 | 56 | def generate_token(self, url, client_id, client_secret): 57 | body = { 58 | "grant_type": "client_credentials" 59 | } 60 | headers = {"Content-Type": "application/x-www-form-urlencoded"} 61 | response = requests.post(url, auth=(client_id, client_secret), headers=headers, data=body) 62 | data = json.loads(response.content) 63 | token = data['access_token'] 64 | return token 65 | 66 | def generate_token_if_expired(self): 67 | auth_config = self.config['auth'] 68 | auth_url = auth_config.get('endpoint') 69 | client_id = auth_config.get('client_id') 70 | client_secret = auth_config.get('client_secret') 71 | if time.time() - auth_config.get('token_ttl') >= 3599: 72 | self.token = self.generate_token(auth_url, client_id, client_secret) 73 | self.config['auth']['access_token'] = self.token 
74 | self.config['auth']['token_ttl'] = time.time() 75 | logger.info("Token Generation Successful: {}".format(time.ctime())) 76 | return True 77 | logger.info("Token is Valid") 78 | return False 79 | 80 | def wait_for_status(self, instance_id, status=None, time_out=300): 81 | start = time.time() 82 | current_status = self.status(instance_id) 83 | while current_status != status and time.time() - start <= time_out: 84 | time.sleep(5) 85 | current_status = self.status(instance_id) 86 | logger.info("Waiting: {} {}".format(instance_id, current_status)) 87 | return current_status 88 | 89 | 90 | def cli(): 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('task', type=str, help='setup task', choices=['configure', 'delete']) 93 | parser.add_argument('--tenant-id', type=str, help="Aura Tenant ID") 94 | parser.add_argument('--client-id', type=str, help="Aura API Client ID") 95 | parser.add_argument('--client-secret', type=str, help="Aura API Client Secret") 96 | parser.add_argument('--region', type=str, help="Aura Region") 97 | parser.add_argument('--cloud-provider', type=str, help="Aura Cloud Provider") 98 | parser.add_argument('--instance-id', type=str, help="Aura Instance Id") 99 | 100 | return parser.parse_args() 101 | 102 | 103 | def configure_instance(api, region, cloud_provider): 104 | logger.info("Creating Aura instance") 105 | data = api.create(params={ 106 | "name": "gh-action-genai-workshop", 107 | "version": "5", 108 | "region": region, 109 | "memory": "8GB", 110 | "type": "enterprise-ds", 111 | "cloud_provider": cloud_provider, 112 | }) 113 | instance_details = {k: v for k, v in data.items() if 114 | k in ['id', 'connection_url', 'name', 'username', 'password']} 115 | logger.info(f"Waiting for Aura instance {instance_details['id']} to come online") 116 | api.wait_for_status(instance_details['id'], status="running", time_out=300) 117 | 118 | print(f""" 119 | AURA_INSTANCEID={instance_details['id']} 120 | NEO4J_URI={instance_details['connection_url']} 121 | NEO4J_USERNAME={instance_details['username']} 122 | NEO4J_PASSWORD={instance_details['password']} 123 | AURA_DS=true 124 | """) 125 | 126 | 127 | def delete_instance(api, instance_id): 128 | logger.info(f"Deleting Aura instance {instance_id}") 129 | api.delete(instance_id) 130 | 131 | 132 | if __name__ == '__main__': 133 | args = cli() 134 | 135 | config = { 136 | "auth": { 137 | "endpoint": "https://api.neo4j.io/oauth/token", 138 | "client_id": args.client_id, 139 | "client_secret": args.client_secret, 140 | "token_ttl": 0.0 141 | } 142 | } 143 | api = AuraAPI("https://api.neo4j.io/v1/instances", args.tenant_id, **config) 144 | _ = api.generate_token_if_expired() 145 | 146 | task = args.task 147 | if task == 'configure': 148 | configure_instance(api, args.region, args.cloud_provider) 149 | 150 | if task == 'delete': 151 | delete_instance(api, args.instance_id) 152 | -------------------------------------------------------------------------------- /.github/workflows/run-notebooks.yml: -------------------------------------------------------------------------------- 1 | name: Run Notebook and Commit Version With Output 2 | 3 | on: 4 | pull_request: 5 | types: [opened, synchronize, reopened] 6 | branches: 7 | - main 8 | 9 | jobs: 10 | run-notebooks: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | ref: ${{ github.event.pull_request.head.ref }} 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.x' 21 | 22 | - name: Install dependencies 
23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install jupyter nbconvert 26 | 27 | - name: Create env file 28 | run: | 29 | echo "${{ secrets.WORKSHOP_ENV }}" > ws.env 30 | 31 | - name: Create Aura instance 32 | run: | 33 | source ws.env 34 | python .github/scripts/aura.py configure \ 35 | --tenant-id $AURA_TENANT_ID \ 36 | --client-id $AURA_CLIENT_ID \ 37 | --client-secret $AURA_CLIENT_SECRET \ 38 | --region $AURA_REGION \ 39 | --cloud-provider $AURA_CLOUD_PROVIDER \ 40 | >> ws.env 41 | env: 42 | ENV_FILE: ws.env 43 | 44 | - name: Copy env file to each workshop subdir 45 | run: | 46 | cp ws.env customers-and-products/ws.env 47 | cp ws.env talent/ws.env 48 | env: 49 | ENV_FILE: ws.env 50 | 51 | - name: Run customer and products data loading notebook 52 | run: | 53 | jupyter nbconvert --to notebook --ExecutePreprocessor.timeout=1200 --execute customers-and-products/data-load.ipynb 54 | rm customers-and-products/data-load.nbconvert.ipynb 55 | env: 56 | ENV_FILE: ws.env 57 | 58 | - name: Run and save customer and products workshop notebook 59 | run: | 60 | export AUTOMATED_RUN=true 61 | jupyter nbconvert --to notebook --ExecutePreprocessor.timeout=1200 --execute customers-and-products/genai-workshop.ipynb 62 | mv customers-and-products/genai-workshop.nbconvert.ipynb customers-and-products/genai-workshop-w-outputs.ipynb 63 | env: 64 | ENV_FILE: ws.env 65 | 66 | - name: Run customer and products example-app-only notebook 67 | run: | 68 | export AUTOMATED_RUN=true 69 | jupyter nbconvert --to notebook --ExecutePreprocessor.timeout=1200 --execute customers-and-products/genai-example-app-only.ipynb 70 | rm customers-and-products/genai-example-app-only.nbconvert.ipynb 71 | env: 72 | ENV_FILE: ws.env 73 | 74 | - name: Delete Aura instance 75 | run: | 76 | source ws.env 77 | python .github/scripts/aura.py delete \ 78 | --tenant-id $AURA_TENANT_ID \ 79 | --client-id $AURA_CLIENT_ID \ 80 | --client-secret $AURA_CLIENT_SECRET \ 81 | --instance-id $AURA_INSTANCEID 82 | env: 83 | ENV_FILE: ws.env 84 | 85 | - name: Commit and push notebook with outputs 86 | run: | 87 | git config --global user.name 'GitHub Action' 88 | git config --global user.email 'action@github.com' 89 | git add customers-and-products/genai-workshop-w-outputs.ipynb 90 | git commit -m "Auto-commit: Run notebook and update notebook with output file" 91 | git push 92 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | embedding_model/ 2 | .idea/ 3 | __pycache__/ 4 | scratch/ 5 | *.env 6 | .DS_Store 7 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GraphRAG Workshops 2 | 3 | This repository contains two GraphRAG workshops: **Talent** and **Customers+Products**. Each workshop provides notebooks, data, and resources for hands-on exercises. 4 | 5 | ## Workshops 6 | 7 | 8 | 1. **[Talent](talent)** 9 | Combine both structured and unstructured data about employees and their technical skills into a knowledge graph. Perform graph pattern matching, vector search, and graph analytics to find similar skill sets and cohorts/clusters. Use the knowledge graph to power a GraphRAG talent agent that can search and respond to inquiries about people, their skills, and their similarities. 10 | 11 | 12 | 2. 
**[Customers & Products](customers-and-products)**
13 | Use real-world customer and product data from a fashion, style, and beauty retailer. Learn how to use a knowledge graph to ground an LLM with GraphRAG, enabling AI to build tailored marketing content personalized to each customer based on their interests and shared purchase histories. Learn about retrieval strategies leveraging vector search, graph pattern matching, and graph machine learning.
14 | 
15 | 
16 | 
17 | 
-------------------------------------------------------------------------------- /customers-and-products/README.md: --------------------------------------------------------------------------------
1 | # Customers & Products GraphRAG Workshop
2 | 
3 | Please see [`genai-workshop.ipynb`](genai-workshop.ipynb), which serves as the self-contained workshop.
4 | 
5 | The other companion notebooks contain code for staging data, building the Neo4j graph, and providing easy access to demos:
6 | 1. [`data-prep.ipynb`](data-prep.ipynb) stages the workshop data, sampling and formatting data sourced from the [H&M Personalized Fashion Recommendations Dataset](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/data).
7 | 2. [`data-load.ipynb`](data-load.ipynb) loads the staged data into Neo4j, performs text embedding, and creates a vector index.
8 | 3. [`genai-workshop-w-outputs.ipynb`](genai-workshop-w-outputs.ipynb) is a reference notebook for following along and checking outputs. It is simply `genai-workshop.ipynb` with cell outputs intact.
9 | 4. [`genai-example-app-only.ipynb`](genai-example-app-only.ipynb) is a copy of `genai-workshop.ipynb` that contains only the final section: the demo LLM GraphRAG app for content generation. It assumes you have already run [`genai-workshop.ipynb`](genai-workshop.ipynb) and exists only for instructor demo purposes.
10 | 
11 | 
12 | ## Changelog
13 | 
14 | 
15 | ### v5 (Mar 20, 2025 - present)
16 | 
17 | ------------
18 | - Directory restructure: Added second workshop to repo
19 | 
20 | ### [v4 (Sep 2nd, 2024 - Mar 20, 2025)](https://github.com/neo4j-product-examples/genai-workshop/releases/tag/v4.0)
21 | 
22 | ------------
23 | - Transitioned from Neo4j Sandbox to AuraDS
24 | 
25 | 
26 | - Split out data loading
27 |   - Split out data load into a separate notebook
28 |   - Live workshops now begin with the dataset pre-loaded to cut down on time and spend more of the course walking through GraphRAG. The `data-load.ipynb` notebook is kept for reference and replication.
29 |   - Removed the `neo4j_tools` Python package. The functions/utilities are now included in `data-load.ipynb`
30 | 
31 | 
32 | - Updated Workshop Slides
33 | 
34 | 
35 | - Added more query exploration & improved explainer queries
36 |   - Added browser-based graph exploration at the beginning of the workshop
37 |   - Included database tips & more Cypher queries in multiple steps
38 |   - Updated explainer markdown and code cells for graph patterns and GDS for clarity
39 |   - Various other minor adjustments to markdown and code to improve course quality
40 | 
41 | 
42 | - Added `genai-workshop-w-outputs.ipynb` and GitHub Actions Workflow
43 |   - `genai-workshop.ipynb` is now maintained with cleared outputs for a better workshop experience and easier PR review
44 |   - A GitHub Actions workflow automatically tests the data loading, workshop, and example-only notebooks and auto-commits the `genai-workshop-w-outputs.ipynb` file for each PR.
45 | 
46 | ### [v3 (June 25th, 2024 - Sep 1st, 2024)](https://github.com/neo4j-product-examples/genai-workshop/releases/tag/v3.0)
47 | 
48 | ------------
49 | - Improved LLM response quality and cleaned up code for LLM chains and vector stores
50 |   - Parameterized customer id so chains & stores don't need to be recreated for each customer
51 |   - Updated prompts to better account for seasonality and use all retrieved data
52 |   - Updated to use gpt-4o
53 | 
54 | 
55 | - Improved text embedding speed and reduced code by transitioning to native `genai.vector` Cypher functions
56 | 
57 | 
58 | - Updated slides
59 | 
60 | 
61 | - Various other minor adjustments to markdown and code to improve course quality
62 | 
63 | ### [v2 (Feb 20th, 2024 - June 24th, 2024)](https://github.com/neo4j-product-examples/genai-workshop/releases/tag/v2.0)
64 | 
65 | ------------
66 | 
67 | - (fix) Added `langchain_community` to the libraries that are pip installed in the notebooks
68 | 
69 | 
70 | - Simplified and Shortened Course
71 |   - Shortened GDS section to just three cells to run
72 |   - Condensed Vector Search Section
73 |   - Condensed Loading to Single Notebook Cell
74 |   - Switched Recommendation Retriever to a Simple KG Query
75 |   - Added `neo4j_tools` package to hold convenience functions for loading data and reduce the code footprint in the main workshop notebook
76 |   - Updated to GPT-4 throughout
77 |   - General Notebook Cleaning - Removed duplicate load statements, updated to newest LLM packages, etc.
78 | 
79 | 
80 | - Provided Better Explainers & Examples
81 |   - Added a chain for printing the final prompt sent to the LLM with retrieval data to better explain the process
82 |   - Added differentiated names for customer examples in the demo app
83 | 
84 | 
85 | - Added Additional Resources
86 |   - Added workshop slides
87 |   - Added "demo only" notebook
88 | 
89 | 
90 | ### [v1 (Nov 13th, 2023 - Feb 19th, 2024)](https://github.com/neo4j-product-examples/genai-workshop/releases/tag/v1.0)
91 | 
92 | ------------
93 | 
94 | - Initial 5-part course with:
95 |   - Building the knowledge graph
96 |   - Vector search & text embedding
97 |   - Graph patterns to improve semantic search
98 |   - Knowledge graph inference & ML
99 |   - Building the LLM chain and demo app for generating content
100 | 
101 | 
102 | ## Contributing
103 | Contributions are welcome! To contribute please:
104 | 1. Make a PR with a descriptive name
105 | 2. If you are updating [`genai-workshop.ipynb`](genai-workshop.ipynb), please clear all outputs before committing (see the sketch below).
106 | 3. Do not alter the [`genai-workshop-w-outputs.ipynb`](genai-workshop-w-outputs.ipynb) file. This file is autogenerated upon creating/updating PRs.
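107 | 
108 | For step 2, recent versions of nbconvert can clear outputs with `jupyter nbconvert --clear-output --inplace genai-workshop.ipynb`. Below is a minimal Python sketch of the same idea using `nbformat` (bundled with Jupyter); the notebook path is just an example:
109 | 
110 | ```python
111 | import nbformat
112 | 
113 | path = "genai-workshop.ipynb"  # example path - point at the notebook you edited
114 | nb = nbformat.read(path, as_version=4)
115 | for cell in nb.cells:
116 |     if cell.cell_type == "code":
117 |         cell.outputs = []            # drop rendered outputs
118 |         cell.execution_count = None  # reset execution counters
119 | nbformat.write(nb, path)
120 | ```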
-------------------------------------------------------------------------------- /customers-and-products/data-load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "88eddbdd-94d5-44fa-b54f-844b67966d44", 6 | "metadata": {}, 7 | "source": [ 8 | "# Neo4j Generative AI - Data Loading\n", 9 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/neo4j-product-examples/genai-workshop/blob/main/customers-and-products/data-load.ipynb)\n", 10 | "\n", 11 | "This workshop will teach you how to use Neo4j for Graph-Powered Retrieval-Augmented Generation (GraphRAG) to enhance GenAI and improve response quality for real-world applications.\n", 12 | "\n", 13 | "GenAI, despite its potential, faces challenges like hallucination and lack of domain knowledge. GraphRAG addresses these issues by combining vector search with knowledge graphs and data science techniques. This integration helps improve context, semantic understanding, and personalization, making Large Language Models (LLMs) more effective for critical applications.\n", 14 | "\n", 15 | "We walk through an example that uses real-world customer and product data from a fashion, style, and beauty retailer. We show how you can use a knowledge graph to ground an LLM, enabling it to build tailored marketing content personalized to each customer based on their interests and shared purchase histories. We use Retrieval-Augmented Generation (RAG) to accomplish this, specifically leveraging not just vector search but also graph pattern matching and graph machine learning to provide more relevant personalized results to customers. We call this graph-powered RAG approach “GraphRAG” for short.\n", 16 | "\n", 17 | "This notebook walks through the first steps of the process, including:\n", 18 | "- Building the knowledge graph and\n", 19 | "- generating text embeddings from scratch\n", 20 | "\n", 21 | "[genai-workshop.ipynb](https://github.com/neo4j-product-examples/genai-workshop/blob/main/customers-and-products/genai-workshop.ipynb) contains the rest of the workshop including\n", 22 | " - Vector search\n", 23 | " - Graph patterns to improve semantic search\n", 24 | " - Augmenting semantic search with graph data science\n", 25 | " - Building an example LLM chain and demo app\n", 26 | "\n", 27 | "If you would rather start from a database dump and skip this data loading, you can do so using [this dump file](https://storage.googleapis.com/gds-training-materials/Version8_Jan2024/neo4j_genai_hnm.dump).\n", 28 | " " 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "a527357f-a5e8-42c9-8966-f37846097c1d", 34 | "metadata": {}, 35 | "source": [ 36 | "### Some Logistics\n", 37 | "1. Run the pip install below to get the necessary dependencies. this can take a while. Then run the following cell to import relevant libraries\n", 38 | "2. You will need a Neo4j database environment with the [graph data science library](https://neo4j.com/docs/graph-data-science/current/installation) installed e.g. 
\n", 39 | " - [AuraDS](https://neo4j.com/docs/aura/aurads/) \n", 40 | " - [Neo4j Desktop](https://neo4j.com/docs/browser-manual/current/deployment-modes/neo4j-desktop/) \n", 41 | " - Your own server environment " 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "id": "1e2481cc-cf56-45d3-aa5a-236c0dbc7255", 48 | "metadata": { 49 | "tags": [] 50 | }, 51 | "source": [ 52 | "%%capture\n", 53 | "%pip install sentence_transformers langchain langchain-openai langchain_community openai tiktoken python-dotenv gradio graphdatascience\n", 54 | "%pip install \"vegafusion[embed]\"" 55 | ], 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "id": "2ad71e3b-ccb9-4f5d-931f-4959862b16df", 62 | "metadata": { 63 | "tags": [] 64 | }, 65 | "source": [ 66 | "import pandas as pd\n", 67 | "import numpy as np\n", 68 | "from dotenv import load_dotenv\n", 69 | "import os\n", 70 | "from graphdatascience import GraphDataScience\n", 71 | "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", 72 | "from langchain.vectorstores.neo4j_vector import Neo4jVector\n", 73 | "from langchain.graphs import Neo4jGraph\n", 74 | "from langchain.prompts import PromptTemplate\n", 75 | "from langchain.schema import StrOutputParser\n", 76 | "from langchain.schema.runnable import RunnableLambda\n", 77 | "import gradio as gr" 78 | ], 79 | "outputs": [] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "id": "8a7af80e-7146-4280-a75a-7bccf16a2611", 85 | "metadata": { 86 | "tags": [] 87 | }, 88 | "source": [ 89 | "pd.set_option('display.max_rows', 10)\n", 90 | "pd.set_option('display.max_colwidth', 500)\n", 91 | "pd.set_option('display.width', 0)" 92 | ], 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "id": "0418616e-c078-448d-83c5-ab907bf08236", 98 | "metadata": { 99 | "tags": [] 100 | }, 101 | "source": [ 102 | "### Setup Credentials and Environment Variables\n", 103 | "\n", 104 | "There are two things you need here.\n", 105 | "1. Credentials to a Neo4j database with Graph Data Science (AuraDS, Neo4j Desktop, or your own environment)\n", 106 | "2. Your own [OpenAI API key](https://platform.openai.com/docs/quickstart?context=python). You can use [this one](https://docs.google.com/document/d/19Lqjd0MqRs088KUVnd23ZrVU9G0OAg-53U72VrFwwms/edit) if you do not have one already.\n", 107 | "\n", 108 | "To make this easy, you can write the credentials and env variables directly into the below cell.\n", 109 | "\n", 110 | "Alternatively, if you like, you can use an environment file. This is a best practice for the future, but fine to skip for now." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 7, 116 | "id": "076f0ca5-c7bd-4d77-b0c8-26cc5ef7bdb9", 117 | "metadata": { 118 | "tags": [] 119 | }, 120 | "source": [ 121 | "# Neo4j\n", 122 | "NEO4J_URI = 'copy_paste_your_db_uri_here' #change this\n", 123 | "NEO4J_PASSWORD = 'terminologies-fire-planet' #change this\n", 124 | "NEO4J_USERNAME = 'neo4j'\n", 125 | "AURA_DS = True\n", 126 | "\n", 127 | "# AI\n", 128 | "LLM = 'gpt-4o'\n", 129 | "os.environ['OPENAI_API_KEY'] = 'sk-...' 
#change this\n", 130 | "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')" 131 | ], 132 | "outputs": [] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 9, 137 | "id": "2f674b2d-da73-4d6d-be03-d259478f7cdd", 138 | "metadata": { 139 | "tags": [] 140 | }, 141 | "source": [ 142 | "# You can skip this cell if not using a ws.env file - alternative to above\n", 143 | "from dotenv import load_dotenv\n", 144 | "import os\n", 145 | "\n", 146 | "if os.path.exists('ws.env'):\n", 147 | " load_dotenv('ws.env', override=True)\n", 148 | "\n", 149 | " # Neo4j\n", 150 | " NEO4J_URI = os.getenv('NEO4J_URI')\n", 151 | " NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')\n", 152 | " NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')\n", 153 | " AURA_DS = eval(os.getenv('AURA_DS').title())\n", 154 | "\n", 155 | " # AI\n", 156 | " LLM = 'gpt-4o'\n", 157 | " OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')" 158 | ], 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "id": "e9baad48-4875-4780-a168-e8eba6b95df5", 164 | "metadata": {}, 165 | "source": [ 166 | "## Knowledge Graph Building\n", 167 | "\n", 168 | "\"summary\"\n", 169 | "\n", 170 | "We begin by building our knowledge graph. This workshop will leverage the [H&M Personalized Fashion Recommendations Dataset](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/data), a sample of real customer purchase data that includes rich information around products including names, types, descriptions, department sections, etc.\n", 171 | "\n", 172 | "Below is the graph data model we will use:\n", 173 | "\n", 174 | "\"summary\"\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "37effc05-88bb-4ee4-9cbe-548a15069649", 180 | "metadata": { 181 | "tags": [] 182 | }, 183 | "source": [ 184 | "### Connect to Neo4j\n", 185 | "\n", 186 | "We will use the [Graph Data Science Python Client](https://neo4j.com/docs/graph-data-science-client/current/) to connect to Neo4j. This client makes it convenient to display results, as we will see later. Perhaps more importantly, it allows us to easily run [Graph Data Science](https://neo4j.com/docs/graph-data-science/current/introduction/) algorithms from Python.\n", 187 | "\n", 188 | "This client will only work if your Neo4j instance has Graph Data Science installed. If not, you can still use the [Neo4j Python Driver](https://neo4j.com/docs/python-manual/current/) or use Langchain’s Neo4j Graph object that we will see later on." 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 10, 194 | "id": "1cbee8fc-9699-493e-b75e-2068dd208ebb", 195 | "metadata": { 196 | "tags": [] 197 | }, 198 | "source": [ 199 | "# Use Neo4j URI and credentials according to our setup\n", 200 | "gds = GraphDataScience(\n", 201 | " NEO4J_URI,\n", 202 | " auth=(NEO4J_USERNAME, NEO4J_PASSWORD),\n", 203 | " aura_ds=AURA_DS)\n", 204 | "\n", 205 | "# Necessary if you enabled Arrow on the db - this is true for AuraDS\n", 206 | "gds.set_database(\"neo4j\")" 207 | ], 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "id": "2d2df684-9d7f-44ab-ae05-7a42b91a204e", 213 | "metadata": {}, 214 | "source": [ 215 | "Test your connection by running the below. It should output your GDS version." 
216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 12, 221 | "id": "22a02d2d-f894-4e7b-9910-20dcc54cae5a", 222 | "metadata": { 223 | "tags": [] 224 | }, 225 | "source": [ 226 | "gds.version()" 227 | ], 228 | "outputs": [] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "id": "8a831e09-3b57-43a7-b79d-e2c9845334f6", 233 | "metadata": {}, 234 | "source": [ 235 | "### Get Source Data\n", 236 | "This workshop will leverage the [H&M Personalized Fashion Recommendations Dataset](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/data), a sample of real customer purchase data that includes rich information around products including names, types, descriptions, department sections, etc.\n", 237 | "\n", 238 | "*Bonus!*\n", 239 | "The data we use is a sampled and preformatted version of the Kaggle data. If you are interested in what we did, you can find the details [here](https://github.com/neo4j-product-examples/genai-workshop/blob/main/customers-and-products/data-prep.ipynb)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 13, 245 | "id": "5fd8856c-6132-427e-8add-f344882954b1", 246 | "metadata": { 247 | "tags": [] 248 | }, 249 | "source": [ 250 | "import pandas as pd\n", 251 | "\n", 252 | "# get source data - it has been pre-formatted. If you would like to re-generate from source on Kaggle, see the data-prep.ipynb notebook\n", 253 | "department_df = pd.read_csv('https://storage.googleapis.com/neo4j-workshop-data/genai-hm/department.csv')\n", 254 | "product_df = pd.read_csv('https://storage.googleapis.com/neo4j-workshop-data/genai-hm/product.csv')\n", 255 | "article_df = pd.read_csv('https://storage.googleapis.com/neo4j-workshop-data/genai-hm/article.csv')\n", 256 | "customer_df = pd.read_csv('https://storage.googleapis.com/neo4j-workshop-data/genai-hm/customer.csv')\n", 257 | "transaction_df = pd.read_csv('https://storage.googleapis.com/neo4j-workshop-data/genai-hm/transaction.csv')" 258 | ], 259 | "outputs": [] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "id": "18f4341e-334d-4764-aca6-8178e69c9f30", 264 | "metadata": {}, 265 | "source": [ 266 | "### Create Constraints\n", 267 | "\n", 268 | "Before loading data into Neo4j, it is usually best practice to create Key or Uniqueness constraints for nodes. These [constraints](https://neo4j.com/docs/cypher-manual/current/constraints/) act as an index with some validation on unique id properties and thus make `MATCH` statements run significantly faster. Not doing this can result in a VERY slow ingest, so this is a critical step." 
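,
"\n",
"After running the next cell, you can sanity-check the result with `SHOW CONSTRAINTS`. A minimal sketch, reusing the `gds` client connected above:\n",
"\n",
"```python\n",
"# List all constraints in the database - each unique id property created below should appear here\n",
"gds.run_cypher('SHOW CONSTRAINTS')\n",
"```"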
269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 14, 274 | "id": "0e37adb5-1ad3-4b71-adfa-24b5d2bf22b8", 275 | "metadata": { 276 | "tags": [] 277 | }, 278 | "source": [ 279 | "# create constraints - one uniqueness constraint for each node label\n", 280 | "gds.run_cypher('CREATE CONSTRAINT unique_department_no IF NOT EXISTS FOR (n:Department) REQUIRE n.departmentNo IS UNIQUE')\n", 281 | "gds.run_cypher('CREATE CONSTRAINT unique_product_code IF NOT EXISTS FOR (n:Product) REQUIRE n.productCode IS UNIQUE')\n", 282 | "gds.run_cypher('CREATE CONSTRAINT unique_article_id IF NOT EXISTS FOR (n:Article) REQUIRE n.articleId IS UNIQUE')\n", 283 | "gds.run_cypher('CREATE CONSTRAINT unique_customer_id IF NOT EXISTS FOR (n:Customer) REQUIRE n.customerId IS UNIQUE')" 284 | ], 285 | "outputs": [] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "id": "2f331d91-f8e6-45bb-b5b2-2f10d2aafa66", 290 | "metadata": {}, 291 | "source": [ 292 | "### Loading Data with Helper Functions\n", 293 | "\n", 294 | "Since we normalized our data beforehand, we can load each node and relationship type separately in batches.\n", 295 | "The Node and Relationship query patterns will follow the same template for different types. The below functions simply automatically construct the queries and handle the batching. They will print the queries they are using while loading so you can see the patterns.\n", 296 | "\n", 297 | "Cypher for Loading Nodes follows a MATCH-MERGE pattern, while Cypher for loading relationships follows a MATCH-MATCH-MERGE pattern." 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 16, 303 | "id": "9b7e4f14-2187-478c-9b03-952c8dc39f6a", 304 | "metadata": { 305 | "tags": [] 306 | }, 307 | "source": [ 308 | "from typing import Tuple, Union\n", 309 | "from numpy.typing import ArrayLike\n", 310 | "\n", 311 | "\n", 312 | "def make_map(x):\n", 313 | " if type(x) == str:\n", 314 | " return x, x\n", 315 | " elif type(x) == tuple:\n", 316 | " return x\n", 317 | " else:\n", 318 | " raise Exception(\"Entry must of type string or tuple\")\n", 319 | "\n", 320 | "\n", 321 | "def make_set_clause(prop_names: ArrayLike, element_name='n', item_name='rec'):\n", 322 | " clause_list = []\n", 323 | " for prop_name in prop_names:\n", 324 | " clause_list.append(f'{element_name}.{prop_name} = {item_name}.{prop_name}')\n", 325 | " return 'SET ' + ', '.join(clause_list)\n", 326 | "\n", 327 | "\n", 328 | "def make_node_merge_query(node_key_name: str, node_label: str, cols: ArrayLike):\n", 329 | " template = f'''UNWIND $recs AS rec\\nMERGE(n:{node_label} {{{node_key_name}: rec.{node_key_name}}})'''\n", 330 | " prop_names = [x for x in cols if x != node_key_name]\n", 331 | " if len(prop_names) > 0:\n", 332 | " template = template + '\\n' + make_set_clause(prop_names)\n", 333 | " return template + '\\nRETURN count(n) AS nodeLoadedCount'\n", 334 | "\n", 335 | "\n", 336 | "def make_rel_merge_query(source_target_labels: Union[Tuple[str, str], str],\n", 337 | " source_node_key: Union[Tuple[str, str], str],\n", 338 | " target_node_key: Union[Tuple[str, str], str],\n", 339 | " rel_type: str,\n", 340 | " cols: ArrayLike,\n", 341 | " rel_key: str = None):\n", 342 | " source_target_label_map = make_map(source_target_labels)\n", 343 | " source_node_key_map = make_map(source_node_key)\n", 344 | " target_node_key_map = make_map(target_node_key)\n", 345 | "\n", 346 | " merge_statement = f'MERGE(s)-[r:{rel_type}]->(t)'\n", 347 | " if rel_key is not None:\n", 348 | " merge_statement = 
f'MERGE(s)-[r:{rel_type} {{{rel_key}: rec.{rel_key}}}]->(t)'\n", 349 | "\n", 350 | " template = f'''\\tUNWIND $recs AS rec\n", 351 | " MATCH(s:{source_target_label_map[0]} {{{source_node_key_map[0]}: rec.{source_node_key_map[1]}}})\n", 352 | " MATCH(t:{source_target_label_map[1]} {{{target_node_key_map[0]}: rec.{target_node_key_map[1]}}})\\n\\t''' + merge_statement\n", 353 | " prop_names = [x for x in cols if x not in [rel_key, source_node_key_map[1], target_node_key_map[1]]]\n", 354 | " if len(prop_names) > 0:\n", 355 | " template = template + '\\n\\t' + make_set_clause(prop_names, 'r')\n", 356 | " return template + '\\n\\tRETURN count(r) AS relLoadedCount'\n", 357 | "\n", 358 | "\n", 359 | "def chunks(xs, n=10_000):\n", 360 | " n = max(1, n)\n", 361 | " return [xs[i:i + n] for i in range(0, len(xs), n)]\n", 362 | "\n", 363 | "\n", 364 | "def load_nodes(gds: GraphDataScience, node_df: pd.DataFrame, node_key_col: str, node_label: str, chunk_size=10_000):\n", 365 | " records = node_df.to_dict('records')\n", 366 | " print(f'====== loading {node_label} nodes ======')\n", 367 | " total = len(records)\n", 368 | " print(f'staging {total:,} records')\n", 369 | " query = make_node_merge_query(node_key_col, node_label, node_df.columns.copy())\n", 370 | " print(f'\\nUsing This Cypher Query:\\n```\\n{query}\\n```\\n')\n", 371 | " cumulative_count = 0\n", 372 | " for recs in chunks(records, chunk_size):\n", 373 | " res = gds.run_cypher(query, params={'recs': recs})\n", 374 | " cumulative_count += res.iloc[0, 0]\n", 375 | " print(f'Loaded {cumulative_count:,} of {total:,} nodes')\n", 376 | "\n", 377 | "\n", 378 | "def load_rels(gds: GraphDataScience,\n", 379 | " rel_df: pd.DataFrame,\n", 380 | " source_target_labels: Union[Tuple[str, str], str],\n", 381 | " source_node_key: Union[Tuple[str, str], str],\n", 382 | " target_node_key: Union[Tuple[str, str], str],\n", 383 | " rel_type: str,\n", 384 | " rel_key: str = None,\n", 385 | " chunk_size=10_000):\n", 386 | " records = rel_df.to_dict('records')\n", 387 | " print(f'====== loading {rel_type} relationships ======')\n", 388 | " total = len(records)\n", 389 | " print(f'staging {total:,} records')\n", 390 | " query = make_rel_merge_query(source_target_labels, source_node_key,\n", 391 | " target_node_key, rel_type, rel_df.columns.copy(), rel_key)\n", 392 | " print(f'\\nUsing This Cypher Query:\\n```\\n{query}\\n```\\n')\n", 393 | " cumulative_count = 0\n", 394 | " for recs in chunks(records, chunk_size):\n", 395 | " res = gds.run_cypher(query, params={'recs': recs})\n", 396 | " cumulative_count += res.iloc[0, 0]\n", 397 | " print(f'Loaded {cumulative_count:,} of {total:,} relationships')" 398 | ], 399 | "outputs": [] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 17, 404 | "id": "e2f582f6-4f75-4b4f-add5-35c128c53fad", 405 | "metadata": { 406 | "tags": [] 407 | }, 408 | "source": [ 409 | "%%time\n", 410 | "\n", 411 | "# load nodes\n", 412 | "load_nodes(gds, department_df, 'departmentNo', 'Department')\n", 413 | "load_nodes(gds, article_df.drop(columns=['productCode', 'departmentNo']), 'articleId', 'Article')\n", 414 | "load_nodes(gds, product_df, 'productCode', 'Product')\n", 415 | "load_nodes(gds, customer_df, 'customerId', 'Customer')\n", 416 | "\n", 417 | "# load relationships\n", 418 | "load_rels(gds, article_df[['articleId', 'departmentNo']], source_target_labels=('Article', 'Department'),\n", 419 | " source_node_key='articleId', target_node_key='departmentNo',\n", 420 | " rel_type='FROM_DEPARTMENT')\n", 421 | "load_rels(gds, 
article_df[['articleId', 'productCode']], source_target_labels=('Article', 'Product'),\n", 422 | " source_node_key='articleId',target_node_key='productCode',\n", 423 | " rel_type='VARIANT_OF')\n", 424 | "load_rels(gds, transaction_df, source_target_labels=('Customer', 'Article'),\n", 425 | " source_node_key='customerId', target_node_key='articleId', rel_key='txId',\n", 426 | " rel_type='PURCHASED')\n", 427 | "\n", 428 | "# convert transaction dates\n", 429 | "gds.run_cypher('''\n", 430 | "MATCH (:Customer)-[r:PURCHASED]->()\n", 431 | "SET r.tDat = date(r.tDat)\n", 432 | "''')\n", 433 | "\n", 434 | "# convert NaN product descriptions\n", 435 | "gds.run_cypher('''\n", 436 | "MATCH (n:Product) WHERE valueType(n.detailDesc) <> \"STRING NOT NULL\"\n", 437 | "SET n.detailDesc = \"\"\n", 438 | "RETURN n\n", 439 | "''')\n", 440 | "\n", 441 | "# create combined text property. This will help simplify later with semantic search and RAG\n", 442 | "gds.run_cypher(\"\"\"\n", 443 | " MATCH(p:Product)\n", 444 | " SET p.text = 'Product-- ' +\n", 445 | " 'Name: ' + p.prodName + ' || ' +\n", 446 | " 'Type: ' + p.productTypeName + ' || ' +\n", 447 | " 'Group: ' + p.productGroupName + ' || ' +\n", 448 | " 'Garment Type: ' + p.garmentGroupName + ' || ' +\n", 449 | " 'Description: ' + p.detailDesc\n", 450 | " RETURN count(p) AS propertySetCount\n", 451 | " \"\"\")\n", 452 | "\n", 453 | "# write dummy urls to illustrate sourcing in future retrieval\n", 454 | "gds.run_cypher(\"\"\"\n", 455 | "MATCH(p:Product)\n", 456 | "SET p.url = 'https://representative-domain/product/' + p.productCode\n", 457 | "\"\"\")" 458 | ], 459 | "outputs": [] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "id": "5d49a5c6-9732-461e-8052-ac3751ff3498", 464 | "metadata": { 465 | "tags": [] 466 | }, 467 | "source": [ 468 | "## Creating Text Embeddings & Vector Index\n", 469 | "\n", 470 | "Now that the data has been loaded, we need to generate text embeddings on our product nodes to support Vector Search\n", 471 | "\n", 472 | "Neo4j has native integrations with popular embedding APIs (OpenAI, Vertex AI, Amazon Bedrock, Azure OpenAI) making it possible to generate embeddings with a single Cypher query using `genai.vector.*` operations*.\n", 473 | "\n", 474 | "The below query embeds the Product text property with OpenAI `text-embedding-ada-002` in batches. Specifically it\n", 475 | "1. Matches every Product that has a detailed description\n", 476 | "2. Uses the `collect` aggregation function to batch products into a set number of partitions\n", 477 | "3. Encodes the text property in batches using OpenAI `text-embedding-ada-002`\n", 478 | "4. Sets the embedding as a vector property using `db.create.setNodeVectorProperty`. This special function is used to set the properties as floats rather than double precision, which requires more space. This becomes important as these embedding vectors tend to be long, and the size can add up quickly." 
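,
"\n",
"Once the batch query below finishes, a quick sanity check is to count the products that received an embedding. A minimal sketch, reusing the `gds` client from above:\n",
"\n",
"```python\n",
"# Count Product nodes that now carry a textEmbedding vector property\n",
"gds.run_cypher('MATCH (p:Product) WHERE p.textEmbedding IS NOT NULL RETURN count(p) AS embeddedCount')\n",
"```"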
479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 18, 484 | "id": "438cd7dc-b01c-48e2-b8d7-fef2bcf91aa8", 485 | "metadata": { 486 | "tags": [] 487 | }, 488 | "source": [ 489 | "#generate embeddings\n", 490 | "\n", 491 | "gds.run_cypher('''\n", 492 | "MATCH (n:Product) WHERE size(n.detailDesc) <> 0\n", 493 | "WITH collect(n) AS nodes, toInteger(rand()*$numberOfBatches) AS partition\n", 494 | "CALL {\n", 495 | " WITH nodes\n", 496 | " CALL genai.vector.encodeBatch([node IN nodes| node.text], \"OpenAI\", { token: $token})\n", 497 | " YIELD index, vector\n", 498 | " CALL db.create.setNodeVectorProperty(nodes[index], \"textEmbedding\", vector)\n", 499 | "} IN TRANSACTIONS OF 1 ROW''', params={'token':OPENAI_API_KEY, 'numberOfBatches':100})" 500 | ], 501 | "outputs": [] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "id": "b868b5f5-425a-4851-9351-36907909327f", 506 | "metadata": {}, 507 | "source": [ 508 | "After generating embeddings we will create a vector index for them. The Neo4j Vector Index enables efficient Approximate Nearest Neighbor (ANN) search with vectors. It uses the Hierarchical Navigable Small World (HNSW) algorithm.\n", 509 | "\n", 510 | "The below cell will create the index, then, with a separate query, await for the index to come online, meaning it is ready to be used in vector search." 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 19, 516 | "id": "76e4ccf2-65c8-468b-90ab-0bc489eb1e48", 517 | "metadata": { 518 | "tags": [] 519 | }, 520 | "source": [ 521 | "#create vector index\n", 522 | "\n", 523 | "embedding_dimension = 1536 #default for OpenAI text-embedding-ada-002\n", 524 | "\n", 525 | "gds.run_cypher('''\n", 526 | "CREATE VECTOR INDEX product_text_embeddings IF NOT EXISTS FOR (n:Product) ON (n.textEmbedding)\n", 527 | "OPTIONS {indexConfig: {\n", 528 | " `vector.dimensions`: toInteger($dimension),\n", 529 | " `vector.similarity_function`: 'cosine'\n", 530 | "}}''', params={'dimension': embedding_dimension})\n", 531 | "\n", 532 | "#wait for index to come online\n", 533 | "gds.run_cypher('CALL db.awaitIndex(\"product_text_embeddings\", 300)')" 534 | ], 535 | "outputs": [] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "id": "a0ba071f-a3d4-4959-8236-e4d39cec0c19", 540 | "metadata": {}, 541 | "source": [ 542 | "## Next Steps\n", 543 | "Analyze the graph, try out a vector search, and learn how to enhance search with graphs and graph data science in [genai-workshop.ipynb](https://github.com/neo4j-product-examples/genai-workshop/blob/main/customers-and-products/genai-workshop.ipynb)\n" 544 | ] 545 | }, 546 | { 547 | "metadata": {}, 548 | "cell_type": "code", 549 | "outputs": [], 550 | "execution_count": null, 551 | "source": "", 552 | "id": "fceb214524a7764a" 553 | } 554 | ], 555 | "metadata": { 556 | "kernelspec": { 557 | "display_name": "Python 3 (ipykernel)", 558 | "language": "python", 559 | "name": "python3" 560 | }, 561 | "language_info": { 562 | "codemirror_mode": { 563 | "name": "ipython", 564 | "version": 3 565 | }, 566 | "file_extension": ".py", 567 | "mimetype": "text/x-python", 568 | "name": "python", 569 | "nbconvert_exporter": "python", 570 | "pygments_lexer": "ipython3", 571 | "version": "3.11.5" 572 | } 573 | }, 574 | "nbformat": 4, 575 | "nbformat_minor": 5 576 | } 577 | -------------------------------------------------------------------------------- /customers-and-products/genai-example-app-only.ipynb: -------------------------------------------------------------------------------- 1 | { 2 
| "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": false, 7 | "id": "I8F3XGz_dyXc" 8 | }, 9 | "source": [ 10 | "# Neo4j Generative AI Workshop Example Application\n", 11 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/neo4j-product-examples/genai-workshop/blob/main/customers-and-products/genai-example-app-only.ipynb)\n", 12 | "\n", 13 | "__This notebook is a copy of `genai-workshop.ipynb` that contains only the final section: the example application for the LLM content generator. This notebook assumes you have already run `genai-workshop.ipynb`.__\n", 14 | "\n", 15 | "__Please note: There is no need to run this notebook for the workshop. It exists for demo purposes only.__" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "collapsed": false, 22 | "id": "cmjr1dz8dyXd" 23 | }, 24 | "source": [ 25 | "## Setup" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "collapsed": false, 32 | "id": "8yxD7Ah0ZACB" 33 | }, 34 | "source": [ 35 | "### Some Logistics\n", 36 | "1. Make a copy of this notebook in Colab by [clicking here](https://colab.research.google.com/github/neo4j-product-examples/genai-workshop/blob/main/customers-and-products/genai-workshop-app-only.ipynb).\n", 37 | "2. Run the pip install below to get the necessary dependencies. this can take a while. Then run the following cell to import relevant libraries\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "yY1XylsiZACB", 44 | "pycharm": { 45 | "name": "#%%capture\n" 46 | }, 47 | "ExecuteTime": { 48 | "end_time": "2024-06-25T15:14:38.579011Z", 49 | "start_time": "2024-06-25T15:14:36.822880Z" 50 | } 51 | }, 52 | "source": [ 53 | "%%capture\n", 54 | "%pip install langchain langchain-openai langchain_community openai tiktoken python-dotenv gradio neo4j" 55 | ], 56 | "outputs": [], 57 | "execution_count": 1 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "id": "7psF1otOdyXe", 63 | "ExecuteTime": { 64 | "end_time": "2024-06-25T15:19:01.690663Z", 65 | "start_time": "2024-06-25T15:19:01.687080Z" 66 | } 67 | }, 68 | "source": [ 69 | "import pandas as pd\n", 70 | "import numpy as np\n", 71 | "from dotenv import load_dotenv\n", 72 | "import os\n", 73 | "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", 74 | "from langchain.vectorstores.neo4j_vector import Neo4jVector\n", 75 | "from langchain.graphs import Neo4jGraph\n", 76 | "from langchain.prompts import PromptTemplate\n", 77 | "from langchain.schema import StrOutputParser\n", 78 | "from langchain.schema.runnable import RunnableLambda\n", 79 | "import gradio as gr\n", 80 | "\n", 81 | "pd.set_option('display.max_rows', 10)\n", 82 | "pd.set_option('display.max_colwidth', 500)\n", 83 | "pd.set_option('display.width', 0)" 84 | ], 85 | "outputs": [], 86 | "execution_count": 8 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "collapsed": false, 92 | "id": "_ar1ZFhPdyXe" 93 | }, 94 | "source": [ 95 | "### Setup Credentials and Environment Variables\n", 96 | "\n", 97 | "There are two things you need here.\n", 98 | "1. Start a blank [Neo4j Sandbox](https://sandbox.neo4j.com/). Get your URI and password and plug them in below. Do not change the Neo4j username.\n", 99 | "2. Get your OpenAI API key. 
You can use [this one](https://docs.google.com/document/d/19Lqjd0MqRs088KUVnd23ZrVU9G0OAg-53U72VrFwwms/edit) if you do not have one already.\n", 100 | "\n", 101 | "To make this easy, you can write the credentials and env variables directly into the below cell." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "metadata": { 107 | "id": "BQ9s0ZWhekd8", 108 | "ExecuteTime": { 109 | "end_time": "2024-06-25T15:19:03.245545Z", 110 | "start_time": "2024-06-25T15:19:03.243207Z" 111 | } 112 | }, 113 | "source": [ 114 | "import os\n", 115 | "\n", 116 | "# Neo4j\n", 117 | "NEO4J_URI = 'bolt://34.202.229.218:7687' #change this\n", 118 | "NEO4J_PASSWORD = 'terminologies-fire-planet' #change this\n", 119 | "NEO4J_USERNAME = 'neo4j'\n", 120 | "AURA_DS = False\n", 121 | "\n", 122 | "# AI\n", 123 | "LLM = 'gpt-4o'\n", 124 | "\n", 125 | "# OpenAI - Required when using OpenAI models\n", 126 | "os.environ['OPENAI_API_KEY'] = 'sk-...' #change this\n", 127 | "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')" 128 | ], 129 | "outputs": [], 130 | "execution_count": 9 131 | }, 132 | { 133 | "cell_type": "code", 134 | "metadata": { 135 | "id": "o-98NuINdyXe", 136 | "ExecuteTime": { 137 | "end_time": "2024-06-25T15:19:28.737198Z", 138 | "start_time": "2024-06-25T15:19:28.731702Z" 139 | } 140 | }, 141 | "source": [ 142 | "# You can skip this cell if not using a ws.env file - alternative to above\n", 143 | "from dotenv import load_dotenv\n", 144 | "import os\n", 145 | "\n", 146 | "if os.path.exists('ws.env'):\n", 147 | " load_dotenv('ws.env', override=True)\n", 148 | "\n", 149 | " # Neo4j\n", 150 | " NEO4J_URI = os.getenv('NEO4J_URI')\n", 151 | " NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')\n", 152 | " NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')\n", 153 | " AURA_DS = eval(os.getenv('AURA_DS').title())\n", 154 | "\n", 155 | " # AI\n", 156 | " LLM = 'gpt-4o'\n", 157 | " OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')" 158 | ], 159 | "outputs": [], 160 | "execution_count": 14 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": { 165 | "collapsed": false, 166 | "id": "90E9HGu4dyXq" 167 | }, 168 | "source": [ 169 | "## LLM For Generating Grounded Content\n", 170 | "\n", 171 | "Let's use an LLM to automatically generate content for targeted marketing campaigns grounded with our knowledge graph using the above tools.\n", 172 | "Here is a quick example for generating promotional messages, but you can create all sorts of content with this!\n", 173 | "\n", 174 | "For our first message, let's consider a scenario where a user recently searched for products, but perhaps didn't commit to a purchase yet. We now want to send a message to promote relevant products." 
175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "P-06mvW-A59U", 181 | "ExecuteTime": { 182 | "end_time": "2024-06-25T15:19:29.806163Z", 183 | "start_time": "2024-06-25T15:19:29.771996Z" 184 | } 185 | }, 186 | "source": [ 187 | "from langchain_openai import OpenAIEmbeddings\n", 188 | "\n", 189 | "embedding_model = OpenAIEmbeddings()\n", 190 | "embedding_dimension = 1536" 191 | ], 192 | "outputs": [], 193 | "execution_count": 15 194 | }, 195 | { 196 | "cell_type": "code", 197 | "metadata": { 198 | "id": "JI9LVEdKekeH", 199 | "ExecuteTime": { 200 | "end_time": "2024-06-25T15:19:30.427489Z", 201 | "start_time": "2024-06-25T15:19:30.385238Z" 202 | } 203 | }, 204 | "source": [ 205 | "# Import relevant libraries\n", 206 | "from langchain.prompts import PromptTemplate\n", 207 | "from langchain_openai import ChatOpenAI\n", 208 | "from langchain.schema import StrOutputParser\n", 209 | "\n", 210 | "#Instantiate LLM\n", 211 | "llm = ChatOpenAI(temperature=0, model_name=LLM, streaming=True)" 212 | ], 213 | "outputs": [], 214 | "execution_count": 16 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": { 219 | "collapsed": false, 220 | "id": "8I6JesV0ekeH" 221 | }, 222 | "source": [ 223 | "### Create Knowledge Graph Stores for Retrieval\n", 224 | "\n", 225 | "To ground our content generation, we need to define retrievers to pull information from our knowledge graph. Let's make two stores:\n", 226 | "1. Personalized Search Retriever (`kg_personalized_search`): Based on recent customer searches and purchase history, pull relevant products.\n", 227 | "2. Recommendations retriever (`kg_recommendations_app`): Based on our Graph ML, what else can we recommend to them to pair with the relevant products?\n" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "metadata": { 233 | "id": "WLBBVRXwdyXq", 234 | "ExecuteTime": { 235 | "end_time": "2024-06-25T15:22:18.508903Z", 236 | "start_time": "2024-06-25T15:22:16.817976Z" 237 | } 238 | }, 239 | "source": [ 240 | "# We will use a mock URL for our sources in the metadata\n", 241 | "kg_personalized_search_store = Neo4jVector.from_existing_index(\n", 242 | " embedding=embedding_model,\n", 243 | " url=NEO4J_URI,\n", 244 | " username=NEO4J_USERNAME,\n", 245 | " password=NEO4J_PASSWORD,\n", 246 | " index_name='product_text_embeddings',\n", 247 | " retrieval_query=\"\"\"\n", 248 | " WITH node AS product, score AS searchScore\n", 249 | "\n", 250 | " OPTIONAL MATCH(product)<-[:VARIANT_OF]-(:Article)<-[:PURCHASED]-(:Customer)\n", 251 | " -[:PURCHASED]->(a:Article)<-[:PURCHASED]-(:Customer {customerId: $customerId})\n", 252 | " WITH count(a) AS purchaseScore, product, searchScore\n", 253 | " RETURN product.text + '\\nurl: ' + product.url AS text,\n", 254 | " (1.0+purchaseScore)*searchScore AS score,\n", 255 | " {source: product.url} AS metadata\n", 256 | " ORDER BY purchaseScore DESC, searchScore DESC LIMIT 10\n", 257 | " \"\"\"\n", 258 | ")\n", 259 | "\n", 260 | "# This will be a function so we can change per customer id\n", 261 | "def kg_personalized_search(search_prompt, customer_id, k=100):\n", 262 | " docs = kg_personalized_search_store.similarity_search(search_prompt, k, params={'customerId': customer_id})\n", 263 | " return \"\\n\\n\".join([d.page_content for d in docs])\n", 264 | "\n", 265 | "# Use the same personalized recommendations as above but with a smaller limit\n", 266 | "kg = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)\n", 267 | "def kg_recommendations_app(customer_id, 
k=30):\n", 268 | " res = kg.query(\"\"\"\n", 269 | " MATCH(:Customer {customerId:$customerId})-[:PURCHASED]->(:Article)\n", 270 | " -[r:CUSTOMERS_ALSO_LIKE]->(:Article)-[:VARIANT_OF]->(product)\n", 271 | " RETURN product.text + '\\nurl: ' + product.url AS text,\n", 272 | " sum(r.score) AS recommenderScore\n", 273 | " ORDER BY recommenderScore DESC LIMIT $k\n", 274 | " \"\"\", params={'customerId': customer_id, 'k':k})\n", 275 | "\n", 276 | " return \"\\n\\n\".join([d['text'] for d in res])" 277 | ], 278 | "outputs": [], 279 | "execution_count": 23 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": { 284 | "collapsed": false, 285 | "id": "i7sCt8roekeH" 286 | }, 287 | "source": [ 288 | "### Prompt Engineering\n", 289 | "\n", 290 | "Now let's define our prompt. We will accept multiple parameters and provide detailed instructions to the LLM to condition the response based of retrieved data, customer interests, and time of year.\n" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "metadata": { 296 | "id": "aUAROR6aekeI", 297 | "ExecuteTime": { 298 | "end_time": "2024-06-25T15:22:19.979575Z", 299 | "start_time": "2024-06-25T15:22:19.976711Z" 300 | } 301 | }, 302 | "source": [ 303 | "prompt = PromptTemplate.from_template('You are a personal assistant named Sally '\n", 304 | "'for a fashion, home, and beauty company called HRM.'\n", 305 | "'write an engaging email to {customerName}, one of your customers, '\n", 306 | "'to promote and summarize products relevant for them given: '\n", 307 | "'- The current season / time of year: {timeOfYear}'\n", 308 | "'- Recent searches/interests: {customerInterests}'\n", 309 | "'Please only mention the products listed below. '\n", 310 | "'Do not come up with or add any new products to the list.'\n", 311 | "'Each product comes with an https `url` field. '\n", 312 | "'Make sure to provide that https url with descriptive name text '\n", 313 | "'in markdown for each product.'\n", 314 | "'''\n", 315 | "\n", 316 | "# RelevantProducts:\n", 317 | "These are products from the HRM store the customer may be interested in based\n", 318 | "on their recent searches/interests: {customerInterests}\n", 319 | "{searchProds}\n", 320 | "\n", 321 | "# Customer May Also Be Interested In the following\n", 322 | "The below candidates are recommended based on the shared purchase patterns of\n", 323 | "other customers in the HRM database.\n", 324 | "Select the best 4 to 5 product subset from the context that best match the\n", 325 | "time of year: {timeOfYear} and to pair with the RelevantProducts above.\n", 326 | "For example, even if scarfs are listed here, they may not be appropriate for a\n", 327 | "summer time of year so best not to include those.\n", 328 | "{recProds}\n", 329 | "''')" 330 | ], 331 | "outputs": [], 332 | "execution_count": 24 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": { 337 | "collapsed": false, 338 | "id": "TgFbaUt6ekeI" 339 | }, 340 | "source": [ 341 | "### Create a Chain\n", 342 | "\n", 343 | "Now let's put a chain together that will leverage the retrievers, prompt, and LLM model. This is where Langchain shines, putting RAG together in a simple way.\n", 344 | "\n", 345 | "In addition to the personalized search and recommendations context, we will allow for some other parameters.\n", 346 | "\n", 347 | "1. `timeOfYear`: The time of year as a date, season, month, etc. so the LLM can tailor the language appropriately.\n", 348 | "2. 
`customerName`: Ordinarily, this can be pulled from the DB, but it has been scrubbed to maintain anonymity, so we will provide our own name here.\n", 349 | "\n", 350 | "You can potentially add other creative parameters here to help the LLM write relevant messages.\n" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "metadata": { 356 | "id": "nUpih07QdyXr", 357 | "ExecuteTime": { 358 | "end_time": "2024-06-25T15:22:21.862317Z", 359 | "start_time": "2024-06-25T15:22:21.857513Z" 360 | } 361 | }, 362 | "source": [ 363 | "chain = ({'searchProds': (lambda x:kg_personalized_search(x['customerInterests'], x['customerId'])),\n", 364 | " 'recProds': (lambda x:kg_recommendations_app(x['customerId'])),\n", 365 | " 'customerName': lambda x:x['customerName'],\n", 366 | " 'timeOfYear': lambda x:x['timeOfYear'],\n", 367 | " \"customerInterests\": lambda x:x['customerInterests']}\n", 368 | " | prompt\n", 369 | " | llm\n", 370 | " | StrOutputParser())" 371 | ], 372 | "outputs": [], 373 | "execution_count": 25 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": { 378 | "collapsed": false, 379 | "id": "JjbUGH6WekeI" 380 | }, 381 | "source": [ 382 | "### Example Runs" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "metadata": { 388 | "id": "EEdx6aTSA59V", 389 | "ExecuteTime": { 390 | "end_time": "2024-06-25T15:22:24.414241Z", 391 | "start_time": "2024-06-25T15:22:24.410766Z" 392 | } 393 | }, 394 | "source": [ 395 | "# example inputs\n", 396 | "CUSTOMER_ID = \"daae10780ecd14990ea190a1e9917da33fe96cd8cfa5e80b67b4600171aa77e0\"\n", 397 | "search_prompt = 'denim jeans'" 398 | ], 399 | "outputs": [], 400 | "execution_count": 26 401 | }, 402 | { 403 | "cell_type": "code", 404 | "metadata": { 405 | "id": "CCX-ut4LA59V", 406 | "ExecuteTime": { 407 | "end_time": "2024-06-25T15:22:24.810085Z", 408 | "start_time": "2024-06-25T15:22:24.806807Z" 409 | } 410 | }, 411 | "source": "#print(chain.invoke({'searchPrompt':search_prompt, 'customerId':CUSTOMER_ID, 'customerName':'Alex Smith', 'timeOfYear':'Feb, 2024'}))", 412 | "outputs": [], 413 | "execution_count": 27 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": { 418 | "id": "Vy3fKoC1E0CC" 419 | }, 420 | "source": [ 421 | "#### Inspecting the Prompt Sent to the LLM\n", 422 | "In the above run, the LLM should only be using results from our Neo4j database to populate recommendations. Run the below cell to see the final prompt that was sent to the LLM." 
423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "metadata": { 428 | "colab": { 429 | "base_uri": "https://localhost:8080/" 430 | }, 431 | "id": "Z7-yDDUaD6FD", 432 | "outputId": "2266a8c8-6a46-4103-da3a-9807edb6a1f4", 433 | "ExecuteTime": { 434 | "end_time": "2024-06-25T15:22:26.664378Z", 435 | "start_time": "2024-06-25T15:22:26.245954Z" 436 | } 437 | }, 438 | "source": [ 439 | "def format_final_prompt(x):\n", 440 | " return f'''=== Prompt to send to LLM ===\n", 441 | " {x.to_string()}\n", 442 | " === End Prompt ===\n", 443 | " '''\n", 444 | "chain_print_prompt = ({'searchProds': (lambda x:kg_personalized_search(x['customerInterests'], x['customerId'])),\n", 445 | " 'recProds': (lambda x:kg_recommendations_app(x['customerId'])),\n", 446 | " 'customerName': lambda x:x['customerName'],\n", 447 | " 'timeOfYear': lambda x:x['timeOfYear'],\n", 448 | " \"customerInterests\": lambda x:x['customerInterests']}\n", 449 | " | prompt\n", 450 | " | format_final_prompt\n", 451 | " | StrOutputParser())\n", 452 | "\n", 453 | "print( chain_print_prompt.invoke({\n", 454 | " 'customerInterests':search_prompt,\n", 455 | " 'customerId':CUSTOMER_ID,\n", 456 | " 'customerName':'Alex Smith',\n", 457 | " 'timeOfYear':'Feb, 2024'}))" 458 | ], 459 | "outputs": [ 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "=== Prompt to send to LLM ===\n", 465 | " You are a personal assistant named Sally for a fashion, home, and beauty company called HRM.write an engaging email to Alex Smith, one of your customers, to promote and summarize products relevant for them given: - The current season / time of year: Feb, 2024- Recent searches/interests: denim jeansPlease only mention the products listed below. Do not come up with or add any new products to the list.Each product comes with an https `url` field. 
Make sure to provide that https url with descriptive name text in markdown for each product.\n", 466 | "\n", 467 | "# RelevantProducts:\n", 468 | "These are products from the HRM store the customer may be interested in based\n", 469 | "on their recent searches/interests: denim jeans\n", 470 | "##Product\n", 471 | "Name: Rachel HW Denim TRS\n", 472 | "Type: Trousers\n", 473 | "Group: Garment Lower body\n", 474 | "Garment Type: Trousers\n", 475 | "Description: 5-pocket, ankle-length jeans in washed stretch denim in a relaxed fit with a high waist, zip fly and button and straight legs with cut-off, raw-edge hems.\n", 476 | "url: https://representative-domain/product/670698\n", 477 | "\n", 478 | "##Product\n", 479 | "Name: Jade HW Skinny Denim TRS\n", 480 | "Type: Trousers\n", 481 | "Group: Garment Lower body\n", 482 | "Garment Type: Trousers\n", 483 | "Description: High-waisted jeans in washed superstretch denim with a zip fly and button, fake front pockets, real back pockets and super-skinny legs.\n", 484 | "url: https://representative-domain/product/706016\n", 485 | "\n", 486 | "##Product\n", 487 | "Name: Bono NW slim denim\n", 488 | "Type: Trousers\n", 489 | "Group: Garment Lower body\n", 490 | "Garment Type: Trousers\n", 491 | "Description: 5-pocket, ankle-length jeans in washed slightly stretch denim with a high waist, zip fly and button and tapered legs.\n", 492 | "url: https://representative-domain/product/777038\n", 493 | "\n", 494 | "##Product\n", 495 | "Name: Perrie Slim Mom Denim TRS\n", 496 | "Type: Trousers\n", 497 | "Group: Garment Lower body\n", 498 | "Garment Type: Trousers\n", 499 | "Description: 5-pocket, ankle-length jeans in washed, sturdy cotton denim with a high waist, button fly and slim, straight legs with raw-edge hems.\n", 500 | "url: https://representative-domain/product/448509\n", 501 | "\n", 502 | "##Product\n", 503 | "Name: Jade Denim TRS\n", 504 | "Type: Trousers\n", 505 | "Group: Garment Lower body\n", 506 | "Garment Type: Trousers\n", 507 | "Description: High-waisted jeans in washed superstretch denim with a zip fly and button, fake front pockets, real back pockets and super-skinny legs.\n", 508 | "url: https://representative-domain/product/539723\n", 509 | "\n", 510 | "##Product\n", 511 | "Name: Mom Fit\n", 512 | "Type: Trousers\n", 513 | "Group: Garment Lower body\n", 514 | "Garment Type: Trousers Denim\n", 515 | "Description: 5-pocket, ankle-length jeans in washed, stretch cotton denim with an extra-high waist. Slightly looser fit with straight legs. 
The cotton content of the jeans is partly recycled.\n", 516 | "url: https://representative-domain/product/714790\n", 517 | "\n", 518 | "##Product\n", 519 | "Name: Jade Denim Petite Trs 1\n", 520 | "Type: Trousers\n", 521 | "Group: Garment Lower body\n", 522 | "Garment Type: Trousers\n", 523 | "Description: High-waisted jeans in washed superstretch denim with a zip fly and button, fake front pockets, real back pockets and skinny legs.\n", 524 | "url: https://representative-domain/product/673901\n", 525 | "\n", 526 | "##Product\n", 527 | "Name: Maja cargo Slim HW Denim\n", 528 | "Type: Trousers\n", 529 | "Group: Garment Lower body\n", 530 | "Garment Type: Trousers\n", 531 | "Description: Jeans in washed, stretch denim with a high waist, zip fly and button, fake front pockets, real back pockets, flap leg pockets with concealed press-studs, and slim legs.\n", 532 | "url: https://representative-domain/product/788575\n", 533 | "\n", 534 | "##Product\n", 535 | "Name: Jones Denim Slim Shorts\n", 536 | "Type: Shorts\n", 537 | "Group: Garment Lower body\n", 538 | "Garment Type: Shorts\n", 539 | "Description: Long, 5-pocket denim shorts with a regular waist, zip fly and button, slightly lower crotch and slim legs.\n", 540 | "url: https://representative-domain/product/478992\n", 541 | "\n", 542 | "##Product\n", 543 | "Name: Perrie Fancy Denim TRS\n", 544 | "Type: Trousers\n", 545 | "Group: Garment Lower body\n", 546 | "Garment Type: Trousers\n", 547 | "Description: 5-pocket, ankle-length jeans in washed denim with decorative V-shaped seams at the top. Slightly looser fit with an extra high waist, zip fly and button and tapered legs.\n", 548 | "url: https://representative-domain/product/779659\n", 549 | "\n", 550 | "# Customer May Also Be Interested In the following\n", 551 | "The below candidates are recommended based on the shared purchase patterns of\n", 552 | "other customers in the HRM database.\n", 553 | "Select the best 4 to 5 product subset from the context that best match the\n", 554 | "time of year: Feb, 2024 and to pair with the RelevantProducts above.\n", 555 | "For example, even if scarfs are listed here, they may not be appropriate for a\n", 556 | "summer time of year so best not to include those.\n", 557 | "##Product\n", 558 | "Name: DONT USE ROLAND HOOD\n", 559 | "Type: Hoodie\n", 560 | "Group: Garment Upper body\n", 561 | "Garment Type: Jersey Basic\n", 562 | "Description: Top in sweatshirt fabric with a lined drawstring hood, kangaroo pocket, long raglan sleeves and ribbing at the cuffs and hem.\n", 563 | "url: https://representative-domain/product/569974\n", 564 | "\n", 565 | "##Product\n", 566 | "Name: PASTRY SWEATER\n", 567 | "Type: Sweater\n", 568 | "Group: Garment Upper body\n", 569 | "Garment Type: Knitwear\n", 570 | "Description: Jumper in soft, textured-knit cotton with long raglan sleeves and ribbing around the neckline, cuffs and hem.\n", 571 | "url: https://representative-domain/product/656401\n", 572 | "\n", 573 | "##Product\n", 574 | "Name: Skinny RW Ankle Milo Zip\n", 575 | "Type: Trousers\n", 576 | "Group: Garment Lower body\n", 577 | "Garment Type: Trousers Denim\n", 578 | "Description: 5-pocket, ankle-length jeans in washed stretch denim with hard-worn details, a regular waist, zip fly and button, and skinny legs with a zip at the hems. 
The jeans are made partly from recycled cotton.\n", 579 | "url: https://representative-domain/product/682848\n", 580 | "\n", 581 | "##Product\n", 582 | "Name: Haven back detail\n", 583 | "Type: Bra\n", 584 | "Group: Underwear\n", 585 | "Garment Type: Under-, Nightwear\n", 586 | "Description: Push-up bra in lace and mesh with underwired, moulded, padded cups for a larger bust and fuller cleavage. Lace racer back, narrow adjustable shoulder straps, a wide mesh strap at the back and a metal fastener at the front.\n", 587 | "url: https://representative-domain/product/660519\n", 588 | "\n", 589 | "##Product\n", 590 | "Name: Bubble Bum Bandeau (1)\n", 591 | "Type: Bikini top\n", 592 | "Group: Swimwear\n", 593 | "Garment Type: Swimwear\n", 594 | "Description: Fully lined bandeau bikini top with padded cups and removable inserts. Detachable ties at the back of the neck, ties at the back, side support and a silicone trim at the top.\n", 595 | "url: https://representative-domain/product/642498\n", 596 | "\n", 597 | "##Product\n", 598 | "Name: Dixie tee\n", 599 | "Type: T-shirt\n", 600 | "Group: Garment Upper body\n", 601 | "Garment Type: Jersey Fancy\n", 602 | "Description: Short top in soft cotton jersey with short sleeves. Contrasting colour trims around the neckline and sleeves.\n", 603 | "url: https://representative-domain/product/598806\n", 604 | "\n", 605 | "##Product\n", 606 | "Name: Rylee flatform\n", 607 | "Type: Heeled sandals\n", 608 | "Group: Shoes\n", 609 | "Garment Type: Shoes\n", 610 | "Description: Sandals with imitation suede straps, an elastic heel strap and wedge heels. Satin insoles and thermoplastic rubber (TPR) soles. Platform front 2 cm, heel 6 cm.\n", 611 | "url: https://representative-domain/product/606711\n", 612 | "\n", 613 | "##Product\n", 614 | "Name: Eden SP Andes\n", 615 | "Type: Bra\n", 616 | "Group: Underwear\n", 617 | "Garment Type: Under-, Nightwear\n", 618 | "Description: Super push-up bra in lace with underwired, thickly padded cups to maximise the bust and create a fuller cleavage. Adjustable shoulder straps, a racer back and metal front fastenings.\n", 619 | "url: https://representative-domain/product/640129\n", 620 | "\n", 621 | "##Product\n", 622 | "Name: Silver lake\n", 623 | "Type: Sweater\n", 624 | "Group: Garment Upper body\n", 625 | "Garment Type: Knitwear\n", 626 | "Description: Purl-knit jumper in a cotton blend with a slightly wider neckline and 3/4-length sleeves.\n", 627 | "url: https://representative-domain/product/244267\n", 628 | "\n", 629 | "##Product\n", 630 | "Name: Leona Push Mirny\n", 631 | "Type: Bra\n", 632 | "Group: Underwear\n", 633 | "Garment Type: Under-, Nightwear\n", 634 | "Description: Push-up bra in lace and mesh with underwired, moulded, padded cups for a larger bust and fuller cleavage. 
Lace racer back, narrow adjustable shoulder straps, a wide mesh strap at the back and metal fastener at the front.\n", 635 | "url: https://representative-domain/product/511924\n", 636 | "\n", 637 | "##Product\n", 638 | "Name: Karin headband\n", 639 | "Type: Hairband\n", 640 | "Group: Accessories\n", 641 | "Garment Type: Accessories\n", 642 | "Description: Wide hairband in cotton jersey with a twisted detail.\n", 643 | "url: https://representative-domain/product/620425\n", 644 | "\n", 645 | "##Product\n", 646 | "Name: Survivor\n", 647 | "Type: Blouse\n", 648 | "Group: Garment Upper body\n", 649 | "Garment Type: Blouses\n", 650 | "Description: Straight-cut blouse in a crêpe weave with a collar, concealed buttons down the front and fake flap front pockets. Yoke with a pleat at the back, long sleeves with pleats and buttoned cuffs, and a straight cut hem with slits in the sides.\n", 651 | "url: https://representative-domain/product/662328\n", 652 | "\n", 653 | "##Product\n", 654 | "Name: Rosemary\n", 655 | "Type: Dress\n", 656 | "Group: Garment Full body\n", 657 | "Garment Type: Dresses Ladies\n", 658 | "Description: Short dress in woven fabric with 3/4-length sleeves with an opening and ties at the cuffs, and a gently rounded hem. Unlined.\n", 659 | "url: https://representative-domain/product/753724\n", 660 | "\n", 661 | "##Product\n", 662 | "Name: Petar Sweater(1)\n", 663 | "Type: Sweater\n", 664 | "Group: Garment Upper body\n", 665 | "Garment Type: Jersey Basic\n", 666 | "Description: Oversized top in sturdy sweatshirt fabric with dropped shoulders and ribbing around the neckline, cuffs and hem. Soft brushed inside.\n", 667 | "url: https://representative-domain/product/557247\n", 668 | "\n", 669 | "##Product\n", 670 | "Name: Lead Superskinny\n", 671 | "Type: Trousers\n", 672 | "Group: Garment Lower body\n", 673 | "Garment Type: Trousers\n", 674 | "Description: Chinos in stretch twill with a zip fly and button, side pockets, welt back pockets and skinny legs.\n", 675 | "url: https://representative-domain/product/731142\n", 676 | "\n", 677 | "##Product\n", 678 | "Name: Banks\n", 679 | "Type: Hoodie\n", 680 | "Group: Garment Upper body\n", 681 | "Garment Type: Jersey Basic\n", 682 | "Description: Long-sleeved top in sweatshirt fabric made from a cotton blend with a double-layered hood, gently dropped shoulders and ribbing at the cuffs and hem. Soft brushed inside.\n", 683 | "url: https://representative-domain/product/752193\n", 684 | "\n", 685 | "##Product\n", 686 | "Name: Malte r-neck\n", 687 | "Type: Sweater\n", 688 | "Group: Garment Upper body\n", 689 | "Garment Type: Knitwear\n", 690 | "Description: Jumper in soft, patterned, fine-knit cotton with ribbing around the neckline, cuffs and hem.\n", 691 | "url: https://representative-domain/product/713577\n", 692 | "\n", 693 | "##Product\n", 694 | "Name: Lassy Linnen LS\n", 695 | "Type: Sweater\n", 696 | "Group: Garment Upper body\n", 697 | "Garment Type: Jersey Basic\n", 698 | "Description: Long-sleeved top in airy linen jersey.\n", 699 | "url: https://representative-domain/product/531615\n", 700 | "\n", 701 | "##Product\n", 702 | "Name: Gwen Jersey Top\n", 703 | "Type: Vest top\n", 704 | "Group: Garment Upper body\n", 705 | "Garment Type: Dresses Ladies\n", 706 | "Description: Fitted top in stretch jersey with a slight sheen. 
V-neck with a lace trim at the top and adjustable spaghetti straps.\n", 707 | "url: https://representative-domain/product/671852\n", 708 | "\n", 709 | "##Product\n", 710 | "Name: FF Kate dress PI\n", 711 | "Type: Dress\n", 712 | "Group: Garment Full body\n", 713 | "Garment Type: Special Offers\n", 714 | "Description: Short dress in patterned stretch jersey with a round neckline, gathered seam at the waist and long raglan sleeves with gathered seams at the front. The polyester content of the dress is recycled.\n", 715 | "url: https://representative-domain/product/796240\n", 716 | "\n", 717 | "##Product\n", 718 | "Name: Brad LW BF Denim TRS\n", 719 | "Type: Trousers\n", 720 | "Group: Garment Lower body\n", 721 | "Garment Type: Trousers\n", 722 | "Description: 5-pocket, low-rise jeans in washed denim with hard-worn details, a zip fly and button, and slightly wider, tapered legs.\n", 723 | "url: https://representative-domain/product/615970\n", 724 | "\n", 725 | "##Product\n", 726 | "Name: Baby shark top\n", 727 | "Type: Bikini top\n", 728 | "Group: Swimwear\n", 729 | "Garment Type: Swimwear\n", 730 | "Description: Lined, non-wired bikini top with flounces. Adjustable shoulder straps, cups with removable inserts that shape the bust and provide good support, and a metal fastener at the back.\n", 731 | "url: https://representative-domain/product/861410\n", 732 | "\n", 733 | "##Product\n", 734 | "Name: Belle PU skirt\n", 735 | "Type: Skirt\n", 736 | "Group: Garment Lower body\n", 737 | "Garment Type: Skirts\n", 738 | "Description: Flared, calf-length skirt in imitation leather. High waist with press-studs and a concealed zip at one side, and visible seams front and back. Unlined.\n", 739 | "url: https://representative-domain/product/856232\n", 740 | "\n", 741 | "##Product\n", 742 | "Name: Burcu Styling Scarf\n", 743 | "Type: Scarf\n", 744 | "Group: Accessories\n", 745 | "Garment Type: Accessories\n", 746 | "Description: Scarf in soft, patterned satin.\n", 747 | "url: https://representative-domain/product/772565\n", 748 | "\n", 749 | "##Product\n", 750 | "Name: Girlfriend R.W Trash\n", 751 | "Type: Trousers\n", 752 | "Group: Garment Lower body\n", 753 | "Garment Type: Trousers Denim\n", 754 | "Description: 5-pocket, ankle-length jeans in washed denim with hard-worn details in a slightly looser fit. Regular waist, zip fly and button, slightly lower crotch and tapered legs with raw-edge hems.\n", 755 | "url: https://representative-domain/product/724904\n", 756 | "\n", 757 | "##Product\n", 758 | "Name: Derek\n", 759 | "Type: Dress\n", 760 | "Group: Garment Full body\n", 761 | "Garment Type: Dresses Ladies\n", 762 | "Description: Calf-length dress in woven fabric with a collar, long sleeves and wide cuffs with a slit. Narrow elasticated seam at the waist, a pleated skirt and laser-cut hem. Unlined.\n", 763 | "url: https://representative-domain/product/706366\n", 764 | "\n", 765 | "##Product\n", 766 | "Name: BISCUIT\n", 767 | "Type: Sweater\n", 768 | "Group: Garment Upper body\n", 769 | "Garment Type: Knitwear\n", 770 | "Description: Long-sleeved jumper in a soft, fine knit with ribbing around the neckline, cuffs and hem.\n", 771 | "url: https://representative-domain/product/657852\n", 772 | "\n", 773 | "##Product\n", 774 | "Name: ROLAND SLIM FIT HOOD\n", 775 | "Type: Hoodie\n", 776 | "Group: Garment Upper body\n", 777 | "Garment Type: Jersey Basic\n", 778 | "Description: Top in lightweight sweatshirt fabric with ribbed side panels and a kangaroo pocket. 
Jersey-lined hood with a wrapover front, and ribbing at the cuffs and hem. Brushed inside. Slim Fit.\n", 779 | "url: https://representative-domain/product/667769\n", 780 | "\n", 781 | "##Product\n", 782 | "Name: BRUNO PIQUE HOOD\n", 783 | "Type: Hoodie\n", 784 | "Group: Garment Upper body\n", 785 | "Garment Type: Jersey Basic\n", 786 | "Description: Jacket in cotton piqué with a lined, drawstring hood, zip down the front, side pockets and ribbing at the cuffs and hem. Regular fit.\n", 787 | "url: https://representative-domain/product/622370\n", 788 | "\n", 789 | "##Product\n", 790 | "Name: Fav regular polo(1)\n", 791 | "Type: Polo shirt\n", 792 | "Group: Garment Upper body\n", 793 | "Garment Type: Jersey Fancy\n", 794 | "Description: Polo shirt in cotton piqué with a ribbed collar, button placket, short sleeves with ribbed trims, and slits in the sides.\n", 795 | "url: https://representative-domain/product/816759\n", 796 | "\n", 797 | " === End Prompt ===\n", 798 | "    \n" 799 | ] 800 | } 801 | ], 802 | "execution_count": 28 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "metadata": { 807 | "collapsed": false, 808 | "id": "8G_vdFviekeI" 809 | }, 810 | "source": [ 811 | "Feel free to experiment and try more!" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "metadata": { 817 | "id": "qeOts3Q4ZACL", 818 | "ExecuteTime": { 819 | "end_time": "2024-06-25T15:22:28.848411Z", 820 | "start_time": "2024-06-25T15:22:28.846498Z" 821 | } 822 | }, 823 | "source": "#print(chain.invoke({'customerInterests':\"western boots\", 'customerId':CUSTOMER_ID, 'customerName':'Alex Smith', 'timeOfYear':'July, 2024'}))", 824 | "outputs": [], 825 | "execution_count": 29 826 | }, 827 | { 828 | "cell_type": "markdown", 829 | "metadata": { 830 | "collapsed": false, 831 | "id": "1IU_gedrekeI" 832 | }, 833 | "source": [ 834 | "### Demo App\n", 835 | "Now let’s use the above tools to create a demo app with Gradio. We will need to make a couple more functions, but otherwise it's easy to fire up from a notebook!"
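Before wiring the chain into a UI, one optional safeguard: since the prompt forbids inventing products, you can mechanically check that every product link in a generated email actually appeared in the retrieved context. The sketch below assumes you keep the two retrieved context strings around; `audit_message` is a hypothetical helper, not part of the workshop code:

```python
import re

# Hypothetical helper (illustrative only): flag any product URLs in the
# generated email that never appeared in the retrieved context strings.
URL_PATTERN = re.compile(r'https://representative-domain/product/\d+')

def audit_message(message: str, *context_strings: str) -> set:
    allowed = set()
    for ctx in context_strings:
        allowed.update(URL_PATTERN.findall(ctx))
    # Anything left over is a link the LLM invented rather than retrieved.
    return set(URL_PATTERN.findall(message)) - allowed

# Example usage (assuming `msg` came from chain.invoke and the two contexts
# from the kg_personalized_search / kg_recommendations_app retrievers):
# leaked = audit_message(msg, search_context, rec_context)
# assert not leaked, f"LLM invented product links: {leaked}"
```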
836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "metadata": { 841 | "id": "A1F0ve3cekeI", 842 | "ExecuteTime": { 843 | "end_time": "2024-06-25T15:22:30.272583Z", 844 | "start_time": "2024-06-25T15:22:30.269709Z" 845 | } 846 | }, 847 | "source": [ 848 | "# create multiple demo examples to try\n", 849 | "examples = [\n", 850 | " [\n", 851 | " CUSTOMER_ID,\n", 852 | " 'June, 2024',\n", 853 | " 'Alex Smith',\n", 854 | " 'denim jeans'\n", 855 | " ],\n", 856 | " [\n", 857 | " CUSTOMER_ID,\n", 858 | " 'July, 2024',\n", 859 | " 'Alex Smith',\n", 860 | " 'western boots'\n", 861 | " ],\n", 862 | " [\n", 863 | " '819f4eab1fd76b932fd403ae9f427de8eb9c5b64411d763bb26b5c8c3c30f16f',\n", 864 | " 'June, 2024',\n", 865 | " 'Robin Fischer',\n", 866 | " 'denim jeans'\n", 867 | " ],\n", 868 | " [\n", 869 | " '44b0898ecce6cc1268dfdb0f91e053db014b973f67e34ed8ae28211410910693',\n", 870 | " 'Feb, 2024',\n", 871 | " 'Chris Johnson',\n", 872 | " 'Oversized Sweaters'\n", 873 | " ],\n", 874 | " [\n", 875 | " '819f4eab1fd76b932fd403ae9f427de8eb9c5b64411d763bb26b5c8c3c30f16f',\n", 876 | " 'Feb, 2024',\n", 877 | " 'Robin Fischer',\n", 878 | " 'denim jeans'\n", 879 | " ],\n", 880 | " [\n", 881 | " CUSTOMER_ID,\n", 882 | " 'Feb, 2024',\n", 883 | " 'Alex Smith',\n", 884 | " 'oversized sweaters'\n", 885 | " ],\n", 886 | "]" 887 | ], 888 | "outputs": [], 889 | "execution_count": 30 890 | }, 891 | { 892 | "cell_type": "code", 893 | "metadata": { 894 | "colab": { 895 | "base_uri": "https://localhost:8080/", 896 | "height": 626 897 | }, 898 | "id": "XsBcFQLlekeI", 899 | "outputId": "ade42c47-69ba-466c-93a6-c7701448db27", 900 | "ExecuteTime": { 901 | "end_time": "2024-06-25T15:24:22.974423Z", 902 | "start_time": "2024-06-25T15:22:31.116344Z" 903 | } 904 | }, 905 | "source": [ 906 | "import gradio as gr\n", 907 | "\n", 908 | "def message_generator(*x):\n", 909 | " return chain.invoke({'customerInterests':x[3],\n", 910 | " 'customerId':x[0],\n", 911 | " 'customerName':x[2],\n", 912 | " 'timeOfYear': x[1]})\n", 913 | "\n", 914 | "customer_id = gr.Textbox(value=CUSTOMER_ID, label=\"Customer ID\")\n", 915 | "time_of_year = gr.Textbox(value=\"June, 2024\", label=\"Time Of Year\")\n", 916 | "search_prompt_txt = gr.Textbox(value='denim jeans', label=\"Customer Interest(s)\")\n", 917 | "customer_name = gr.Textbox(value='Alex Smith', label=\"Customer Name\")\n", 918 | "message_result = gr.Markdown(label=\"Message\")\n", 919 | "\n", 920 | "demo = gr.Interface(fn=message_generator,\n", 921 | " inputs=[customer_id, time_of_year, customer_name, search_prompt_txt],\n", 922 | " outputs=message_result,\n", 923 | " examples=examples,\n", 924 | " title=\"🪄 Message Generator 🥳\")\n", 925 | "\n", 926 | "if os.getenv('AUTOMATED_RUN') != \"true\":\n", 927 | " demo.launch(share=True, debug=True)" 928 | ], 929 | "outputs": [ 930 | { 931 | "name": "stdout", 932 | "output_type": "stream", 933 | "text": [ 934 | "Running on local URL: http://127.0.0.1:7861\n", 935 | "IMPORTANT: You are using gradio version 4.20.0, however version 4.29.0 is available, please upgrade.\n", 936 | "--------\n", 937 | "\n", 938 | "Could not create share link. Missing file: /Users/zachblumenfeld/opt/anaconda3/envs/genai-workshop/lib/python3.10/site-packages/gradio/frpc_darwin_arm64_v0.2. \n", 939 | "\n", 940 | "Please check your internet connection. This can happen if your antivirus software blocks the download of this file. You can install manually by following these steps: \n", 941 | "\n", 942 | "1. 
Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.2/frpc_darwin_arm64\n", 943 | "2. Rename the downloaded file to: frpc_darwin_arm64_v0.2\n", 944 | "3. Move the file to this location: /Users/zachblumenfeld/opt/anaconda3/envs/genai-workshop/lib/python3.10/site-packages/gradio\n" 945 | ] 946 | }, 947 | { 948 | "name": "stderr", 949 | "output_type": "stream", 950 | "text": [ 951 | "\n" 952 | ] 953 | }, 954 | { 955 | "data": { 956 | "text/plain": [ 957 | "" 958 | ], 959 | "text/html": [ 960 | "
" 961 | ] 962 | }, 963 | "metadata": {}, 964 | "output_type": "display_data" 965 | }, 966 | { 967 | "name": "stdout", 968 | "output_type": "stream", 969 | "text": [ 970 | "Keyboard interruption in main thread... closing server.\n", 971 | "Killing tunnel 127.0.0.1:7861 <> None\n" 972 | ] 973 | }, 974 | { 975 | "data": { 976 | "text/plain": [] 977 | }, 978 | "execution_count": 31, 979 | "metadata": {}, 980 | "output_type": "execute_result" 981 | } 982 | ], 983 | "execution_count": 31 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": null, 988 | "metadata": { 989 | "id": "z_qwUiTZA59W" 990 | }, 991 | "outputs": [], 992 | "source": [] 993 | } 994 | ], 995 | "metadata": { 996 | "colab": { 997 | "provenance": [], 998 | "toc_visible": true 999 | }, 1000 | "kernelspec": { 1001 | "display_name": "Python 3", 1002 | "language": "python", 1003 | "name": "python3" 1004 | }, 1005 | "language_info": { 1006 | "codemirror_mode": { 1007 | "name": "ipython", 1008 | "version": 2 1009 | }, 1010 | "file_extension": ".py", 1011 | "mimetype": "text/x-python", 1012 | "name": "python", 1013 | "nbconvert_exporter": "python", 1014 | "pygments_lexer": "ipython2", 1015 | "version": "2.7.6" 1016 | } 1017 | }, 1018 | "nbformat": 4, 1019 | "nbformat_minor": 0 1020 | } 1021 | -------------------------------------------------------------------------------- /customers-and-products/hm-bloom-perspective.json: -------------------------------------------------------------------------------- 1 | {"name":"Untitled Perspective 1","id":"2b307f20-ce78-11ee-8b4f-a726a64e6164","categories":[{"id":1,"name":"Department","labels":["Department"],"properties":[{"name":"departmentNo","exclude":false,"dataType":"bigint"},{"name":"departmentName","exclude":false,"dataType":"string"},{"name":"sectionNo","exclude":false,"dataType":"bigint"},{"name":"sectionName","exclude":false,"dataType":"string"},{"name":"name","exclude":false,"dataType":"string"},{"name":"indexes","exclude":false,"dataType":"array"},{"name":"constraints","exclude":false,"dataType":"array"}],"createdAt":1708272603089,"lastEditedAt":1708273191607,"color":"#F16667","size":1,"icon":"78220CE7-2EF9-468C-840D-6DCE01798C55","captionKeys":[],"captions":[{"inTooltip":false,"styles":[],"isCaption":false,"key":"Department","type":"label"},{"inTooltip":true,"styles":[],"isCaption":true,"key":"departmentName","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"departmentNo","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"sectionName","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"sectionNo","type":"property","isGdsData":false}],"textSize":1,"textAlign":"top","styleRules":[]},{"id":2,"name":"Product","labels":["Product"],"properties":[{"name":"productCode","exclude":false,"dataType":"bigint"},{"name":"prodName","exclude":false,"dataType":"string"},{"name":"productTypeName","exclude":false,"dataType":"string"},{"name":"productTypeNo","exclude":false,"dataType":"bigint"},{"name":"productGroupName","exclude":false,"dataType":"string"},{"name":"garmentGroupNo","exclude":false,"dataType":"bigint"},{"name":"garmentGroupName","exclude":false,"dataType":"string"},{"name":"detailDesc","exclude":false,"dataType":"string"},{"name":"text","exclude":false,"dataType":"string"},{"name":"textEmbedding","exclude":false,"dataType":"array"},{"name":"name","exclude":false,"dataType":"string"},{"name":"indexes","exclude":false,"dataType":"array"},{"name":"constrai
nts","exclude":false,"dataType":"array"}],"createdAt":1708272603089,"lastEditedAt":1708273191608,"color":"#C990C0","size":1,"icon":"BAFA5E62-A381-4DD8-A74E-8525A8FC0DA3","captionKeys":[],"captions":[{"inTooltip":false,"styles":[],"isCaption":false,"key":"Product","type":"label"},{"inTooltip":false,"styles":[],"isCaption":false,"key":"detailDesc","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"garmentGroupName","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"garmentGroupNo","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"prodName","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"productCode","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"productGroupName","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":true,"key":"productTypeName","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"productTypeNo","type":"property","isGdsData":false},{"inTooltip":true,"styles":[],"isCaption":false,"key":"text","type":"property","isGdsData":false},{"inTooltip":true,"styles":[],"isCaption":false,"key":"textEmbedding","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"constraints","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"indexes","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"name","type":"property","isGdsData":false}],"textSize":1,"textAlign":"top","styleRules":[]},{"id":3,"name":"Article","labels":["Article"],"properties":[{"name":"articleId","exclude":false,"dataType":"bigint"},{"name":"prodName","exclude":false,"dataType":"string"},{"name":"productTypeName","exclude":false,"dataType":"string"},{"name":"graphicalAppearanceNo","exclude":false,"dataType":"bigint"},{"name":"graphicalAppearanceName","exclude":false,"dataType":"string"},{"name":"colourGroupCode","exclude":false,"dataType":"bigint"},{"name":"colourGroupName","exclude":false,"dataType":"string"},{"name":"embedding","exclude":false,"dataType":"array"},{"name":"name","exclude":false,"dataType":"string"},{"name":"indexes","exclude":false,"dataType":"array"},{"name":"constraints","exclude":false,"dataType":"array"}],"createdAt":1708272603089,"lastEditedAt":1708273191608,"color":"#F79767","size":1,"icon":"5353F84C-A17C-4745-8534-839AE4A5CD12","captionKeys":[],"captions":[{"inTooltip":false,"styles":[],"isCaption":false,"key":"Article","type":"label"},{"inTooltip":false,"styles":[],"isCaption":false,"key":"articleId","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"colourGroupCode","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"colourGroupName","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"embedding","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"graphicalAppearanceName","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"graphicalAppearanceNo","type":"property","isGdsData":false},{"inTooltip":true,"styles":[],"isCaption":true,"key":"prodName","type":"property","isGdsData":false},{"inTooltip":true,"styles":[],"isCaption":false,"key":"productTypeName","type":"property","isGdsData":false},{"inTooltip":false,
"styles":[],"isCaption":false,"key":"constraints","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"indexes","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"name","type":"property","isGdsData":false}],"textSize":1,"textAlign":"top","styleRules":[]},{"id":4,"name":"Customer","labels":["Customer"],"properties":[{"name":"customerId","exclude":false,"dataType":"string"},{"name":"fn","exclude":false,"dataType":"number"},{"name":"active","exclude":false,"dataType":"number"},{"name":"clubMemberStatus","exclude":false,"dataType":"string"},{"name":"fashionNewsFrequency","exclude":false,"dataType":"string"},{"name":"age","exclude":false,"dataType":"number"},{"name":"postalCode","exclude":false,"dataType":"string"},{"name":"name","exclude":false,"dataType":"string"},{"name":"indexes","exclude":false,"dataType":"array"},{"name":"constraints","exclude":false,"dataType":"array"}],"createdAt":1708272603089,"lastEditedAt":1708273191609,"color":"#4C8EDA","size":1,"icon":"DB188874-D25F-4B34-A296-E5E950072319","captionKeys":[],"captions":[{"inTooltip":false,"styles":[],"isCaption":true,"key":"Customer","type":"label"},{"inTooltip":false,"styles":[],"isCaption":false,"key":"active","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"age","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"clubMemberStatus","type":"property","isGdsData":false},{"inTooltip":true,"styles":[],"isCaption":false,"key":"customerId","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"fashionNewsFrequency","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"fn","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"postalCode","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"constraints","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"indexes","type":"property","isGdsData":false},{"inTooltip":false,"styles":[],"isCaption":false,"key":"name","type":"property","isGdsData":false}],"textSize":1,"textAlign":"top","styleRules":[]}],"labels":{"Department":[{"propertyKey":"departmentNo","type":"Department","dataType":"bigint"},{"propertyKey":"departmentName","type":"Department","dataType":"string"},{"propertyKey":"sectionNo","type":"Department","dataType":"bigint"},{"propertyKey":"sectionName","type":"Department","dataType":"string"}],"Product":[{"propertyKey":"productCode","type":"Product","dataType":"bigint"},{"propertyKey":"prodName","type":"Product","dataType":"string"},{"propertyKey":"productTypeName","type":"Product","dataType":"string"},{"propertyKey":"productTypeNo","type":"Product","dataType":"bigint"},{"propertyKey":"productGroupName","type":"Product","dataType":"string"},{"propertyKey":"garmentGroupNo","type":"Product","dataType":"bigint"},{"propertyKey":"garmentGroupName","type":"Product","dataType":"string"},{"propertyKey":"detailDesc","type":"Product","dataType":"string"},{"propertyKey":"text","type":"Product","dataType":"string"},{"propertyKey":"textEmbedding","type":"Product","dataType":"array"}],"Article":[{"propertyKey":"articleId","type":"Article","dataType":"bigint"},{"propertyKey":"prodName","type":"Article","dataType":"string"},{"propertyKey":"productTypeName","type":"Article","dataType":"string"},{"propertyKey":"graphicalAppearanceNo","type":"Articl
e","dataType":"bigint"},{"propertyKey":"graphicalAppearanceName","type":"Article","dataType":"string"},{"propertyKey":"colourGroupCode","type":"Article","dataType":"bigint"},{"propertyKey":"colourGroupName","type":"Article","dataType":"string"},{"propertyKey":"embedding","type":"Article","dataType":"array"}],"Customer":[{"propertyKey":"customerId","type":"Customer","dataType":"string"},{"propertyKey":"fn","type":"Customer","dataType":"number"},{"propertyKey":"active","type":"Customer","dataType":"number"},{"propertyKey":"clubMemberStatus","type":"Customer","dataType":"string"},{"propertyKey":"fashionNewsFrequency","type":"Customer","dataType":"string"},{"propertyKey":"age","type":"Customer","dataType":"number"},{"propertyKey":"postalCode","type":"Customer","dataType":"string"}]},"relationshipTypes":[{"properties":[{"dataType":"string","propertyKey":"name","type":"FROM_DEPARTMENT"}],"name":"FROM_DEPARTMENT","id":"FROM_DEPARTMENT","color":"#848484","size":1,"captionKeys":[],"captions":[{"inTooltip":true,"isCaption":true,"styles":[],"key":"FROM_DEPARTMENT","type":"relationship"}],"textSize":1,"textAlign":"top","styleRules":[]},{"properties":[{"dataType":"string","propertyKey":"name","type":"VARIANT_OF"}],"name":"VARIANT_OF","id":"VARIANT_OF","color":"#848484","size":1,"captionKeys":[],"captions":[{"inTooltip":true,"isCaption":true,"styles":[],"key":"VARIANT_OF","type":"relationship"}],"textSize":1,"textAlign":"top","styleRules":[]},{"properties":[{"propertyKey":"txId","type":"PURCHASED","dataType":"bigint"},{"propertyKey":"tDat","type":"PURCHASED","dataType":"Date"},{"propertyKey":"price","type":"PURCHASED","dataType":"number"},{"propertyKey":"salesChannelId","type":"PURCHASED","dataType":"bigint"},{"dataType":"string","propertyKey":"name","type":"PURCHASED"}],"name":"PURCHASED","id":"PURCHASED","color":"#848484","size":1,"captionKeys":[],"captions":[{"inTooltip":true,"isCaption":true,"styles":[],"key":"PURCHASED","type":"relationship"}],"textSize":1,"textAlign":"top","styleRules":[]},{"properties":[{"dataType":"string","propertyKey":"name","type":"CUSTOMERS_ALSO_LIKE"}],"name":"CUSTOMERS_ALSO_LIKE","id":"CUSTOMERS_ALSO_LIKE","color":"#848484","size":1,"captionKeys":[],"captions":[{"inTooltip":true,"isCaption":true,"styles":[],"key":"CUSTOMERS_ALSO_LIKE","type":"relationship"}],"textSize":1,"textAlign":"top","styleRules":[]}],"palette":{"colors":["#FFE081","#C990C0","#F79767","#57C7E3","#F16667","#D9C8AE","#8DCC93","#ECB5C9","#4C8EDA","#FFC454","#DA7194","#569480","#848484","#D9D9D9"],"currentIndex":4},"createdAt":1708272601106,"lastEditedAt":1708272601106,"templates":[{"name":"display the graph schema","id":"tmpl:1708273082456","createdAt":1708273082456,"text":"Show Schema","cypher":"CALL db.schema.visualization() YIELD nodes, relationships\nunwind nodes as node\nunwind relationships as rel\nreturn node, rel","isUpdateQuery":null,"params":[],"hasCypherErrors":false},{"name":"Search phrase that returns a sample of your data","id":"tmpl:1708272603102","createdAt":1708272603102,"text":"Show me a graph","cypher":"//MATCH (n) OPTIONAL MATCH p=(n)--() RETURN p, n LIMIT 100\nMATCH (p:Product)<-[v:VARIANT_OF]-(a:Article)<-[t:PURCHASED]-(c:Customer)\nRETURN * LIMIT 
150","isUpdateQuery":null,"params":[],"hasCypherErrors":false}],"sceneActions":[],"hiddenRelationshipTypes":[],"hiddenCategories":[],"hideUncategorisedData":false,"isAuto":true,"parentPerspectiveId":false,"metadata":{"pathSegments":[{"source":"Article","relationshipType":"VARIANT_OF","target":"Product"},{"source":"Article","relationshipType":"CUSTOMERS_ALSO_LIKE","target":"Article"},{"source":"Article","relationshipType":"FROM_DEPARTMENT","target":"Department"},{"source":"Customer","relationshipType":"PURCHASED","target":"Article"}],"indexes":[{"label":null,"type":"native","propertyKeys":[]},{"label":"Product","type":"native","propertyKeys":[{"key":"textEmbedding","metadataProp":false},{"key":"productCode","metadataProp":false}]},{"label":"Article","type":"native","propertyKeys":[{"key":"articleId","metadataProp":false}]},{"label":"Customer","type":"native","propertyKeys":[{"key":"customerId","metadataProp":false}]},{"label":"Department","type":"native","propertyKeys":[{"key":"departmentNo","metadataProp":false},{"key":"departmentName","metadataProp":true},{"key":"sectionNo","metadataProp":true},{"key":"sectionName","metadataProp":true}]}],"stats":{"labels":{},"relationshipTypes":{"CUSTOMERS_ALSO_LIKE":127925,"FROM_DEPARTMENT":13351,"PURCHASED":23199,"VARIANT_OF":13351}}},"version":"2.11.0"} -------------------------------------------------------------------------------- /customers-and-products/img/CUSTOMERS_ALSO_LIKE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/ef9077abccaa9238f2abd556d1ef5beac64610d7/customers-and-products/img/CUSTOMERS_ALSO_LIKE.png -------------------------------------------------------------------------------- /customers-and-products/img/data-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/ef9077abccaa9238f2abd556d1ef5beac64610d7/customers-and-products/img/data-model.png -------------------------------------------------------------------------------- /customers-and-products/img/hm-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/ef9077abccaa9238f2abd556d1ef5beac64610d7/customers-and-products/img/hm-banner.png -------------------------------------------------------------------------------- /customers-and-products/img/purchase-history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/ef9077abccaa9238f2abd556d1ef5beac64610d7/customers-and-products/img/purchase-history.png -------------------------------------------------------------------------------- /customers-and-products/img/related-products.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/ef9077abccaa9238f2abd556d1ef5beac64610d7/customers-and-products/img/related-products.png -------------------------------------------------------------------------------- /customers-and-products/img/sample-query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/ef9077abccaa9238f2abd556d1ef5beac64610d7/customers-and-products/img/sample-query.png 
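For reference, the graph pattern that the sample query pictured above visualizes (it is saved as the "Show me a graph" search phrase in the Bloom perspective file) can also be run directly from Python. This is a sketch, assuming the same Neo4j connection variables from `ws.env.template` are set; the property names (`customerId`, `articleId`, `prodName`) come from the perspective schema above:

```python
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase

# Assumes NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD are defined in ws.env,
# matching the ws.env.template in this directory.
load_dotenv('ws.env', override=True)

driver = GraphDatabase.driver(
    os.getenv('NEO4J_URI'),
    auth=(os.getenv('NEO4J_USERNAME'), os.getenv('NEO4J_PASSWORD')))

# Same traversal as the Bloom search phrase, returned as a small tabular
# sample instead of a visualization.
query = """
MATCH (p:Product)<-[:VARIANT_OF]-(a:Article)<-[:PURCHASED]-(c:Customer)
RETURN c.customerId AS customer, a.articleId AS article, p.prodName AS product
LIMIT 25
"""

with driver.session() as session:
    for record in session.run(query):
        print(record['customer'], record['article'], record['product'])

driver.close()
```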
-------------------------------------------------------------------------------- /customers-and-products/img/search_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/ef9077abccaa9238f2abd556d1ef5beac64610d7/customers-and-products/img/search_results.png -------------------------------------------------------------------------------- /customers-and-products/workshop-slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/ef9077abccaa9238f2abd556d1ef5beac64610d7/customers-and-products/workshop-slides.pdf -------------------------------------------------------------------------------- /customers-and-products/ws.env.template: -------------------------------------------------------------------------------- 1 | #***************************************************************** 2 | # Neo4j 3 | #***************************************************************** 4 | NEO4J_URI= 5 | AURA_DS=false 6 | NEO4J_USERNAME=neo4j 7 | NEO4J_PASSWORD= 8 | 9 | #***************************************************************** 10 | # AI 11 | #***************************************************************** 12 | OPENAI_API_KEY=sk-... 13 | 14 | #***************************************************************** 15 | # AURA - for GitHub Admin Only (GitHub Action CI/CD) 16 | #***************************************************************** 17 | AURA_TENANT_ID=<> 18 | AURA_CLIENT_ID=<> 19 | AURA_CLIENT_SECRET=<> 20 | AURA_REGION=<> 21 | AURA_CLOUD_PROVIDER=<> -------------------------------------------------------------------------------- /talent/README.md: -------------------------------------------------------------------------------- 1 | # Talent GraphRAG Workshop 2 | In this workshop, you will learn: 3 | 4 | - How to create a **knowledge graph** from both structured and unstructured data sources. 5 | 6 | - Techniques for **graph pattern matching**, **vector search**, and **graph analytics** to retrieve information, uncover insights, and identify relationships in your data. 7 | 8 | - How to use a knowledge graph to power a **GraphRAG (Graph-based Retrieval-Augmented Generation) agent**. 9 | 10 | We will use a talent dataset as a practical example, exploring how to analyze employee skill sets, identify similar skills, form clusters or cohorts, and build a talent agent capable of answering questions about people, their skills, and relationships within the graph. 11 | 12 | ## How to open modules on the Jupyter Server 13 | Select File -> Open from URL ... 
14 | 15 | Module 1: 16 | https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/refs/heads/main/talent/module_01_graph_basics.ipynb 17 | 18 | Module 2: 19 | https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/refs/heads/main/talent/module_02_unstructured_data.ipynb 20 | 21 | Module 3: 22 | https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/refs/heads/main/talent/module_03_graphrag_agent.ipynb 23 | -------------------------------------------------------------------------------- /talent/data/create_skill_embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "643367b5-0019-425f-b367-5d610e5214c4", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import pandas as pd\n", 12 | "from dotenv import load_dotenv\n", 13 | "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", 14 | "from langchain.schema import SystemMessage, HumanMessage" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "id": "772415da-5546-4895-9b21-aa4f9284ede9", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "env_file = '../ws.env'" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "id": "6214dee4-f0e5-4fca-a575-b4fd9725d8a9", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "if os.path.exists(env_file):\n", 35 | " load_dotenv(env_file, override=True)\n", 36 | "\n", 37 | " OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')\n", 38 | " os.environ['OPENAI_API_KEY']=OPENAI_API_KEY\n", 39 | " LLM = os.getenv('LLM')\n", 40 | "else:\n", 41 | " print(f\"File {env_file} not found.\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "id": "2974d149-3797-44a8-b834-a5947cb7b768", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "df = pd.read_csv('expanded_skills.csv')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 5, 57 | "id": "0d28d2be-4320-4b5d-b2c6-0174f82d2c38", 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/html": [ 63 | "
\n", 64 | "\n", 77 | "\n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | "
emailnameskills
0thomas.nelson@test.orgThomas NelsonSecurity, Pandas, Go
1lucy.clark@test.orgLucy ClarkWordPress, Scrum, Go, SQL, Linux
2richard.jackson@test.orgRichard JacksonSystem Design, PyTorch, Express.js, DevOps
3amelia.hall@test.orgAmelia HallAgile, CSS3, R, Azure
4david.hill@test.orgDavid HillJava, Scrum, Angular
\n", 119 | "
" 120 | ], 121 | "text/plain": [ 122 | " email name \\\n", 123 | "0 thomas.nelson@test.org Thomas Nelson \n", 124 | "1 lucy.clark@test.org Lucy Clark \n", 125 | "2 richard.jackson@test.org Richard Jackson \n", 126 | "3 amelia.hall@test.org Amelia Hall \n", 127 | "4 david.hill@test.org David Hill \n", 128 | "\n", 129 | " skills \n", 130 | "0 Security, Pandas, Go \n", 131 | "1 WordPress, Scrum, Go, SQL, Linux \n", 132 | "2 System Design, PyTorch, Express.js, DevOps \n", 133 | "3 Agile, CSS3, R, Azure \n", 134 | "4 Java, Scrum, Angular " 135 | ] 136 | }, 137 | "execution_count": 5, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "df.head()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 6, 149 | "id": "ae3b539f-7a06-4db5-8fb3-653693e1ee81", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "df = df[['skills']]" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 7, 159 | "id": "413a3fa9-8934-47f7-b7c0-40504aa25cbb", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "df['Skill'] = df['skills'].str.split(', ')\n", 164 | "\n", 165 | "df = df[['Skill']].explode('Skill')\n", 166 | "df = df.drop_duplicates().sort_values(by='Skill').reset_index(drop=True)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 8, 172 | "id": "11d0e11a-4d16-452e-8859-c08e8ff74343", 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "(54, 1)" 179 | ] 180 | }, 181 | "execution_count": 8, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "df.shape" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 9, 193 | "id": "dfd7baf3-e259-4d2e-8fbc-5bd01868b6dc", 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/html": [ 199 | "
\n", 200 | "\n", 213 | "\n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | "
Skill
0API Design
1AWS
2Agile
3Angular
4Azure
\n", 243 | "
" 244 | ], 245 | "text/plain": [ 246 | " Skill\n", 247 | "0 API Design\n", 248 | "1 AWS\n", 249 | "2 Agile\n", 250 | "3 Angular\n", 251 | "4 Azure" 252 | ] 253 | }, 254 | "execution_count": 9, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "df.head()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 10, 266 | "id": "2025d62a-23b8-4874-88a3-e391fd8906a6", 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "llm = ChatOpenAI(model=LLM, temperature=0)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 11, 276 | "id": "66b3fff8-d16d-46cd-9203-e03edc737600", 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "system_prompt = \"You are a helpful assistant that provides more information on IT skills. You will get a skill coming from a CV. Your goal is to provide a short description based on this skill. What does it entail, where are these skills used for etc.\" " 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 12, 286 | "id": "d67cd069-6aa9-4b42-a9cf-45ab8f757b73", 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "generated_texts = []\n", 291 | "for skill in df['Skill']:\n", 292 | " messages = [\n", 293 | " SystemMessage(content=system_prompt),\n", 294 | " HumanMessage(content=f\"Write a brief and engaging description of the IT skill: {skill}.\")\n", 295 | " ]\n", 296 | " \n", 297 | " response = llm.invoke(messages)\n", 298 | " generated_texts.append(response.content)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 13, 304 | "id": "3e61c682-532d-47d9-988d-788d619285ee", 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "df['Description'] = generated_texts" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 14, 314 | "id": "b2c6af9d-054b-410b-a285-feb2d90200d6", 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 15, 324 | "id": "decbab69-b9a3-4e6d-9c62-5dc09ec1cb94", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "df['Embedding'] = df['Description'].apply( lambda skill: embeddings.embed_documents([skill])[0])" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 16, 334 | "id": "6e37f660-4ef7-40f3-ac78-73608c286874", 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "data": { 339 | "text/html": [ 340 | "
\n", 341 | "\n", 354 | "\n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | "
SkillDescriptionEmbedding
0API DesignAPI Design is the process of creating applicat...[0.007902550511062145, -0.006266295909881592, ...
1AWSAmazon Web Services (AWS) is a comprehensive a...[-0.0029347320087254047, -0.015877487137913704...
2AgileAgile is a dynamic and flexible project manage...[-0.023693757131695747, -0.012672649696469307,...
3AngularAngular is a powerful, open-source web applica...[0.013188531622290611, 0.029791485518217087, -...
4AzureAzure is Microsoft's cloud computing platform ...[-0.004470727406442165, -0.024176467210054398,...
\n", 396 | "
" 397 | ], 398 | "text/plain": [ 399 | " Skill Description \\\n", 400 | "0 API Design API Design is the process of creating applicat... \n", 401 | "1 AWS Amazon Web Services (AWS) is a comprehensive a... \n", 402 | "2 Agile Agile is a dynamic and flexible project manage... \n", 403 | "3 Angular Angular is a powerful, open-source web applica... \n", 404 | "4 Azure Azure is Microsoft's cloud computing platform ... \n", 405 | "\n", 406 | " Embedding \n", 407 | "0 [0.007902550511062145, -0.006266295909881592, ... \n", 408 | "1 [-0.0029347320087254047, -0.015877487137913704... \n", 409 | "2 [-0.023693757131695747, -0.012672649696469307,... \n", 410 | "3 [0.013188531622290611, 0.029791485518217087, -... \n", 411 | "4 [-0.004470727406442165, -0.024176467210054398,... " 412 | ] 413 | }, 414 | "execution_count": 16, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "df.head()" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 18, 426 | "id": "a02c1560-cdc5-4cad-ab29-8dd9a7ee4ae2", 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "df.to_csv('skills_embeddings.csv', index=False)" 431 | ] 432 | } 433 | ], 434 | "metadata": { 435 | "kernelspec": { 436 | "display_name": "Python 3 (ipykernel)", 437 | "language": "python", 438 | "name": "python3" 439 | }, 440 | "language_info": { 441 | "codemirror_mode": { 442 | "name": "ipython", 443 | "version": 3 444 | }, 445 | "file_extension": ".py", 446 | "mimetype": "text/x-python", 447 | "name": "python", 448 | "nbconvert_exporter": "python", 449 | "pygments_lexer": "ipython3", 450 | "version": "3.11.5" 451 | } 452 | }, 453 | "nbformat": 4, 454 | "nbformat_minor": 5 455 | } 456 | -------------------------------------------------------------------------------- /talent/data/expanded_skills.csv: -------------------------------------------------------------------------------- 1 | email,name,skills 2 | thomas.nelson@test.org,Thomas Nelson,"Security, Pandas, Go" 3 | lucy.clark@test.org,Lucy Clark,"WordPress, Scrum, Go, SQL, Linux" 4 | richard.jackson@test.org,Richard Jackson,"System Design, PyTorch, Express.js, DevOps" 5 | amelia.hall@test.org,Amelia Hall,"Agile, CSS3, R, Azure" 6 | david.hill@test.org,David Hill,"Java, Scrum, Angular" 7 | christopher.johnson@test.org,Christopher Johnson,"Tableau, Flask, API Design" 8 | amelia.martin@test.org,Amelia Martin,"CI/CD, Kotlin, HTML5, TensorFlow" 9 | daniel.hill@test.org,Daniel Hill,"System Design, Git, Cypher, Pandas, Spring Boot" 10 | alice.white@test.org,Alice White,"Spark, Agile, JavaScript" 11 | lucy.taylor@test.org,Lucy Taylor,"Flask, Tableau, CI/CD, Rust, System Design" 12 | lucy.turner@test.org,Lucy Turner,"Express.js, Big Data, Scala, Security, Docker" 13 | elena.young@test.org,Elena Young,"Scala, TensorFlow, Blockchain, Angular" 14 | richard.taylor@test.org,Richard Taylor,"Cloud Architecture, Kotlin, Angular, Agile" 15 | joseph.mitchell@test.org,Joseph Mitchell,"System Design, Spark, Vue.js, Ruby" 16 | sophie.jackson@test.org,Sophie Jackson,"Linux, Angular, Security, Pandas" 17 | sophia.richardson@test.org,Sophia Richardson,"CI/CD, Django, TypeScript, Testing" 18 | ryan.rodriguez@test.org,Ryan Rodriguez,"Power BI, DevOps, JavaScript" 19 | lucy.martin@test.org,Lucy Martin,"Flask, DevOps, Machine Learning, Rust" 20 | victoria.thomas@test.org,Victoria Thomas,"API Design, Cloud Architecture, Swift" 21 | joshua.lopez@test.org,Joshua Lopez,"Pandas, Scala, Express.js, Blockchain, Jenkins" 22 | alice.hill@test.org,Alice 
Hill,"Blockchain, Cloud Architecture, Spring Boot" 23 | mia.nelson@test.org,Mia Nelson,"AWS, Swift, WordPress, Security, Big Data" 24 | david.lopez@test.org,David Lopez,"WordPress, Security, PHP" 25 | emily.thompson@test.org,Emily Thompson,"Scrum, ReactJS, TensorFlow, Cloud Architecture" 26 | natalie.miller@test.org,Natalie Miller,"Testing, Azure, Go, Machine Learning, Express.js" 27 | natalie.thompson@test.org,Natalie Thompson,"System Design, TypeScript, Angular, Spark, Jenkins" 28 | hannah.campbell@test.org,Hannah Campbell,"API Design, JavaScript, DevOps, Power BI, Vue.js" 29 | brian.hill@test.org,Brian Hill,"Machine Learning, System Design, CSS3" 30 | brian.carter@test.org,Brian Carter,"Vue.js, Git, SQL, Testing, Rust" 31 | thomas.brown@test.org,Thomas Brown,"Docker, Java, Security, R" 32 | peter.martinez@test.org,Peter Martinez,"Project Management, Docker, Go" 33 | ryan.nelson@test.org,Ryan Nelson,"Agile, Go, Power BI, Spring Boot" 34 | isabella.allen@test.org,Isabella Allen,"Security, Cloud Architecture, Scala" 35 | robert.davis@test.org,Robert Davis,"Go, Angular, System Design, Data Analysis" 36 | christopher.jackson@test.org,Christopher Jackson,"Django, System Design, Spark, Python, Linux" 37 | charles.jones@test.org,Charles Jones,"System Design, CSS3, AWS, Pandas, Kotlin" 38 | daniel.smith@test.org,Daniel Smith,"Jenkins, Spark, HTML5" 39 | charlotte.adams@test.org,Charlotte Adams,"ReactJS, Java, Blockchain, Kubernetes" 40 | james.carter@test.org,James Carter,"Jenkins, TypeScript, Project Management" 41 | andrew.martin@test.org,Andrew Martin,"Testing, R, Java, Node.js, Cloud Architecture" 42 | harper.wright@test.org,Harper Wright,"Angular, Rust, PyTorch" 43 | william.rodriguez@test.org,William Rodriguez,"Jenkins, PyTorch, Project Management" 44 | emily.garcia@test.org,Emily Garcia,"Data Visualization, Testing, TypeScript" 45 | lucy.roberts@test.org,Lucy Roberts,"CI/CD, Data Visualization, Angular, Swift, System Design" 46 | olivia.johnson@test.org,Olivia Johnson,"CI/CD, Angular, Security" 47 | oliver.bennett@test.org,Oliver Bennett,"Testing, Jenkins, JavaScript, R" 48 | ryan.jones@test.org,Ryan Jones,"Project Management, Spark, PHP, Angular, Jenkins" 49 | matthew.miller@test.org,Matthew Miller,"Ruby, TensorFlow, ReactJS, AWS" 50 | andrew.king@test.org,Andrew King,"Agile, Git, Flask" 51 | amelia.davis@test.org,Amelia Davis,"HTML5, Java, PyTorch, Docker, Security" 52 | emily.phillips@test.org,Emily Phillips,"Kubernetes, Data Visualization, Security, Vue.js, PHP" 53 | john.johnson@test.org,John Johnson,"Python, AWS, WordPress, TensorFlow, Project Management" 54 | matthew.scott@test.org,Matthew Scott,"Cypher, Azure, Scrum" 55 | john.garcia@test.org,John Garcia,"Ruby, AWS, HTML5, Security, PyTorch" 56 | charles.carter@test.org,Charles Carter,"Spark, JavaScript, Docker" 57 | joseph.lopez@test.org,Joseph Lopez,"System Design, ReactJS, Linux" 58 | steven.jones@test.org,Steven Jones,"Docker, HTML5, Blockchain, Big Data" 59 | thomas.miller@test.org,Thomas Miller,"HTML5, Git, Big Data" 60 | ryan.young@test.org,Ryan Young,"WordPress, Blockchain, Cloud Architecture, Python, Data Visualization" 61 | brian.thompson@test.org,Brian Thompson,"Agile, ReactJS, AWS, Data Analysis, PHP" 62 | joseph.martin@test.org,Joseph Martin,"Agile, ReactJS, Java, Power BI, Linux" 63 | isabella.thompson@test.org,Isabella Thompson,"CI/CD, ReactJS, Blockchain, Python" 64 | ava.taylor@test.org,Ava Taylor,"Git, ReactJS, JavaScript" 65 | matthew.mitchell@test.org,Matthew Mitchell,"HTML5, Blockchain, R, Ruby, Cloud 
Architecture" 66 | alice.thomas@test.org,Alice Thomas,"SQL, Docker, ReactJS, System Design, Swift" 67 | richard.mitchell@test.org,Richard Mitchell,"Scala, Scrum, Node.js" 68 | andrew.anderson@test.org,Andrew Anderson,"Node.js, Data Visualization, Java, Testing, DevOps" 69 | john.taylor@test.org,John Taylor,"CSS3, AWS, Scrum, Pandas, Ruby" 70 | natalie.turner@test.org,Natalie Turner,"CSS3, Rust, Linux" 71 | christopher.thompson@test.org,Christopher Thompson,"R, Scrum, Flask" 72 | claire.moore@test.org,Claire Moore,"Power BI, Scala, AWS, Agile" 73 | daniel.hall@test.org,Daniel Hall,"Spring Boot, R, Cypher, Kubernetes, System Design" 74 | sophie.perez@test.org,Sophie Perez,"Angular, Scrum, Cloud Architecture" 75 | kevin.young@test.org,Kevin Young,"C++, TensorFlow, ReactJS, Linux, Agile" 76 | sophia.walker@test.org,Sophia Walker,"Scrum, DevOps, Django, C++" 77 | isabella.martin@test.org,Isabella Martin,"HTML5, Data Visualization, Blockchain, Jenkins" 78 | john.baker@test.org,John Baker,"Agile, Spring Boot, Data Analysis, PHP" 79 | brian.jackson@test.org,Brian Jackson,"Cloud Architecture, API Design, Vue.js, Rust" 80 | kevin.garcia@test.org,Kevin Garcia,"Docker, Agile, Angular, Swift, Power BI" 81 | david.rodriguez@test.org,David Rodriguez,"Azure, Cypher, Scrum, Spark" 82 | brian.allen@test.org,Brian Allen,"Pandas, Node.js, Ruby, Agile" 83 | olivia.martin@test.org,Olivia Martin,"Testing, PyTorch, Kubernetes, Spring Boot" 84 | john.walker@test.org,John Walker,"Django, Python, API Design" 85 | thomas.garcia@test.org,Thomas Garcia,"Java, Flask, Docker, Agile, TensorFlow" 86 | michael.allen@test.org,Michael Allen,"Data Analysis, Azure, Kotlin, API Design, Flask" 87 | isabella.baker@test.org,Isabella Baker,"Node.js, Python, Kubernetes" 88 | ava.white@test.org,Ava White,"Project Management, Scala, CI/CD, Data Visualization" 89 | charles.taylor@test.org,Charles Taylor,"TensorFlow, Java, CI/CD" 90 | natalie.brown@test.org,Natalie Brown,"Go, Kubernetes, Project Management, CSS3, Pandas" 91 | claire.lee@test.org,Claire Lee,"Blockchain, PHP, Git, TensorFlow" 92 | james.anderson@test.org,James Anderson,"Security, R, JavaScript, Node.js" 93 | victoria.thompson@test.org,Victoria Thompson,"API Design, Cypher, SQL" 94 | grace.lee@test.org,Grace Lee,"Rust, HTML5, SQL" 95 | john.mitchell@test.org,John Mitchell,"Express.js, Cloud Architecture, R" 96 | matthew.moore@test.org,Matthew Moore,"Spring Boot, Security, TensorFlow, Swift" 97 | grace.miller@test.org,Grace Miller,"Swift, Kubernetes, Tableau, Express.js" 98 | ryan.hall@test.org,Ryan Hall,"Git, Scrum, Scala, Express.js" 99 | amelia.phillips@test.org,Amelia Phillips,"Project Management, Angular, Docker, Tableau, Scala" 100 | amelia.brown@test.org,Amelia Brown,"Spring Boot, Blockchain, PyTorch, CI/CD" 101 | peter.perez@test.org,Peter Perez,"Big Data, System Design, Django, Rust" 102 | -------------------------------------------------------------------------------- /talent/module_01_graph_basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Module 1 - Graph basics: Queries, Algorithms & Vectors \n", 8 | "\n", 9 | "This module has the following objectives:\n", 10 | "- Creating a Graph from Structured Data\n", 11 | "- Basic Cypher Queries and Pattern Matching\n", 12 | "- Graph Algorithms\n", 13 | "- Text Embeddings for Semantic Analysis\n", 14 | "- Vector Search" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | 
"execution_count": null, 20 | "metadata": { 21 | "id": "FHKg4DVZiQ98" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "# !pip install graphdatascience neo4j dotenv langchain langchain_openai, matplotlib, seaborn" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "Import our usual suspects (and some more...)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import os\n", 42 | "import pandas as pd\n", 43 | "from dotenv import load_dotenv\n", 44 | "from graphdatascience import GraphDataScience\n", 45 | "from neo4j import Query, GraphDatabase, RoutingControl, Result\n", 46 | "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", 47 | "from langchain_core.prompts import ChatPromptTemplate\n", 48 | "from langchain_core.output_parsers import StrOutputParser\n", 49 | "import matplotlib.pyplot as plt\n", 50 | "import seaborn as sns" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "ynPe6RLRWSKd" 57 | }, 58 | "source": [ 59 | "## Setup" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "id": "pa61u1jfyk3t" 66 | }, 67 | "source": [ 68 | "Load env variables" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "env_file = 'ws.env'" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "if os.path.exists(env_file):\n", 87 | " load_dotenv(env_file, override=True)\n", 88 | "\n", 89 | " # Neo4j\n", 90 | " HOST = os.getenv('NEO4J_URI')\n", 91 | " USERNAME = os.getenv('NEO4J_USERNAME')\n", 92 | " PASSWORD = os.getenv('NEO4J_PASSWORD')\n", 93 | " DATABASE = os.getenv('NEO4J_DATABASE')\n", 94 | "\n", 95 | " # AI\n", 96 | " OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')\n", 97 | " os.environ['OPENAI_API_KEY']=OPENAI_API_KEY\n", 98 | " LLM = os.getenv('LLM')\n", 99 | " EMBEDDINGS_MODEL = os.getenv('EMBEDDINGS_MODEL')\n", 100 | "else:\n", 101 | " print(f\"File {env_file} not found.\")" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "id": "0OMlYdxHWZLx" 108 | }, 109 | "source": [ 110 | "## Read Data" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "id": "HWJbMUtJeOcv" 117 | }, 118 | "source": [ 119 | "Load synthetic Skills dataset" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": "url = \"https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/refs/heads/main/talent/data/expanded_skills.csv\"" 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "id": "HWJbMUtJeOcv" 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "skills_df = pd.read_csv(url)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "Describe the dataset" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "skills_df.describe()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "id": "HWJbMUtJeOcv" 160 | }, 161 | "source": [ 162 | "Display the first few rows of the DataFrame" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": 
{ 169 | "id": "HWJbMUtJeOcv" 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "skills_df.head(10)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": { 179 | "id": "7tdxbTD_ZZ-T" 180 | }, 181 | "source": [ 182 | "Convert the skills column from a comma-separated string to a List" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "id": "7tdxbTD_ZZ-T" 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "skills_df['skills'] = skills_df['skills'].str.split(', ')\n", 194 | "skills_df.head()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": { 200 | "id": "k5QJiOUfZZ-T" 201 | }, 202 | "source": [ 203 | "## Create the Graph" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Connect to the Database" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "To connect to the database we use the [Neo4j Python Driver](https://neo4j.com/docs/python-manual/5/). The credentials are stored in our environment, so they can be passed to the driver." 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "driver = GraphDatabase.driver(\n", 227 | " HOST,\n", 228 | " auth=(USERNAME, PASSWORD)\n", 229 | ")" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "A helper to split large DataFrames into chunks, in case we want to load data in batches. " 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "def split_dataframe(df, chunk_size = 50_000):\n", 246 | " chunks = list()\n", 247 | " num_chunks = len(df) // chunk_size + 1 # rounded up; may yield one empty trailing chunk\n", 248 | " for i in range(num_chunks):\n", 249 | " chunks.append(df[i*chunk_size:(i+1)*chunk_size])\n", 250 | " return chunks" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "Test the connection" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "driver.execute_query(\n", 267 | " \"\"\"\n", 268 | " MATCH (n) RETURN COUNT(n) as Count\n", 269 | " \"\"\",\n", 270 | " database_=DATABASE,\n", 271 | " routing_=RoutingControl.READ,\n", 272 | " result_transformer_= lambda r: r.to_df()\n", 273 | ")" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "### Set constraints" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "We know what we will be loading. Set some constraints first. 
Documentation: [Constraints](https://neo4j.com/docs/cypher-manual/current/constraints/managing-constraints/)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Set the constraint on Person Nodes" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "driver.execute_query(\n", 304 | " 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE (n.email) IS NODE KEY',\n", 305 | " database_=DATABASE,\n", 306 | " routing_=RoutingControl.WRITE\n", 307 | ")" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "Set the constraint on Skill Nodes" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "driver.execute_query(\n", 324 | " 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Skill) REQUIRE (n.name) IS NODE KEY',\n", 325 | " database_=DATABASE,\n", 326 | " routing_=RoutingControl.WRITE\n", 327 | ")" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "id": "cdTfdAyV2ZaR" 334 | }, 335 | "source": [ 336 | "Fetch all constraints" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": { 343 | "id": "cdTfdAyV2ZaR" 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "schema_result_df = driver.execute_query(\n", 348 | " 'SHOW CONSTRAINTS',\n", 349 | " database_=DATABASE,\n", 350 | " routing_=RoutingControl.READ,\n", 351 | " result_transformer_= lambda r: r.to_df()\n", 352 | ")\n", 353 | "schema_result_df.head()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "### Load (:Person)-[:KNOWS]->(:Skill)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": { 366 | "id": "uduojIopm0qV" 367 | }, 368 | "source": [ 369 | "Create Person and Skill nodes and a relationship between them. Documentation: [MERGE](https://neo4j.com/docs/cypher-manual/current/clauses/merge/)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "id": "uduojIopm0qV" 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "for chunk in split_dataframe(skills_df):\n", 381 | " records, summary, keys = driver.execute_query(\n", 382 | " \"\"\"\n", 383 | " UNWIND $rows AS row\n", 384 | " MERGE (p:Person{email:row.email})\n", 385 | " SET p.name = row.name\n", 386 | " WITH p, row\n", 387 | " FOREACH(skill IN row.skills | MERGE (s:Skill{name:skill}) MERGE (p)-[:KNOWS]->(s) )\n", 388 | " RETURN COUNT(*) AS rows_processed\n", 389 | " \"\"\",\n", 390 | " database_=DATABASE,\n", 391 | " routing_=RoutingControl.WRITE,\n", 392 | " rows = chunk.to_dict('records')\n", 393 | " )" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": { 399 | "id": "rZo7Gln2jJcF" 400 | }, 401 | "source": [ 402 | "## Explore the Graph" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "Now go to the database and observe what is there. 
\n", 410 | "Example queries: \n", 411 | "- MATCH (n:Person) RETURN n LIMIT 25;\n", 412 | "- MATCH (n:Skill) RETURN n LIMIT 25;\n", 413 | "- MATCH p=()-[:KNOWS]->() RETURN p LIMIT 25;" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "We can also run this via the [Neo4j Python Driver](https://neo4j.com/docs/python-manual/5/). Let's do so below" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "#### What persons are in the database?" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": { 434 | "id": "70-xbOfgjQsY" 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "persons_df = driver.execute_query(\n", 439 | " \"\"\"\n", 440 | " MATCH (p:Person)\n", 441 | " RETURN p.name AS person_name\n", 442 | " \"\"\",\n", 443 | " database_=DATABASE,\n", 444 | " routing_=RoutingControl.READ,\n", 445 | " result_transformer_= lambda r: r.to_df()\n", 446 | ")" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "persons_df" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": { 461 | "id": "volCOjn_jjm3" 462 | }, 463 | "source": [ 464 | "#### What skills does each person know?" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": { 471 | "id": "volCOjn_jjm3" 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "person_skills_df = driver.execute_query(\n", 476 | " \"\"\"\n", 477 | " MATCH (p:Person)-[:KNOWS]->(s:Skill)\n", 478 | " RETURN p.email AS email, p.name AS person_name, collect(s.name) AS skills\n", 479 | " \"\"\",\n", 480 | " database_=DATABASE,\n", 481 | " routing_=RoutingControl.READ,\n", 482 | " result_transformer_= lambda r: r.to_df()\n", 483 | ")" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "person_skills_df" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": { 498 | "id": "2yddechtj2MB" 499 | }, 500 | "source": [ 501 | "#### What are the most frequent skills?" 
502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": { 508 | "id": "2yddechtj2MB" 509 | }, 510 | "outputs": [], 511 | "source": [ 512 | "skill_count_df = driver.execute_query(\n", 513 | " \"\"\"\n", 514 | " MATCH (p:Person)-[:KNOWS]->(s:Skill)\n", 515 | " RETURN s.name, COUNT(DISTINCT p) AS knownByCount ORDER BY knownByCount DESC LIMIT 10\n", 516 | " \"\"\",\n", 517 | " database_=DATABASE,\n", 518 | " routing_=RoutingControl.READ,\n", 519 | " result_transformer_= lambda r: r.to_df()\n", 520 | ")" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "skill_count_df" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "metadata": {}, 535 | "source": [ 536 | "#### Multihop question" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "Run the following query in the database: \n", 544 | "- ```MATCH p=(p1:Person {name: \"Lucy Clark\"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person) RETURN DISTINCT p;```\n", 545 | "- ```MATCH p=(p1:Person {name: \"Lucy Clark\"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)-[:KNOWS]-(s2:Skill) RETURN DISTINCT p;```" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "metadata": {}, 552 | "outputs": [], 553 | "source": [ 554 | "person_name_1 = \"Lucy Clark\"\n", 555 | "\n", 556 | "persons_with_shared_skills_df = driver.execute_query(\n", 557 | " \"\"\"\n", 558 | " MATCH p=(p1:Person {name: $person_name_1})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)\n", 559 | " RETURN DISTINCT p2.name as person;\n", 560 | " \"\"\",\n", 561 | " database_=DATABASE,\n", 562 | " routing_=RoutingControl.READ,\n", 563 | " result_transformer_= lambda r: r.to_df(),\n", 564 | " person_name_1 = person_name_1\n", 565 | ")" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "persons_with_shared_skills_df" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "person_name_1 = \"Lucy Clark\"\n", 584 | "\n", 585 | "skills_two_steps_df = driver.execute_query(\n", 586 | " \"\"\"\n", 587 | " MATCH p=(p1:Person {name: $person_name_1})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)-[:KNOWS]-(s2:Skill)\n", 588 | " RETURN DISTINCT s2.name as skill;\n", 589 | " \"\"\",\n", 590 | " database_=DATABASE,\n", 591 | " routing_=RoutingControl.READ,\n", 592 | " result_transformer_= lambda r: r.to_df(),\n", 593 | " person_name_1 = person_name_1\n", 594 | ")" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "skills_two_steps_df" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "## Person Similarity" 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "metadata": {}, 616 | "source": [ 617 | "We can define the similarity of persons based on the number of skills that are overlapping. 
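\n",
    "\n",
    "As an aside (a sketch, not needed for the workshop flow): the raw overlap count favours people who simply know many skills. One could normalize it into a Jaccard similarity by dividing the overlap by the size of the union of the two skill sets:\n",
    "\n",
    "```\n",
    "MATCH (p1:Person)-[:KNOWS]->(s:Skill)<-[:KNOWS]-(p2:Person)\n",
    "WHERE p1.name < p2.name\n",
    "WITH p1, p2, COUNT(DISTINCT s) AS overlap\n",
    "MATCH (p)-[:KNOWS]->(s2:Skill) WHERE p IN [p1, p2]\n",
    "WITH p1.name AS person_1, p2.name AS person_2, overlap, COUNT(DISTINCT s2) AS union_size\n",
    "RETURN person_1, person_2, toFloat(overlap) / union_size AS jaccard\n",
    "ORDER BY jaccard DESC\n",
    "```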
" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "person_name_1 = \"Thomas Brown\"\n", 627 | "\n", 628 | "similar_skills_df = driver.execute_query(\n", 629 | " \"\"\"\n", 630 | " MATCH path_1=(p1:Person{name: $person_name_1})-[:KNOWS]->(s1:Skill)\n", 631 | " MATCH path_2=(s1)<-[:KNOWS]-(p2:Person)\n", 632 | " WITH p1.name as person_1, p2.name as person_2, COLLECT(DISTINCT s1.name) as skill_list, COUNT(DISTINCT(s1)) as skill_count\n", 633 | " WHERE skill_count > 1 AND person_1 <> person_2\n", 634 | " RETURN * ORDER BY skill_count DESC\n", 635 | " \"\"\",\n", 636 | " database_=DATABASE,\n", 637 | " routing_=RoutingControl.READ,\n", 638 | " result_transformer_= lambda r: r.to_df(),\n", 639 | " person_name_1 = person_name_1\n", 640 | ")" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": {}, 647 | "outputs": [], 648 | "source": [ 649 | "similar_skills_df" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": {}, 656 | "outputs": [], 657 | "source": [ 658 | "similar_skills_all_df = driver.execute_query(\n", 659 | " \"\"\"\n", 660 | " MATCH path_1=(p1:Person)-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)\n", 661 | " WHERE p1.name < p2.name\n", 662 | " WITH p1.name as person_1, p2.name as person_2, COLLECT(DISTINCT s1.name) as skill_list, COUNT(DISTINCT(s1)) as skill_count\n", 663 | " WHERE skill_count >= 1\n", 664 | " RETURN * ORDER BY skill_count DESC\n", 665 | " \"\"\",\n", 666 | " database_=DATABASE,\n", 667 | " routing_=RoutingControl.READ,\n", 668 | " result_transformer_= lambda r: r.to_df()\n", 669 | ")" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "similar_skills_all_df" 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": {}, 684 | "source": [ 685 | "Load the skill count to the database in a new relationship" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": null, 691 | "metadata": {}, 692 | "outputs": [], 693 | "source": [ 694 | "for chunk in split_dataframe(similar_skills_all_df):\n", 695 | " records, summary, keys = driver.execute_query(\n", 696 | " \"\"\"\n", 697 | " UNWIND $rows AS row\n", 698 | " MERGE (p1:Person{name:row.person_1})\n", 699 | " MERGE (p2:Person{name:row.person_2})\n", 700 | " MERGE (p1)-[s:SIMILAR_SKILLSET]->(p2)\n", 701 | " SET s.overlap = row.skill_count\n", 702 | " RETURN COUNT(*) AS rows_processed\n", 703 | " \"\"\",\n", 704 | " database_=DATABASE,\n", 705 | " routing_=RoutingControl.WRITE,\n", 706 | " rows = chunk.to_dict('records')\n", 707 | " )" 708 | ] 709 | }, 710 | { 711 | "cell_type": "markdown", 712 | "metadata": {}, 713 | "source": [ 714 | "Take a minute to explore the SIMILAR_SKILLSET network in the database. \n", 715 | "\n", 716 | "- ```MATCH p=()-[:SIMILAR_SKILLSET]->() RETURN p LIMIT 50```\n", 717 | "- ```MATCH p=()-[s:SIMILAR_SKILLSET]->() WHERE s.overlap >= 2 RETURN p LIMIT 50```\n", 718 | "- ```MATCH p=()-[s:SIMILAR_SKILLSET]->() WHERE s.overlap >= 3 RETURN p LIMIT 50```" 719 | ] 720 | }, 721 | { 722 | "cell_type": "markdown", 723 | "metadata": {}, 724 | "source": [ 725 | "## Communities" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": {}, 731 | "source": [ 732 | "Let's run some Graph Data Science based on Persons and Skills. 
Let's first setup the [Graph Data Science Client](https://neo4j.com/docs/graph-data-science-client/current/). " 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [ 741 | "gds = GraphDataScience.from_neo4j_driver(driver=driver)\n", 742 | "gds.set_database(DATABASE)\n", 743 | "gds.version()" 744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "metadata": {}, 749 | "source": [ 750 | "Let's investigate Persons that are similar in the graph (based on skills they share). For that we first need to create a [Graph object](https://neo4j.com/docs/graph-data-science-client/current/graph-object/). " 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [ 759 | "graph_name = \"person_similarity_projection\"\n", 760 | "node_projection = [\"Person\"]\n", 761 | "rel_projection = {\"SIMILAR_SKILLSET\": {\"orientation\": 'UNDIRECTED', \"properties\": \"overlap\"}, }" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": null, 767 | "metadata": {}, 768 | "outputs": [], 769 | "source": [ 770 | "G, res = gds.graph.project(graph_name, node_projection, rel_projection)" 771 | ] 772 | }, 773 | { 774 | "cell_type": "markdown", 775 | "metadata": {}, 776 | "source": [ 777 | "Run the [Leiden Algorithm](https://neo4j.com/docs/graph-data-science/current/algorithms/leiden/) for Community Detection" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": null, 783 | "metadata": {}, 784 | "outputs": [], 785 | "source": [ 786 | "gds.leiden.write(\n", 787 | " G,\n", 788 | " writeProperty='leiden_community',\n", 789 | " relationshipWeightProperty='overlap',\n", 790 | " maxLevels=100,\n", 791 | " gamma=1.5,\n", 792 | " theta=0.001,\n", 793 | " concurrency = 1,\n", 794 | " randomSeed = 42\n", 795 | ")" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": null, 801 | "metadata": {}, 802 | "outputs": [], 803 | "source": [ 804 | "communities_df = driver.execute_query(\n", 805 | " \"\"\"\n", 806 | " MATCH (p:Person)\n", 807 | " RETURN p.leiden_community AS Community, COUNT(*) as MemberCount\n", 808 | " \"\"\",\n", 809 | " database_=DATABASE,\n", 810 | " routing_=RoutingControl.READ,\n", 811 | " result_transformer_= lambda r: r.to_df()\n", 812 | ")" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": {}, 819 | "outputs": [], 820 | "source": [ 821 | "communities_df" 822 | ] 823 | }, 824 | { 825 | "cell_type": "markdown", 826 | "metadata": {}, 827 | "source": [ 828 | "Check communities based on people with high overlap" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": {}, 835 | "outputs": [], 836 | "source": [ 837 | "community_check_df = driver.execute_query(\n", 838 | " \"\"\"\n", 839 | " MATCH (p1:Person)-[s:SIMILAR_SKILLSET]->(p2:Person)\n", 840 | " WHERE s.overlap > 2\n", 841 | " RETURN s.overlap AS Overlap, p1.name AS Person1, p1.leiden_community AS Community1, p2.name AS Person2, p2.leiden_community AS Community2\n", 842 | " \"\"\",\n", 843 | " database_=DATABASE,\n", 844 | " routing_=RoutingControl.READ,\n", 845 | " result_transformer_= lambda r: r.to_df()\n", 846 | ")" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": {}, 853 | "outputs": [], 854 | "source": [ 855 | "community_check_df" 856 | ] 857 | }, 858 | { 859 | 
"cell_type": "markdown", 860 | "metadata": {}, 861 | "source": [ 862 | "Check some high skill occurences in the communities" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": null, 868 | "metadata": {}, 869 | "outputs": [], 870 | "source": [ 871 | "communities_skills_df = gds.run_cypher('''\n", 872 | " MATCH (p:Person)-[:KNOWS]->(s) WHERE (p.leiden_community) IS NOT NULL\n", 873 | " WITH p.leiden_community AS leiden_community, s.name as skill, count(*) as cnt\n", 874 | " WHERE cnt > 5\n", 875 | " RETURN *\n", 876 | " ORDER BY leiden_community, cnt DESC\n", 877 | "''')" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": null, 883 | "metadata": {}, 884 | "outputs": [], 885 | "source": [ 886 | "communities_skills_df" 887 | ] 888 | }, 889 | { 890 | "cell_type": "markdown", 891 | "metadata": {}, 892 | "source": [ 893 | "### Plot the Communities with their Skill Count" 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "execution_count": null, 899 | "metadata": {}, 900 | "outputs": [], 901 | "source": [ 902 | "df = gds.run_cypher(\"\"\"\n", 903 | "MATCH (p:Person)-[:KNOWS]->(s) WHERE (p.leiden_community) IS NOT NULL\n", 904 | "RETURN p.leiden_community AS leiden_community, s.name as skill, count(*) as cnt\n", 905 | "\"\"\")" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": null, 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "pivot_table = df.pivot(index=\"skill\", columns=\"leiden_community\", values=\"cnt\").fillna(0)\n", 915 | "sns.set_theme(style=\"whitegrid\")\n", 916 | "plt.figure(figsize=(12, 6))\n", 917 | "sns.heatmap(pivot_table, cmap=\"Blues\", linewidths=0.5)\n", 918 | "plt.xlabel(\"Community\")\n", 919 | "plt.ylabel(\"Skill\")\n", 920 | "plt.title(\"Skill Distribution Heatmap per Community\")\n", 921 | "plt.show()" 922 | ] 923 | }, 924 | { 925 | "cell_type": "markdown", 926 | "metadata": {}, 927 | "source": [ 928 | "Drop the projection from the graph catalogue to free up resources" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": null, 934 | "metadata": {}, 935 | "outputs": [], 936 | "source": [ 937 | "G.drop()" 938 | ] 939 | }, 940 | { 941 | "cell_type": "markdown", 942 | "metadata": { 943 | "id": "jU696MRBKlne" 944 | }, 945 | "source": [ 946 | "## Semantic Similar skill" 947 | ] 948 | }, 949 | { 950 | "cell_type": "markdown", 951 | "metadata": {}, 952 | "source": [ 953 | "Since the communities don't really make sense (due to the randomness of the skills for persons) we can try the similarity based on the semantic meaning. " 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": null, 959 | "metadata": { 960 | "id": "mZC3AKS9ZZ-W" 961 | }, 962 | "outputs": [], 963 | "source": [ 964 | "skills_df = gds.run_cypher(\n", 965 | " \"\"\"\n", 966 | " MATCH (s:Skill)\n", 967 | " RETURN s.name AS skill\n", 968 | " \"\"\"\n", 969 | ")" 970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | "execution_count": null, 975 | "metadata": { 976 | "id": "mZC3AKS9ZZ-W" 977 | }, 978 | "outputs": [], 979 | "source": [ 980 | "skills_df.head(5)" 981 | ] 982 | }, 983 | { 984 | "cell_type": "markdown", 985 | "metadata": { 986 | "id": "IKWEZ7oy051q" 987 | }, 988 | "source": [ 989 | "### STOP STOP STOP - DO NOT PROCEED (YET)" 990 | ] 991 | }, 992 | { 993 | "cell_type": "markdown", 994 | "metadata": {}, 995 | "source": [ 996 | "-- Only to be run by instructor (or if you have your own api key). 
Skip the following two cells -- " 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": null, 1002 | "metadata": {}, 1003 | "outputs": [], 1004 | "source": [ 1005 | "# skills_df['embedding'] = skills_df['description'].apply( lambda skill: embeddings.embed_documents([skill])[0])\n", 1006 | "# skills_df.head()" 1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "code", 1011 | "execution_count": null, 1012 | "metadata": {}, 1013 | "outputs": [], 1014 | "source": [ 1015 | "# gds.run_cypher('''\n", 1016 | "# unwind $data as row\n", 1017 | "# match (s:Skill{name: row.skill})\n", 1018 | "# set s.embedding = row.embedding\n", 1019 | "# ''',\n", 1020 | "# params = { 'data': skills_df.to_dict(orient='records') }\n", 1021 | "# )" 1022 | ] 1023 | }, 1024 | { 1025 | "cell_type": "code", 1026 | "execution_count": null, 1027 | "metadata": {}, 1028 | "outputs": [], 1029 | "source": "url = 'https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/refs/heads/main/talent/data/skills_embeddings.csv'" 1030 | }, 1031 | { 1032 | "cell_type": "code", 1033 | "execution_count": null, 1034 | "metadata": {}, 1035 | "outputs": [], 1036 | "source": [ 1037 | "skills_embeddings_df = pd.read_csv(url)" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": null, 1043 | "metadata": {}, 1044 | "outputs": [], 1045 | "source": [ 1046 | "skills_embeddings_df.head()" 1047 | ] 1048 | }, 1049 | { 1050 | "cell_type": "code", 1051 | "execution_count": null, 1052 | "metadata": {}, 1053 | "outputs": [], 1054 | "source": [ 1055 | "type(skills_embeddings_df['Embedding'].iloc[0][0])" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": null, 1061 | "metadata": {}, 1062 | "outputs": [], 1063 | "source": [ 1064 | "skills_embeddings_df['Embedding'] = skills_embeddings_df['Embedding'].apply( lambda x: [ float(i) for i in x.strip(\"[]\").split(\", \")] )" 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "code", 1069 | "execution_count": null, 1070 | "metadata": {}, 1071 | "outputs": [], 1072 | "source": [ 1073 | "type(skills_embeddings_df['Embedding'].iloc[0][0])" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "execution_count": null, 1079 | "metadata": {}, 1080 | "outputs": [], 1081 | "source": [ 1082 | "skills_embeddings_df.head()" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "markdown", 1087 | "metadata": {}, 1088 | "source": [ 1089 | "Length of an embedding" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "code", 1094 | "execution_count": null, 1095 | "metadata": {}, 1096 | "outputs": [], 1097 | "source": [ 1098 | "len(skills_embeddings_df['Embedding'].iloc[0])" 1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "markdown", 1103 | "metadata": {}, 1104 | "source": [ 1105 | "### Add Embeddings to the database" 1106 | ] 1107 | }, 1108 | { 1109 | "cell_type": "markdown", 1110 | "metadata": { 1111 | "id": "-K7yLpGtZZ-W" 1112 | }, 1113 | "source": [ 1114 | "Add embeddings with the description to Skill nodes in database" 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "code", 1119 | "execution_count": null, 1120 | "metadata": {}, 1121 | "outputs": [], 1122 | "source": [ 1123 | "for chunk in split_dataframe(skills_embeddings_df):\n", 1124 | " records, summary, keys = driver.execute_query(\n", 1125 | " \"\"\"\n", 1126 | " UNWIND $rows AS row\n", 1127 | " MATCH (s:Skill{name: row.Skill})\n", 1128 | " SET s.embedding = row.Embedding\n", 1129 | " SET s.description = row.Description\n", 1130 | " WITH s\n", 1131 | " CALL 
db.create.setNodeVectorProperty(s, \"embedding\", s.embedding)\n", 1132 | " RETURN COUNT(*) AS rows_processed\n", 1133 | " \"\"\",\n", 1134 | " database_=DATABASE,\n", 1135 | " routing_=RoutingControl.WRITE,\n", 1136 | " rows = chunk.to_dict('records')\n", 1137 | " )" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "markdown", 1142 | "metadata": {}, 1143 | "source": [ 1144 | "Let's have a look in the browser! " 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "markdown", 1149 | "metadata": {}, 1150 | "source": [ 1151 | "### Vectors for Semantic Meaning" 1152 | ] 1153 | }, 1154 | { 1155 | "cell_type": "code", 1156 | "execution_count": null, 1157 | "metadata": {}, 1158 | "outputs": [], 1159 | "source": [ 1160 | "driver.execute_query(\n", 1161 | " \"\"\"\n", 1162 | " CREATE VECTOR INDEX `skill-embeddings` IF NOT EXISTS\n", 1163 | " FOR (s:Skill) ON (s.embedding)\n", 1164 | " OPTIONS {\n", 1165 | " indexConfig: {\n", 1166 | " `vector.dimensions`: 1536,\n", 1167 | " `vector.similarity_function`: 'cosine'\n", 1168 | " } \n", 1169 | " }\n", 1170 | " \"\"\",\n", 1171 | " database_=DATABASE,\n", 1172 | " routing_=RoutingControl.WRITE\n", 1173 | ") " 1174 | ] 1175 | }, 1176 | { 1177 | "cell_type": "code", 1178 | "execution_count": null, 1179 | "metadata": {}, 1180 | "outputs": [], 1181 | "source": [ 1182 | "indexes_result_df = driver.execute_query(\n", 1183 | " 'SHOW INDEXES',\n", 1184 | " database_=DATABASE,\n", 1185 | " routing_=RoutingControl.READ,\n", 1186 | " result_transformer_= lambda r: r.to_df()\n", 1187 | ")\n", 1188 | "indexes_result_df" 1189 | ] 1190 | }, 1191 | { 1192 | "cell_type": "markdown", 1193 | "metadata": {}, 1194 | "source": [ 1195 | "### Semantic Search" 1196 | ] 1197 | }, 1198 | { 1199 | "cell_type": "markdown", 1200 | "metadata": {}, 1201 | "source": [ 1202 | "Take some Skill and find relevant other Skills: \"Python\", \"Java\", \"Git\", \"CI/CD\", \"AWS\", \"Data Visualization\", \"Power BI\", \"R\". " 1203 | ] 1204 | }, 1205 | { 1206 | "cell_type": "code", 1207 | "execution_count": null, 1208 | "metadata": {}, 1209 | "outputs": [], 1210 | "source": [ 1211 | "skill_search = \"Python\"" 1212 | ] 1213 | }, 1214 | { 1215 | "cell_type": "code", 1216 | "execution_count": null, 1217 | "metadata": {}, 1218 | "outputs": [], 1219 | "source": [ 1220 | "similar_skills_df = driver.execute_query(\n", 1221 | " \"\"\"\n", 1222 | " MATCH (s:Skill{name: $skill_search})\n", 1223 | " CALL db.index.vector.queryNodes(\"skill-embeddings\", 10, s.embedding) YIELD node, score\n", 1224 | " WITH node as skill, score ORDER BY score DESC\n", 1225 | " WHERE node.name <> s.name AND score > 0.9\n", 1226 | " RETURN skill.name, score\n", 1227 | " \"\"\",\n", 1228 | " database_=DATABASE,\n", 1229 | " routing_=RoutingControl.READ,\n", 1230 | " result_transformer_= lambda r: r.to_df(),\n", 1231 | " skill_search = skill_search\n", 1232 | "\n", 1233 | ")\n", 1234 | "similar_skills_df" 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "markdown", 1239 | "metadata": {}, 1240 | "source": [ 1241 | "We can now also search with terms other than the skills already in the database. 
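\n",
    "\n",
    "As a quick pairwise sanity check first (a sketch; it assumes a recent Neo4j 5 release where the `vector.similarity.cosine()` function is available), two stored skill embeddings can also be compared directly:\n",
    "\n",
    "```\n",
    "MATCH (a:Skill {name: 'Python'}), (b:Skill {name: 'Java'})\n",
    "RETURN vector.similarity.cosine(a.embedding, b.embedding) AS score\n",
    "```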
" 1242 | ] 1243 | }, 1244 | { 1245 | "cell_type": "code", 1246 | "execution_count": null, 1247 | "metadata": {}, 1248 | "outputs": [], 1249 | "source": [ 1250 | "embeddings = OpenAIEmbeddings(model=EMBEDDINGS_MODEL)" 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "markdown", 1255 | "metadata": {}, 1256 | "source": [ 1257 | "Some suggestions to search for: \n", 1258 | "- data visualizations and dashboards\n", 1259 | "- deployments\n", 1260 | "- API coding\n", 1261 | "- Machine Learning frameworks\n", 1262 | "- Cloud expertise" 1263 | ] 1264 | }, 1265 | { 1266 | "cell_type": "code", 1267 | "execution_count": null, 1268 | "metadata": {}, 1269 | "outputs": [], 1270 | "source": [ 1271 | "skill_search = \"API coding\"" 1272 | ] 1273 | }, 1274 | { 1275 | "cell_type": "code", 1276 | "execution_count": null, 1277 | "metadata": {}, 1278 | "outputs": [], 1279 | "source": [ 1280 | "driver.execute_query(\n", 1281 | " '''\n", 1282 | " CALL db.index.vector.queryNodes(\"skill-embeddings\", 10, $query_vector) YIELD node, score\n", 1283 | " WHERE score > 0.89\n", 1284 | " RETURN node.name AS skill, score\n", 1285 | " ''',\n", 1286 | " database_ = DATABASE,\n", 1287 | " routing_ = RoutingControl.READ,\n", 1288 | " result_transformer_ = lambda r: r.to_df(),\n", 1289 | " query_vector = embeddings.embed_query(skill_search)\n", 1290 | ")" 1291 | ] 1292 | }, 1293 | { 1294 | "cell_type": "markdown", 1295 | "metadata": {}, 1296 | "source": [ 1297 | "Create relationship for similar sematic skills" 1298 | ] 1299 | }, 1300 | { 1301 | "cell_type": "code", 1302 | "execution_count": null, 1303 | "metadata": {}, 1304 | "outputs": [], 1305 | "source": [ 1306 | "driver.execute_query(\n", 1307 | " \"\"\"\n", 1308 | " CALL apoc.periodic.iterate(\n", 1309 | " \"MATCH (skill1:Skill) RETURN skill1\",\n", 1310 | " \"WITH skill1 \n", 1311 | " CALL db.index.vector.queryNodes('skill-embeddings', 10, skill1.embedding) YIELD node, score\n", 1312 | " WITH skill1, node as skill2, score ORDER BY score DESC\n", 1313 | " WHERE skill1.name < skill2.name AND score > 0.92\n", 1314 | " MERGE (skill1)-[s:SIMILAR_SEMANTIC]->(skill2)\n", 1315 | " SET s.score = score \n", 1316 | " \",\n", 1317 | " {batchSize: 1000}\n", 1318 | " )\n", 1319 | " \"\"\",\n", 1320 | " database_=DATABASE,\n", 1321 | " routing_=RoutingControl.WRITE,\n", 1322 | " result_transformer_= lambda r: r.to_df()\n", 1323 | ")" 1324 | ] 1325 | }, 1326 | { 1327 | "cell_type": "markdown", 1328 | "metadata": {}, 1329 | "source": [ 1330 | "Let's look in the browser how these relationships look like. " 1331 | ] 1332 | }, 1333 | { 1334 | "cell_type": "markdown", 1335 | "metadata": {}, 1336 | "source": [ 1337 | "```MATCH p=()-[:SIMILAR_SEMANTIC]->() RETURN p```" 1338 | ] 1339 | }, 1340 | { 1341 | "cell_type": "markdown", 1342 | "metadata": {}, 1343 | "source": [ 1344 | "What are similar skills in the database now? 
" 1345 | ] 1346 | }, 1347 | { 1348 | "cell_type": "code", 1349 | "execution_count": null, 1350 | "metadata": {}, 1351 | "outputs": [], 1352 | "source": [ 1353 | "similar_skills_df = driver.execute_query(\n", 1354 | " \"\"\"\n", 1355 | " MATCH (s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)\n", 1356 | " WHERE s1.name < s2.name\n", 1357 | " RETURN s1.name AS skill1, r.score AS score, s2.name AS skill2\n", 1358 | " ORDER BY score DESC\n", 1359 | " \"\"\",\n", 1360 | " database_=DATABASE,\n", 1361 | " routing_=RoutingControl.READ,\n", 1362 | " result_transformer_= lambda r: r.to_df()\n", 1363 | ")" 1364 | ] 1365 | }, 1366 | { 1367 | "cell_type": "code", 1368 | "execution_count": null, 1369 | "metadata": {}, 1370 | "outputs": [], 1371 | "source": [ 1372 | "similar_skills_df" 1373 | ] 1374 | }, 1375 | { 1376 | "cell_type": "markdown", 1377 | "metadata": {}, 1378 | "source": [ 1379 | "## Now we can find more people with based on Semantic Similarity" 1380 | ] 1381 | }, 1382 | { 1383 | "cell_type": "markdown", 1384 | "metadata": {}, 1385 | "source": [ 1386 | "Check the following in the browser:\n", 1387 | "```\n", 1388 | "MATCH (p1:Person {name: \"John Garcia\"})-[:KNOWS]->(s:Skill)\n", 1389 | "WITH p1, COLLECT(s.name) as skills_1\n", 1390 | "CALL (p1, p1){\n", 1391 | " MATCH p=(p1)-[:KNOWS]->(s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)<-[:KNOWS]-(p2:Person)\n", 1392 | " RETURN p\n", 1393 | " UNION \n", 1394 | " MATCH (p1)-[r:SIMILAR_SKILLSET]->(p2:Person), p=(p2)-[:KNOWS]->(:Skill)\n", 1395 | " RETURN p\n", 1396 | "}\n", 1397 | "RETURN p\n", 1398 | "```" 1399 | ] 1400 | }, 1401 | { 1402 | "cell_type": "markdown", 1403 | "metadata": {}, 1404 | "source": [ 1405 | "The following persons give some interesting results: \"Amelia Davis\", \"Victoria Thomas\", \"John Walker\"" 1406 | ] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | "execution_count": null, 1411 | "metadata": {}, 1412 | "outputs": [], 1413 | "source": [ 1414 | "person_name_1 = \"John Garcia\"" 1415 | ] 1416 | }, 1417 | { 1418 | "cell_type": "code", 1419 | "execution_count": null, 1420 | "metadata": {}, 1421 | "outputs": [], 1422 | "source": [ 1423 | "similar_persons_df = driver.execute_query(\n", 1424 | " \"\"\"\n", 1425 | " MATCH (p1:Person {name: $person_name_1})-[:KNOWS]->(s:Skill)\n", 1426 | " WITH p1, COLLECT(s.name) as skills_1\n", 1427 | " CALL (p1){\n", 1428 | " MATCH (p1)-[:KNOWS]->(s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)<-[:KNOWS]-(p2:Person)\n", 1429 | " RETURN p1 as person_1, p2 as person_2, SUM(r.score) AS score\n", 1430 | " UNION \n", 1431 | " MATCH (p1)-[r:SIMILAR_SKILLSET]->(p2:Person)\n", 1432 | " RETURN p1 as person_1, p2 AS person_2, SUM(r.overlap) AS score\n", 1433 | " }\n", 1434 | " WITH person_1.name as person_1, skills_1, person_2, SUM(score) as score\n", 1435 | " WHERE score >= 1\n", 1436 | " MATCH (person_2)-[:KNOWS]->(s:Skill)\n", 1437 | " RETURN person_1, skills_1, person_2.name as person_2, COLLECT(s.name) as skills_2, score\n", 1438 | " ORDER BY score DESC\n", 1439 | " \"\"\",\n", 1440 | " database_=DATABASE,\n", 1441 | " routing_=RoutingControl.READ,\n", 1442 | " result_transformer_= lambda r: r.to_df(),\n", 1443 | " person_name_1 = person_name_1\n", 1444 | ")" 1445 | ] 1446 | }, 1447 | { 1448 | "cell_type": "code", 1449 | "execution_count": null, 1450 | "metadata": {}, 1451 | "outputs": [], 1452 | "source": [ 1453 | "similar_persons_df" 1454 | ] 1455 | }, 1456 | { 1457 | "cell_type": "markdown", 1458 | "metadata": {}, 1459 | "source": [ 1460 | "Check in the browser the following: \n", 1461 | "\n", 1462 | 
"```\n", 1463 | "MATCH p=(p1:Person {name: \"John Garcia\"})-[:KNOWS]->(s:Skill)-[:SIMILAR_SEMANTIC]->(:Skill)<-[:KNOWS]-(p2:Person{name:\"Matthew Miller\"})\n", 1464 | "RETURN p \n", 1465 | "UNION \n", 1466 | "MATCH p=(p1:Person {name: \"John Garcia\"})-[:KNOWS]->(s:Skill)<-[:KNOWS]-(p2:Person{name:\"Matthew Miller\"})\n", 1467 | "RETURN p\n", 1468 | "```\n", 1469 | "\n", 1470 | "```\n", 1471 | "MATCH p=(p1:Person {name: \"John Garcia\"})-[:KNOWS]->(s:Skill)-[:SIMILAR_SEMANTIC*0..2]->(:Skill)<-[:KNOWS]-(p2:Person{name:\"Matthew Miller\"})\n", 1472 | "RETURN p \n", 1473 | "UNION \n", 1474 | "MATCH p=(p1:Person {name: \"John Garcia\"})-[:KNOWS]->(s:Skill)<-[:KNOWS]-(p2:Person{name:\"Matthew Miller\"})\n", 1475 | "RETURN p\n", 1476 | "```" 1477 | ] 1478 | }, 1479 | { 1480 | "cell_type": "markdown", 1481 | "metadata": {}, 1482 | "source": [ 1483 | "Calculate for all of them with score > 3" 1484 | ] 1485 | }, 1486 | { 1487 | "cell_type": "code", 1488 | "execution_count": null, 1489 | "metadata": {}, 1490 | "outputs": [], 1491 | "source": [ 1492 | "similar_persons_df = driver.execute_query(\n", 1493 | " \"\"\"\n", 1494 | " MATCH (p1:Person)-[:KNOWS]->(s:Skill)\n", 1495 | " WITH p1, COLLECT(s.name) as skills_1\n", 1496 | " CALL (p1, p1){\n", 1497 | " MATCH (p1)-[:KNOWS]->(s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)<-[:KNOWS]-(p2:Person)\n", 1498 | " RETURN p1 as person_1, p2 as person_2, SUM(r.score) AS score\n", 1499 | " UNION \n", 1500 | " MATCH (p1)-[r:SIMILAR_SKILLSET]-(p2:Person)\n", 1501 | " RETURN p1 as person_1, p2 AS person_2, SUM(r.overlap) AS score\n", 1502 | " }\n", 1503 | " WITH person_1.name as person_1, skills_1, person_2, SUM(score) as score\n", 1504 | " WHERE score > 3\n", 1505 | " MATCH (person_2)-[:KNOWS]->(s:Skill)\n", 1506 | " RETURN person_1, skills_1, person_2.name as person_2, COLLECT(s.name) as skills_2, score\n", 1507 | " ORDER BY score DESC\n", 1508 | " \"\"\",\n", 1509 | " database_=DATABASE,\n", 1510 | " routing_=RoutingControl.READ,\n", 1511 | " result_transformer_= lambda r: r.to_df()\n", 1512 | ")" 1513 | ] 1514 | }, 1515 | { 1516 | "cell_type": "code", 1517 | "execution_count": null, 1518 | "metadata": {}, 1519 | "outputs": [], 1520 | "source": [ 1521 | "similar_persons_df" 1522 | ] 1523 | } 1524 | ], 1525 | "metadata": { 1526 | "colab": { 1527 | "provenance": [] 1528 | }, 1529 | "kernelspec": { 1530 | "display_name": "Python 3 (ipykernel)", 1531 | "language": "python", 1532 | "name": "python3" 1533 | }, 1534 | "language_info": { 1535 | "codemirror_mode": { 1536 | "name": "ipython", 1537 | "version": 3 1538 | }, 1539 | "file_extension": ".py", 1540 | "mimetype": "text/x-python", 1541 | "name": "python", 1542 | "nbconvert_exporter": "python", 1543 | "pygments_lexer": "ipython3", 1544 | "version": "3.11.5" 1545 | } 1546 | }, 1547 | "nbformat": 4, 1548 | "nbformat_minor": 4 1549 | } 1550 | -------------------------------------------------------------------------------- /talent/module_02_unstructured_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Module 2 - Taming Unstructured Data\n", 8 | "\n", 9 | "This module has the following objectives:\n", 10 | "- Creating a graph from Unstructured Data" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "id": "FHKg4DVZiQ98" 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "# !pip install graphdatascience neo4j dotenv pydantic openai" 
22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "Import our usual suspects (and some more...)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import os\n", 38 | "import pandas as pd\n", 39 | "from dotenv import load_dotenv\n", 40 | "from graphdatascience import GraphDataScience\n", 41 | "from neo4j import Query, GraphDatabase, RoutingControl, Result\n", 42 | "from typing import List, Optional\n", 43 | "from pydantic import BaseModel, Field, validator\n", 44 | "from openai import OpenAI\n", 45 | "import json" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "ynPe6RLRWSKd" 52 | }, 53 | "source": [ 54 | "## Setup" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "pa61u1jfyk3t" 61 | }, 62 | "source": [ 63 | "Load env variables" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "env_file = 'ws.env'" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "id": "CHR_0lmElZ-R" 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "if os.path.exists(env_file):\n", 84 | " load_dotenv(env_file, override=True)\n", 85 | "\n", 86 | " # Neo4j\n", 87 | " HOST = os.getenv('NEO4J_URI')\n", 88 | " USERNAME = os.getenv('NEO4J_USERNAME')\n", 89 | " PASSWORD = os.getenv('NEO4J_PASSWORD')\n", 90 | " DATABASE = os.getenv('NEO4J_DATABASE')\n", 91 | "\n", 92 | " # AI\n", 93 | " OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')\n", 94 | " os.environ['OPENAI_API_KEY']=OPENAI_API_KEY\n", 95 | " LLM = os.getenv('LLM')\n", 96 | " EMBEDDINGS_MODEL = os.getenv('EMBEDDINGS_MODEL')\n", 97 | "else:\n", 98 | " print(f\"File {env_file} not found.\")" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "Connect to neo4j db" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "driver = GraphDatabase.driver(\n", 115 | " HOST,\n", 116 | " auth=(USERNAME, PASSWORD)\n", 117 | ")" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Test the connection" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "id": "5w4eCb7xZZ-S" 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "driver.execute_query(\n", 136 | " \"\"\"\n", 137 | " MATCH (n) RETURN COUNT(n) as Count\n", 138 | " \"\"\",\n", 139 | " database_=DATABASE,\n", 140 | " routing_=RoutingControl.READ,\n", 141 | " result_transformer_= lambda r: r.to_df()\n", 142 | ")" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "## Unstructured data" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Let's define some unstructured data from some of our Neo4j colleagues" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "list_of_bio = [\n", 166 | " \"\"\" \n", 167 | " Kristof \"speedy gonzales\" Neys, Graph Data Science Director at Neo4j. Kristof excels at Machine \n", 168 | " Learning and has written more Quantified Path Patterns in Cypher than anyone else. 
Want to know more,\n", 169 | " drop him an email at kristof.neys@neo4j.com\n", 170 | " \"\"\",\n", 171 | " \"\"\" \n", 172 | " Håkan Löfqvist, Solutions Engineer at Neo4j. Håkan prefers using Java \n", 173 | " over Python, but nothing beats hacking cypher queries and using Graph Technology \n", 174 | " to deliver insane success :) Email: hakan.lofqvist@neo4j.com\n", 175 | " \"\"\",\n", 176 | "]" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "for bio in list_of_bio:\n", 186 | " print(bio)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## Define the Domain Model" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "[Pydantic Models](https://docs.pydantic.dev/latest/api/base_model/) are simply classes which inherit from BaseModel and define fields as annotated attributes." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "class Skill(BaseModel):\n", 210 | " \"\"\"\n", 211 | " Represents a professional skill or knowledge of a person.\n", 212 | " \"\"\"\n", 213 | " name: str = Field(..., description=\"Shortened name of the skill\")\n", 214 | " \n", 215 | "class Person(BaseModel):\n", 216 | " \"\"\"\n", 217 | " Represents a person with a name, email and skills.\n", 218 | " \"\"\"\n", 219 | " name: str = Field(..., description=\"Full name of person\")\n", 220 | " email: str = Field(..., description=\"A person's email address\")\n", 221 | " skills: List[Skill] = Field(..., description=\"List of skills known by the person\"\n", 222 | " )\n", 223 | " \n", 224 | "class PersonList(BaseModel):\n", 225 | " persons:List[Person]" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "system_message = \"\"\"\n", 235 | " You are an expert in extracting structured information from person resumes.\n", 236 | " Identify key details such as:\n", 237 | " - Name of the person\n", 238 | " - Email address of the person\n", 239 | " - Skills known by the person\n", 240 | " \n", 241 | " Present the extracted information in a clear, structured format. 
Be concise, focusing on:\n", 242 | " - Key skills\n", 243 | " - Full name of person\n", 244 | " Ignore nicknames, titles, roles and company information. Be short and concise with the skills.\n", 245 | "\"\"\"" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "client = OpenAI()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "def extract(document, model=LLM, temperature=0):\n", 264 | " response = client.beta.chat.completions.parse(\n", 265 | " model=model,\n", 266 | " temperature=temperature,\n", 267 | " messages=[\n", 268 | " {\"role\": \"system\", \"content\": system_message},\n", 269 | " {\"role\": \"user\", \"content\": document},\n", 270 | " ],\n", 271 | " response_format=Person,\n", 272 | " )\n", 273 | " return json.loads(response.choices[0].message.content)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "rows = []\n", 283 | "for text in list_of_bio:\n", 284 | " data = extract(text)\n", 285 | " rows.append(data)\n", 286 | "rows" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "structured_data = PersonList.model_validate({'persons':rows})" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "for k, details_list in structured_data.model_dump().items():\n", 305 | " print(f\"{k}\")\n", 306 | " for details in details_list:\n", 307 | " for key, value in details.items():\n", 308 | " print(f\" {key}: {value}\")\n", 309 | " print()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": { 315 | "id": "0OMlYdxHWZLx" 316 | }, 317 | "source": [ 318 | "## Graph creation\n", 319 | "Now that data is structured and validated, we can save it to the database" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "records, summary, keys = driver.execute_query(\n", 329 | " \"\"\"\n", 330 | " UNWIND $rows AS row\n", 331 | " MERGE (p:Person{email:row.email})\n", 332 | " SET p.name = row.name\n", 333 | " WITH p, row\n", 334 | " FOREACH (skill IN row.skills | MERGE (s:Skill{name:skill.name}) MERGE (p)-[:KNOWS]->(s) )\n", 335 | " RETURN COUNT (*) AS rows_processed\n", 336 | " \"\"\",\n", 337 | " database_=DATABASE,\n", 338 | " routing_=RoutingControl.WRITE,\n", 339 | " rows = rows\n", 340 | ")" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "We could also have used [neomodel (OGM)](https://neo4j.com/labs/neomodel/) to update the graph." 
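,
    "\n",
    "A minimal sketch of what that could look like (illustrative only; it assumes `pip install neomodel` and is not executed in this workshop — note these classes would also shadow the Pydantic `Person`/`Skill` models above):\n",
    "\n",
    "```python\n",
    "from neomodel import StructuredNode, StringProperty, RelationshipTo, config\n",
    "\n",
    "# neomodel connects through a single URL; for Aura something like (assumption):\n",
    "# config.DATABASE_URL = f'neo4j+s://{USERNAME}:{PASSWORD}@xxxx.databases.neo4j.io'\n",
    "\n",
    "class Skill(StructuredNode):\n",
    "    name = StringProperty(unique_index=True)\n",
    "\n",
    "class Person(StructuredNode):\n",
    "    email = StringProperty(unique_index=True)\n",
    "    name = StringProperty()\n",
    "    knows = RelationshipTo(Skill, 'KNOWS')\n",
    "\n",
    "# create (or update) nodes and a KNOWS relationship\n",
    "p = Person(email='jane.doe@test.org', name='Jane Doe').save()\n",
    "s = Skill(name='Cypher').save()\n",
    "p.knows.connect(s)\n",
    "```"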
348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "Check the browser with the following:\n", 355 | "\n", 356 | "- ```MATCH p=(n:Person {name: \"Kristof Neys\"})-[:KNOWS]->(:Skill) RETURN p```\n", 357 | "- ```MATCH p=(n:Person {name: \"Håkan Löfqvist\"})-[:KNOWS]->(:Skill) RETURN p```" 358 | ] 359 | } 360 | ], 361 | "metadata": { 362 | "colab": { 363 | "provenance": [] 364 | }, 365 | "kernelspec": { 366 | "display_name": "Python 3 (ipykernel)", 367 | "language": "python", 368 | "name": "python3" 369 | }, 370 | "language_info": { 371 | "codemirror_mode": { 372 | "name": "ipython", 373 | "version": 3 374 | }, 375 | "file_extension": ".py", 376 | "mimetype": "text/x-python", 377 | "name": "python", 378 | "nbconvert_exporter": "python", 379 | "pygments_lexer": "ipython3", 380 | "version": "3.11.5" 381 | } 382 | }, 383 | "nbformat": 4, 384 | "nbformat_minor": 4 385 | } 386 | -------------------------------------------------------------------------------- /talent/module_03_graphrag_agent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Module 3 - GraphRAG and Agents\n", 8 | "\n", 9 | "This module has the following objectives:\n", 10 | "- Experiment with queries for an Agent\n", 11 | "- Define Tooling\n", 12 | "- Create an agent with the available tools\n", 13 | "- Chatbot for an Agent\n", 14 | "- Text2Cypher (if we have time)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "id": "FHKg4DVZiQ98" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "#!pip install graphdatascience neo4j python-dotenv openai langchain langgraph pydantic gradio" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "Import our usual suspects (and some more...)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import os\n", 42 | "import pandas as pd\n", 43 | "from dotenv import load_dotenv\n", 44 | "from graphdatascience import GraphDataScience\n", 45 | "from neo4j import Query, GraphDatabase, RoutingControl, Result\n", 46 | "from langchain.schema import HumanMessage\n", 47 | "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", 48 | "from langchain_core.messages import HumanMessage\n", 49 | "from langchain_core.prompts import ChatPromptTemplate, PromptTemplate\n", 50 | "from langchain_core.output_parsers import StrOutputParser\n", 51 | "from langgraph.prebuilt import create_react_agent\n", 52 | "from openai import OpenAI\n", 53 | "from typing import List, Optional\n", 54 | "from pydantic import BaseModel, Field, validator\n", 55 | "import functools\n", 56 | "from langchain_core.tools import tool\n", 57 | "import gradio as gr\n", 58 | "import time" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "id": "ynPe6RLRWSKd" 65 | }, 66 | "source": [ 67 | "## Setup" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "id": "pa61u1jfyk3t" 74 | }, 75 | "source": [ 76 | "Load env variables" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "env_file = 'ws.env'" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "id": "CHR_0lmElZ-R" 93 | }, 94 | "outputs": [], 95 |
"source": [ 96 | "if os.path.exists(env_file):\n", 97 | " load_dotenv(env_file, override=True)\n", 98 | "\n", 99 | " # Neo4j\n", 100 | " HOST = os.getenv('NEO4J_URI')\n", 101 | " USERNAME = os.getenv('NEO4J_USERNAME')\n", 102 | " PASSWORD = os.getenv('NEO4J_PASSWORD')\n", 103 | " DATABASE = os.getenv('NEO4J_DATABASE')\n", 104 | "\n", 105 | " # AI\n", 106 | " OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')\n", 107 | " os.environ['OPENAI_API_KEY']=OPENAI_API_KEY\n", 108 | " LLM = os.getenv('LLM')\n", 109 | " EMBEDDINGS_MODEL = os.getenv('EMBEDDINGS_MODEL')\n", 110 | "else:\n", 111 | " print(f\"File {env_file} not found.\")" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Connect to neo4j db" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "driver = GraphDatabase.driver(\n", 128 | " HOST,\n", 129 | " auth=(USERNAME, PASSWORD)\n", 130 | ")" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "Test the connection" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "id": "5w4eCb7xZZ-S" 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "driver.execute_query(\n", 149 | " \"\"\"\n", 150 | " MATCH (n) RETURN COUNT(n) as Count\n", 151 | " \"\"\",\n", 152 | " database_=DATABASE,\n", 153 | " routing_=RoutingControl.READ,\n", 154 | " result_transformer_= lambda r: r.to_df()\n", 155 | ")" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "id": "cdTfdAyV2ZaR" 162 | }, 163 | "source": [ 164 | "Test whether we got our constraints" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "id": "cdTfdAyV2ZaR" 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "schema_result_df = driver.execute_query(\n", 176 | " 'show indexes',\n", 177 | " database_=DATABASE,\n", 178 | " routing_=RoutingControl.READ,\n", 179 | " result_transformer_= lambda r: r.to_df()\n", 180 | ")" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "id": "cdTfdAyV2ZaR" 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "schema_result_df.head(100)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "## Agent Thinking\n", 199 | "\n", 200 | "Let's say we want to build an Agent with multiple tools. Let's try to provide the following functionality: \n", 201 | "\n", 202 | "1. Retrieve the skills of a person.\n", 203 | " - Input: Person\n", 204 | " - Output: Skills\n", 205 | " - Example: *What skills does Kristof Neys have?* \n", 206 | "3. Retrieve similar skills to other skills.\n", 207 | " - Input: Skills\n", 208 | " - Output: Skills\n", 209 | " - Example: *What skills are similar to PowerBI and Data Visualization?*\n", 210 | "4. Retrieve similar persons to a person specified. \n", 211 | " - Input: Person\n", 212 | " - Output: Person\n", 213 | " - Example: *\"Which persons have similar skills as Kristof Neys?\"*\n", 214 | "6. 
Retrieve Persons based on a set of skills.\n", 215 | "   - Input: Skills\n", 216 | "   - Output: Person\n", 217 | "   - Example: *Which persons have Python and AWS experience?*" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "embeddings = OpenAIEmbeddings(model=EMBEDDINGS_MODEL)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## 1 - Retrieve Skills of Person" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "Find the connected skills given a person name." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "person_name = \"Lucy Turner\"" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "person_skills_df = driver.execute_query(\n", 259 | "    \"\"\"\n", 260 | "    MATCH (p:Person{name: $person_name})-[:KNOWS]->(s:Skill)\n", 261 | "    RETURN p.name as name, COLLECT(s.name) as skills\n", 262 | "    \"\"\",\n", 263 | "    database_=DATABASE,\n", 264 | "    routing_=RoutingControl.READ,\n", 265 | "    result_transformer_= lambda r: r.to_df(),\n", 266 | "    person_name = person_name\n", 267 | ")" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "person_skills_df" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": { 282 | "id": "0OMlYdxHWZLx" 283 | }, 284 | "source": [ 285 | "## 2 - Retrieve similar skills" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "id": "volCOjn_jjm3" 292 | }, 293 | "source": [ 294 | "Retrieve skills based on a list of skills" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "skills = ['Continuous Delivery', 'Cloud Native', 'Security']\n", 304 | "skills_vectors = embeddings.embed_documents(skills)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "id": "volCOjn_jjm3" 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "similar_skills_df = driver.execute_query(\n", 316 | "    \"\"\"\n", 317 | "    UNWIND $skills_vectors AS v\n", 318 | "    CALL db.index.vector.queryNodes('skill-embeddings', 3, TOFLOATLIST(v)) YIELD node, score\n", 319 | "    WHERE score > 0.89\n", 320 | "    OPTIONAL MATCH (node)-[:SIMILAR_SEMANTIC]-(s:Skill)\n", 321 | "    WITH COLLECT(node) AS nodes, COLLECT(DISTINCT s) AS skills\n", 322 | "    WITH nodes + skills AS all_skills\n", 323 | "    UNWIND all_skills AS skill\n", 324 | "    RETURN DISTINCT skill.name as skill_name\n", 325 | "    \"\"\",\n", 326 | "    database_=DATABASE,\n", 327 | "    routing_=RoutingControl.READ,\n", 328 | "    result_transformer_= lambda r: r.to_df(),\n", 329 | "    skills_vectors = skills_vectors\n", 330 | ")" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "similar_skills_df" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "## 3 - Person Similarity" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "### Strategy 3.1 - Communities" 354 | ] 355 | },
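Note: the `leiden_community` property used in this strategy is assumed to have been written to the graph in an earlier module. As a reminder, here is a sketch of how such a property might be produced with the GDS client — the projection name and configuration are illustrative assumptions, not the workshop's exact code:

```python
# Hypothetical sketch: computing Leiden communities with the GDS client.
# Assumes the GraphDataScience import above and GDS installed on the server;
# the projection name 'person-skill' is an assumption for this sketch.
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD))
gds.set_database(DATABASE)

# Leiden requires an undirected graph, so project KNOWS as UNDIRECTED
G, _ = gds.graph.project(
    'person-skill',
    ['Person', 'Skill'],
    {'KNOWS': {'orientation': 'UNDIRECTED'}}
)

# Write each node's community id back as the 'leiden_community' property
gds.leiden.write(G, writeProperty='leiden_community')
G.drop()
```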
356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "We can use the community here to find similar people" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "person_name_1 = \"John Garcia\"" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "person_similarity_community_df = driver.execute_query(\n", 379 | "    \"\"\"\n", 380 | "    MATCH (p1:Person {name: $person_name_1})-[:KNOWS]->(s:Skill)\n", 381 | "    WITH p1, COLLECT(s.name) as s1\n", 382 | "    MATCH (p2:Person {leiden_community: p1.leiden_community})-[:KNOWS]->(s2:Skill) WHERE p2 <> p1\n", 383 | "    RETURN p1.name AS person_1, s1 AS skills_1, p1.leiden_community AS community, p2.name AS person_2, COLLECT(s2.name) AS skills_2\n", 384 | "    \"\"\",\n", 385 | "    database_=DATABASE,\n", 386 | "    routing_=RoutingControl.READ,\n", 387 | "    result_transformer_= lambda r: r.to_df(),\n", 388 | "    person_name_1 = person_name_1\n", 389 | ")" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "person_similarity_community_df" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "You can find all Skills in the community in the browser:\n", 406 | "```\n", 407 | "MATCH p=(:Person{leiden_community: 88})-[:KNOWS]->(s:Skill)\n", 408 | "RETURN p\n", 409 | "```" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "### Strategy 3.2 - Similar Skillsets" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "We can use the SIMILAR_SKILLSET relationship to find similar persons" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "person_name_1 = \"John Garcia\"" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "person_similar_skillset_df = driver.execute_query(\n", 442 | "    \"\"\"\n", 443 | "    MATCH (p1:Person {name: $person_name_1})-[:KNOWS]->(s:Skill)\n", 444 | "    WITH p1, COLLECT(s.name) as s1\n", 445 | "    MATCH (p1)-[r:SIMILAR_SKILLSET]-(p2:Person)-[:KNOWS]->(s2:Skill)\n", 446 | "    WHERE r.overlap > 1\n", 447 | "    RETURN p1.name AS person_1, s1 AS skills_1, r.overlap AS score, p2.name AS person_2, COLLECT(DISTINCT s2.name) AS skills_2\n", 448 | "    ORDER BY score DESC\n", 449 | "    \"\"\",\n", 450 | "    database_=DATABASE,\n", 451 | "    routing_=RoutingControl.READ,\n", 452 | "    result_transformer_= lambda r: r.to_df(),\n", 453 | "    person_name_1 = person_name_1\n", 454 | ")" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "person_similar_skillset_df" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "### Strategy 3.3 - Similar Skillsets and Semantic Meaning" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "Use semantic meaning and skill overlap to find people with similar skills" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | 
"person_name_1 = \"John Garcia\"" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "person_similarity_df = driver.execute_query(\n", 496 | " \"\"\"\n", 497 | " MATCH (p1:Person {name: $person_name_1})-[:KNOWS]->(s:Skill)\n", 498 | " WITH p1, COLLECT(s.name) as skills_1\n", 499 | " CALL (p1){\n", 500 | " MATCH (p1)-[:KNOWS]->(s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)<-[:KNOWS]-(p2:Person)\n", 501 | " \n", 502 | " RETURN p1 as person_1, p2 as person_2, SUM(r.score) AS score\n", 503 | " UNION\n", 504 | " MATCH (p1)-[r:SIMILAR_SKILLSET]-(p2:Person)\n", 505 | " RETURN p1 as person_1, p2 AS person_2, SUM(r.overlap) AS score\n", 506 | " }\n", 507 | " WITH person_1.name as person_1, skills_1, person_2, SUM(score) as score\n", 508 | " WHERE score >= 1\n", 509 | " MATCH (person_2)-[:KNOWS]->(s:Skill)\n", 510 | " RETURN person_1, skills_1, person_2.name as person_2, COLLECT(s.name) as skills_2, score\n", 511 | " ORDER BY score DESC LIMIT 5\n", 512 | " \"\"\",\n", 513 | " database_=DATABASE,\n", 514 | " routing_=RoutingControl.READ,\n", 515 | " result_transformer_= lambda r: r.to_df(),\n", 516 | " person_name_1 = person_name_1\n", 517 | ")" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "person_similarity_df" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "## 4 - Recommendation of Person given on skills" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "## Vector Index Search" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "skills = ['AWS', 'Security']" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "skills_vectors = embeddings.embed_documents(skills)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "We get the approximate top 10 nearest nodes to the search vector `v` and take the 3 first returned. 
Then we put the matches together in one list and apply the same ranking as before (number of matching skills)" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "nn_df = driver.execute_query(\n", 575 | "    \"\"\"UNWIND $skills_vectors AS v\n", 576 | "    CALL db.index.vector.queryNodes('skill-embeddings', 3, TOFLOATLIST(v)) YIELD node, score\n", 577 | "    WHERE score > 0.85\n", 578 | "    WITH v as embedding, COALESCE(COLLECT(node.name), []) AS top\n", 579 | "    RETURN *\n", 580 | "    \"\"\",\n", 581 | "    database_=DATABASE,\n", 582 | "    routing_=RoutingControl.READ,\n", 583 | "    result_transformer_= lambda r: r.to_df(),\n", 584 | "    skills_vectors = skills_vectors\n", 585 | ")\n", 586 | "nn_df['skills'] = skills\n", 587 | "cols = list(nn_df.columns)[-1:] + list(nn_df.columns)[:-1]\n", 588 | "nn_df = nn_df[cols]" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [ 597 | "nn_df" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": null, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "find_persons_given_skills_df = driver.execute_query(\n", 607 | "    \"\"\"\n", 608 | "    UNWIND $skills_vectors AS v\n", 609 | "    CALL db.index.vector.queryNodes('skill-embeddings', 3, TOFLOATLIST(v)) YIELD node, score\n", 610 | "    WHERE score > 0.85\n", 611 | "    OPTIONAL MATCH (node)-[:SIMILAR_SEMANTIC]-(s:Skill)\n", 612 | "    WITH COLLECT(node) AS nodes, COLLECT(DISTINCT s) AS skills\n", 613 | "    WITH nodes + skills AS all_skills\n", 614 | "    UNWIND all_skills AS skill\n", 615 | "    MATCH (p:Person)-[:KNOWS]->(skill)\n", 616 | "    RETURN p.name AS person, COUNT(DISTINCT(skill)) AS skill_count, COLLECT(DISTINCT(skill.name)) as similar_skills\n", 617 | "    ORDER BY skill_count DESC LIMIT 10\n", 618 | "    \"\"\",\n", 619 | "    database_=DATABASE,\n", 620 | "    routing_=RoutingControl.READ,\n", 621 | "    result_transformer_= lambda r: r.to_df(),\n", 622 | "    skills_vectors = skills_vectors\n", 623 | ")" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [ 632 | "find_persons_given_skills_df" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "## Agents with GraphRAG" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": {}, 645 | "source": [ 646 | "### Let's create a retrieval agent" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": null, 652 | "metadata": {}, 653 | "outputs": [], 654 | "source": [ 655 | "class Skill(BaseModel):\n", 656 | "    \"\"\"\n", 657 | "    Represents a professional skill or knowledge of a person.\n", 658 | "    \"\"\"\n", 659 | "    name: str = Field(..., description=\"Shortened name of the skill\")" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "### Tool 1" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": null, 672 | "metadata": {}, 673 | "outputs": [], 674 | "source": [ 675 | "def retrieve_skills_of_person(person_name: str) -> pd.DataFrame:\n", 676 | "    \"\"\"Retrieve the skills of a person. 
The person is specified by their name.\"\"\"\n", 677 | "    return driver.execute_query(\n", 678 | "    \"\"\"\n", 679 | "    MATCH (p:Person{name: $person_name})-[:KNOWS]->(s:Skill)\n", 680 | "    RETURN p.name as name, COLLECT(s.name) as skills\n", 681 | "    \"\"\",\n", 682 | "    database_=DATABASE,\n", 683 | "    routing_=RoutingControl.READ,\n", 684 | "    result_transformer_= lambda r: r.to_df(),\n", 685 | "    person_name = person_name\n", 686 | "    )" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": null, 692 | "metadata": {}, 693 | "outputs": [], 694 | "source": [ 695 | "retrieve_skills_of_person('Mia Nelson') " 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "metadata": {}, 701 | "source": [ 702 | "### Tool 2" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "def find_similar_skills(skills: List[Skill]) -> pd.DataFrame:\n", 712 | "    \"\"\"Find skills similar to the skills specified. Skills are specified by a list of their names\"\"\"\n", 713 | "    skills = [s.name for s in skills]\n", 714 | "    skills_vectors = embeddings.embed_documents(skills)\n", 715 | "    return driver.execute_query(\n", 716 | "    \"\"\"\n", 717 | "    UNWIND $skills_vectors AS v\n", 718 | "    CALL db.index.vector.queryNodes('skill-embeddings', 3, TOFLOATLIST(v)) YIELD node, score\n", 719 | "    WHERE score > 0.89\n", 720 | "    OPTIONAL MATCH (node)-[:SIMILAR_SEMANTIC]-(s:Skill)\n", 721 | "    WITH COLLECT(node) AS nodes, COLLECT(DISTINCT s) AS skills\n", 722 | "    WITH nodes + skills AS all_skills\n", 723 | "    UNWIND all_skills AS skill\n", 724 | "    RETURN DISTINCT skill.name as skill_name\n", 725 | "    \"\"\",\n", 726 | "    database_=DATABASE,\n", 727 | "    routing_=RoutingControl.READ,\n", 728 | "    result_transformer_= lambda r: r.to_df(),\n", 729 | "    skills_vectors = skills_vectors\n", 730 | ")" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": null, 736 | "metadata": {}, 737 | "outputs": [], 738 | "source": [ 739 | "find_similar_skills([Skill(name='Python')])" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "### Tool 3" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": null, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [ 755 | "def person_similarity(person_name: str) -> pd.DataFrame:\n", 756 | "    \"\"\"Find persons similar to the one specified, based on their skill similarity. 
Persons are specified by their name.\"\"\"\n", 757 | "    \n", 758 | "    return driver.execute_query(\n", 759 | "    \"\"\"\n", 760 | "    MATCH (p1:Person {name: $person_name})-[:KNOWS]->(s:Skill)\n", 761 | "    WITH p1, COLLECT(s.name) as skills_1\n", 762 | "    CALL (p1){\n", 763 | "        MATCH (p1)-[:KNOWS]->(s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)<-[:KNOWS]-(p2:Person)\n", 764 | "        RETURN p1 as person_1, p2 as person_2, SUM(r.score) AS score\n", 765 | "        UNION \n", 766 | "        MATCH (p1)-[r:SIMILAR_SKILLSET]-(p2:Person)\n", 767 | "        RETURN p1 as person_1, p2 AS person_2, SUM(r.overlap) AS score\n", 768 | "    }\n", 769 | "    WITH person_1.name as person_1, skills_1, person_2, SUM(score) as score\n", 770 | "    WHERE score >= 1\n", 771 | "    MATCH (person_2)-[:KNOWS]->(s:Skill)\n", 772 | "    RETURN person_1, skills_1, person_2.name as person_2, COLLECT(s.name) as skills_2, score\n", 773 | "    ORDER BY score DESC LIMIT 5\n", 774 | "    \"\"\",\n", 775 | "    database_=DATABASE,\n", 776 | "    routing_=RoutingControl.READ,\n", 777 | "    result_transformer_= lambda r: r.to_df(),\n", 778 | "    person_name = person_name\n", 779 | "    )" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [ 788 | "person_similarity(\"Christopher Jackson\")" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "### Tool 4" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": null, 801 | "metadata": {}, 802 | "outputs": [], 803 | "source": [ 804 | "def find_person_based_on_skills(skills: List[Skill]) -> pd.DataFrame:\n", 805 | "    \"\"\"\n", 806 | "    Find persons based on skills they have. Skills are specified by their names. \n", 807 | "    Note that semantically similar skills are also matched and counted. 
\n", 808 | " \"\"\"\n", 809 | " skills = [s.name for s in skills]\n", 810 | " skills_vectors = embeddings.embed_documents(skills)\n", 811 | " return driver.execute_query(\n", 812 | " \"\"\"\n", 813 | " UNWIND $skills_vectors AS v\n", 814 | " CALL db.index.vector.queryNodes('skill-embeddings', 3, TOFLOATLIST(v)) YIELD node, score\n", 815 | " WHERE score > 0.89\n", 816 | " OPTIONAL MATCH (node)-[:SIMILAR_SEMANTIC]-(s:Skill)\n", 817 | " WITH COLLECT(node) AS nodes, COLLECT(DISTINCT s) AS skills\n", 818 | " WITH nodes + skills AS all_skills\n", 819 | " UNWIND all_skills AS skill\n", 820 | " MATCH (p:Person)-[:KNOWS]->(skill)\n", 821 | " RETURN p.name AS person, COUNT(DISTINCT(skill)) AS score, COLLECT(DISTINCT(skill.name)) as similar_skills\n", 822 | " ORDER BY score DESC LIMIT 10\n", 823 | " \"\"\",\n", 824 | " database_=DATABASE,\n", 825 | " routing_=RoutingControl.READ,\n", 826 | " result_transformer_= lambda r: r.to_df(),\n", 827 | " skills_vectors = skills_vectors\n", 828 | ")" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": {}, 835 | "outputs": [], 836 | "source": [ 837 | "find_person_based_on_skills([Skill(name='Security'), Skill(name='Pandas')])" 838 | ] 839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "metadata": {}, 843 | "source": [ 844 | "## Setting up the Agent" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": null, 850 | "metadata": {}, 851 | "outputs": [], 852 | "source": [ 853 | "llm = ChatOpenAI(model_name=LLM, temperature=0)" 854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": null, 859 | "metadata": {}, 860 | "outputs": [], 861 | "source": [ 862 | "response = llm.invoke([HumanMessage(content=\"hi!\")])\n", 863 | "response.content" 864 | ] 865 | }, 866 | { 867 | "cell_type": "code", 868 | "execution_count": null, 869 | "metadata": {}, 870 | "outputs": [], 871 | "source": [ 872 | "tools = [\n", 873 | " retrieve_skills_of_person, \n", 874 | " find_similar_skills,\n", 875 | " person_similarity,\n", 876 | " find_person_based_on_skills,\n", 877 | "]\n", 878 | "\n", 879 | "llm_with_tools = llm.bind_tools(tools)" 880 | ] 881 | }, 882 | { 883 | "cell_type": "code", 884 | "execution_count": null, 885 | "metadata": {}, 886 | "outputs": [], 887 | "source": [ 888 | "response = llm_with_tools.invoke([HumanMessage(content=\"What skills does Kristof Neys have?\")])\n", 889 | "\n", 890 | "print(f\"ContentString: {response.content}\")\n", 891 | "print(f\"ToolCalls: {response.tool_calls}\")" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": null, 897 | "metadata": {}, 898 | "outputs": [], 899 | "source": [ 900 | "response = llm_with_tools.invoke([HumanMessage(content=\"What skills are similar to PowerBI and Data Visualization?\")])\n", 901 | "\n", 902 | "print(f\"ContentString: {response.content}\")\n", 903 | "print(f\"ToolCalls: {response.tool_calls}\")" 904 | ] 905 | }, 906 | { 907 | "cell_type": "code", 908 | "execution_count": null, 909 | "metadata": {}, 910 | "outputs": [], 911 | "source": [ 912 | "response = llm_with_tools.invoke([HumanMessage(content=\"Which persons have similar skills as Kristof Neys?\")])\n", 913 | "\n", 914 | "print(f\"ContentString: {response.content}\")\n", 915 | "print(f\"ToolCalls: {response.tool_calls}\")" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": null, 921 | "metadata": {}, 922 | "outputs": [], 923 | "source": [ 924 | "response = llm_with_tools.invoke([HumanMessage(content=\"Which persons 
have Python and AWS experience?\")])\n", 925 | "\n", 926 | "print(f\"ContentString: {response.content}\")\n", 927 | "print(f\"ToolCalls: {response.tool_calls}\")" 928 | ] 929 | }, 930 | { 931 | "cell_type": "markdown", 932 | "metadata": {}, 933 | "source": [ 934 | "We can see that there's now no text content, but there is a tool call! The model wants us to call one of our graph retrieval tools. This isn't calling the tool yet - it's just telling us to. In order to actually call it, we'll want to create our agent." 935 | ] 936 | }, 937 | { 938 | "cell_type": "markdown", 939 | "metadata": {}, 940 | "source": [ 941 | "## Running Agents with LangGraph" 942 | ] 943 | }, 944 | { 945 | "cell_type": "code", 946 | "execution_count": null, 947 | "metadata": {}, 948 | "outputs": [], 949 | "source": [ 950 | "agent_executor = create_react_agent(llm, tools)" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": null, 956 | "metadata": {}, 957 | "outputs": [], 958 | "source": [ 959 | "response = agent_executor.invoke({\"messages\": [HumanMessage(content=\"hi!\")]})" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": null, 965 | "metadata": {}, 966 | "outputs": [], 967 | "source": [ 968 | "response[\"messages\"]" 969 | ] 970 | }, 971 | { 972 | "cell_type": "markdown", 973 | "metadata": {}, 974 | "source": [ 975 | "#### Run some examples! " 976 | ] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": null, 981 | "metadata": {}, 982 | "outputs": [], 983 | "source": [ 984 | "def ask_to_agent(question):\n", 985 | "    for step in agent_executor.stream(\n", 986 | "        {\"messages\": [HumanMessage(content=question)]},\n", 987 | "        stream_mode=\"values\",\n", 988 | "    ):\n", 989 | "        step[\"messages\"][-1].pretty_print()" 990 | ] 991 | }, 992 | { 993 | "cell_type": "code", 994 | "execution_count": null, 995 | "metadata": {}, 996 | "outputs": [], 997 | "source": [ 998 | "question = \"What skills does Kristof Neys have?\"" 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "code", 1003 | "execution_count": null, 1004 | "metadata": {}, 1005 | "outputs": [], 1006 | "source": [ 1007 | "ask_to_agent(question)" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": null, 1013 | "metadata": {}, 1014 | "outputs": [], 1015 | "source": [ 1016 | "question = \"What skills are similar to PowerBI and Data Visualization?\"" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "code", 1021 | "execution_count": null, 1022 | "metadata": {}, 1023 | "outputs": [], 1024 | "source": [ 1025 | "ask_to_agent(question)" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": null, 1031 | "metadata": {}, 1032 | "outputs": [], 1033 | "source": [ 1034 | "question = \"Which persons have similar skills as Daniel Hill?\"" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "code", 1039 | "execution_count": null, 1040 | "metadata": {}, 1041 | "outputs": [], 1042 | "source": [ 1043 | "ask_to_agent(question)" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "code", 1048 | "execution_count": null, 1049 | "metadata": {}, 1050 | "outputs": [], 1051 | "source": [ 1052 | "question = \"Which persons have Python and AWS experience?\"" 1053 | ] 1054 | }, 1055 | { 1056 | "cell_type": "code", 1057 | "execution_count": null, 1058 | "metadata": {}, 1059 | "outputs": [], 1060 | "source": [ 1061 | "ask_to_agent(question)" 1062 | ] 1063 | }, 1064 | { 1065 | "cell_type": "markdown", 1066 | "metadata": {}, 1067 | "source": [ 1068 | "## Chatbot" 1069 | ] 1070 | }, 1071 | { 1072 | 
"cell_type": "markdown", 1073 | "metadata": {}, 1074 | "source": [ 1075 | "Now create a chatbot with the agent providing the responses" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "code", 1080 | "execution_count": null, 1081 | "metadata": {}, 1082 | "outputs": [], 1083 | "source": [ 1084 | "def user(user_message, history):\n", 1085 | " if history is None:\n", 1086 | " history = []\n", 1087 | " history.append({\"role\": \"user\", \"content\": user_message})\n", 1088 | " return \"\", history\n", 1089 | "\n", 1090 | "def get_answer(history):\n", 1091 | " steps = []\n", 1092 | " full_prompt = \"\\n\".join([f\"{msg['role'].capitalize()}: {msg['content']}\" for msg in history])\n", 1093 | " \n", 1094 | " for step in agent_executor.stream(\n", 1095 | " {\"messages\": [HumanMessage(content=full_prompt)]},\n", 1096 | " stream_mode=\"values\",\n", 1097 | " ):\n", 1098 | " step[\"messages\"][-1].pretty_print()\n", 1099 | " steps.append(step[\"messages\"][-1].content)\n", 1100 | " \n", 1101 | " return steps[-1]\n", 1102 | "\n", 1103 | "def bot(history):\n", 1104 | " bot_message = get_answer(history)\n", 1105 | " history.append({\"role\": \"assistant\", \"content\": \"\"})\n", 1106 | "\n", 1107 | " for character in bot_message:\n", 1108 | " history[-1][\"content\"] += character\n", 1109 | " time.sleep(0.01)\n", 1110 | " yield history\n", 1111 | "\n", 1112 | "with gr.Blocks() as demo:\n", 1113 | " chatbot = gr.Chatbot(\n", 1114 | " label=\"Chatbot on a Graph\",\n", 1115 | " avatar_images=[\n", 1116 | " \"https://png.pngtree.com/png-vector/20220525/ourmid/pngtree-concept-of-facial-animal-avatar-chatbot-dog-chat-machine-illustration-vector-png-image_46652864.jpg\",\n", 1117 | " \"https://d-cb.jc-cdn.com/sites/crackberry.com/files/styles/larger/public/article_images/2023/08/openai-logo.jpg\"\n", 1118 | " ],\n", 1119 | " type=\"messages\", \n", 1120 | " )\n", 1121 | " msg = gr.Textbox(label=\"Message\")\n", 1122 | " clear = gr.Button(\"Clear\")\n", 1123 | "\n", 1124 | " msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(\n", 1125 | " bot, [chatbot], chatbot\n", 1126 | " )\n", 1127 | "\n", 1128 | " clear.click(lambda: [], None, chatbot, queue=False)\n", 1129 | "\n", 1130 | "demo.queue()\n", 1131 | "demo.launch(share=True)" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "markdown", 1136 | "metadata": {}, 1137 | "source": [ 1138 | "If you want to have the light-mode for the chatbot paste the following after the URL: /?__theme=light" 1139 | ] 1140 | }, 1141 | { 1142 | "cell_type": "markdown", 1143 | "metadata": {}, 1144 | "source": [ 1145 | "### Text2Cypher" 1146 | ] 1147 | }, 1148 | { 1149 | "cell_type": "markdown", 1150 | "metadata": {}, 1151 | "source": [ 1152 | "If time allows we can still experiment with the Text2Cypher functionality. " 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": null, 1158 | "metadata": {}, 1159 | "outputs": [], 1160 | "source": [ 1161 | "text2cypher_prompt = PromptTemplate.from_template(\n", 1162 | " \"\"\"\n", 1163 | " Task: Generate a Cypher statement for querying a Neo4j graph database from a user input. 
\n", 1164 | " - Do not include triple backticks ``` or ```cypher or any additional text except the generated Cypher statement in your response.\n", 1165 | " - Do not use any properties or relationships not included in the schema.\n", 1166 | " \n", 1167 | " Schema:\n", 1168 | " {schema}\n", 1169 | " \n", 1170 | " #User Input\n", 1171 | " {question}\n", 1172 | " \n", 1173 | " Cypher query:\n", 1174 | " \"\"\"\n", 1175 | ")" 1176 | ] 1177 | }, 1178 | { 1179 | "cell_type": "code", 1180 | "execution_count": null, 1181 | "metadata": {}, 1182 | "outputs": [], 1183 | "source": [ 1184 | "annotated_schema = \"\"\"\n", 1185 | " Nodes:\n", 1186 | " Person:\n", 1187 | " description: \"A person in our talent pool.\"\n", 1188 | " properties:\n", 1189 | " name:\n", 1190 | " type: \"string\"\n", 1191 | " description: \"The full name of the person. serves as a unique identifier.\"\n", 1192 | " email:\n", 1193 | " type: \"string\"\n", 1194 | " description: \"The email address of the person.\"\n", 1195 | " leiden_community:\n", 1196 | " type: \"integer\"\n", 1197 | " description: \"The talent community for the person. People in the same talent segment share similar skills.\"\n", 1198 | " Skill:\n", 1199 | " description: \"A professional skill.\"\n", 1200 | " properties:\n", 1201 | " name:\n", 1202 | " type: \"string\"\n", 1203 | " description: \"The unique name of the skill.\"\n", 1204 | " Relationships:\n", 1205 | " KNOWS:\n", 1206 | " description: \"A person knowing a skill.\"\n", 1207 | " query_pattern: \"(:Person)-[:KNOWS]->(:Skill)\"\n", 1208 | " \"\"\"" 1209 | ] 1210 | }, 1211 | { 1212 | "cell_type": "code", 1213 | "execution_count": null, 1214 | "metadata": {}, 1215 | "outputs": [], 1216 | "source": [ 1217 | "text2cypher_llm = ChatOpenAI(model=LLM, temperature=0)" 1218 | ] 1219 | }, 1220 | { 1221 | "cell_type": "code", 1222 | "execution_count": null, 1223 | "metadata": {}, 1224 | "outputs": [], 1225 | "source": [ 1226 | "@tool\n", 1227 | "def perform_aggregation_query(question: str) -> pd.DataFrame:\n", 1228 | " \"\"\"\n", 1229 | " perform an aggregation query on the Neo4j graph database and obtain the results.\n", 1230 | " \"\"\"\n", 1231 | " prompt = text2cypher_prompt.invoke({'schema': annotated_schema, 'question': question})\n", 1232 | " query = text2cypher_llm.invoke(prompt).content\n", 1233 | " print(f\"executing Cypher query:\\n{query}\")\n", 1234 | " return driver.execute_query(\n", 1235 | " query,\n", 1236 | " database_=DATABASE,\n", 1237 | " routing_=RoutingControl.READ,\n", 1238 | " result_transformer_= lambda r: r.to_df()\n", 1239 | " ) " 1240 | ] 1241 | }, 1242 | { 1243 | "cell_type": "code", 1244 | "execution_count": null, 1245 | "metadata": {}, 1246 | "outputs": [], 1247 | "source": [ 1248 | "perform_aggregation_query('describe communities by skills') " 1249 | ] 1250 | }, 1251 | { 1252 | "cell_type": "code", 1253 | "execution_count": null, 1254 | "metadata": {}, 1255 | "outputs": [], 1256 | "source": [ 1257 | "perform_aggregation_query('how many people share skills with Isabella Allen, and what are the skills')" 1258 | ] 1259 | }, 1260 | { 1261 | "cell_type": "code", 1262 | "execution_count": null, 1263 | "metadata": {}, 1264 | "outputs": [], 1265 | "source": [ 1266 | "perform_aggregation_query('Can you list me a 5 random person name from the database?')" 1267 | ] 1268 | } 1269 | ], 1270 | "metadata": { 1271 | "colab": { 1272 | "provenance": [] 1273 | }, 1274 | "kernelspec": { 1275 | "display_name": "Python 3 (ipykernel)", 1276 | "language": "python", 1277 | "name": "python3" 1278 | }, 
1279 | "language_info": { 1280 | "codemirror_mode": { 1281 | "name": "ipython", 1282 | "version": 3 1283 | }, 1284 | "file_extension": ".py", 1285 | "mimetype": "text/x-python", 1286 | "name": "python", 1287 | "nbconvert_exporter": "python", 1288 | "pygments_lexer": "ipython3", 1289 | "version": "3.11.5" 1290 | } 1291 | }, 1292 | "nbformat": 4, 1293 | "nbformat_minor": 4 1294 | } 1295 | -------------------------------------------------------------------------------- /talent/workshop_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/ef9077abccaa9238f2abd556d1ef5beac64610d7/talent/workshop_slides.pdf -------------------------------------------------------------------------------- /talent/ws_temp.txt: -------------------------------------------------------------------------------- 1 | NEO4J_USERNAME = "neo4j" 2 | NEO4J_PASSWORD = "12345" 3 | NEO4J_URI = "neo4j+s://xxxxx.databases.neo4j.io:7687" 4 | NEO4J_DATABASE = "neo4j" 5 | OPENAI_API_KEY = "" 6 | LLM = "" 7 | EMBEDDINGS_MODEL = "" --------------------------------------------------------------------------------