├── src └── python │ └── codequestion │ ├── __init__.py │ ├── etl │ ├── __init__.py │ └── stackexchange │ │ ├── __init__.py │ │ ├── decompress.py │ │ ├── sift.py │ │ ├── execute.py │ │ ├── xml2db.py │ │ └── db2qa.py │ ├── path.py │ ├── topics.py │ ├── tokenizer.py │ ├── models.py │ ├── download.py │ ├── console.py │ ├── index.py │ ├── vectors.py │ ├── search.py │ └── evaluate.py ├── demo.gif ├── logo.png ├── images ├── topics.gif ├── vscode.png ├── architecture.png ├── architecture-dark.png └── architecture.excalidraw ├── .gitignore ├── config ├── index.v1.yml └── index.yml ├── .coveragerc ├── test ├── python │ ├── utils.py │ ├── testdownload.py │ ├── testconsole.py │ └── testindex.py └── stackexchange │ └── query.txt ├── .pre-commit-config.yaml ├── .pylintrc ├── Makefile ├── .github └── workflows │ └── build.yml ├── setup.py ├── LICENSE └── README.md /src/python/codequestion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/python/codequestion/etl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neuml/codequestion/HEAD/demo.gif -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neuml/codequestion/HEAD/logo.png -------------------------------------------------------------------------------- /images/topics.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/topics.gif 
-------------------------------------------------------------------------------- /images/vscode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/vscode.png -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/architecture.png -------------------------------------------------------------------------------- /images/architecture-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/architecture-dark.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | htmlcov/ 4 | *egg-info/ 5 | __pycache__/ 6 | .coverage 7 | .coverage.* 8 | *.pyc 9 | -------------------------------------------------------------------------------- /config/index.v1.yml: -------------------------------------------------------------------------------- 1 | path: stackexchange-300d.magnitude 2 | content: True 3 | objects: True 4 | quantize: True 5 | storevectors: True 6 | scoring: 7 | method: bm25 8 | k1: 0.1 9 | pca: 3 10 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = src/python 3 | concurrency = multiprocessing,thread 4 | disable_warnings = no-data-collected 5 | 6 | [combine] 7 | disable_warnings = no-data-collected 8 | 9 | [report] 10 | exclude_lines = 11 | if __name__ == .__main__.: 12 | -------------------------------------------------------------------------------- 
/src/python/codequestion/etl/stackexchange/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stack Exchange imports 3 | """ 4 | 5 | from .db2qa import DB2QA 6 | from .decompress import Decompress 7 | from .execute import Execute 8 | from .sift import Sift 9 | from .xml2db import XML2DB 10 | -------------------------------------------------------------------------------- /test/python/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utils module 3 | """ 4 | 5 | 6 | class Utils: 7 | """ 8 | Utility constants and methods 9 | """ 10 | 11 | PATH = "/tmp/codequestion" 12 | STACKEXCHANGE = PATH + "/stackexchange" 13 | QUESTIONS = STACKEXCHANGE + "/questions.db" 14 | TESTS = PATH + "/test" 15 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/pylint 3 | rev: v2.12.1 4 | hooks: 5 | - id: pylint 6 | args: 7 | - -d import-error 8 | - -d duplicate-code 9 | - repo: https://github.com/ambv/black 10 | rev: 22.3.0 11 | hooks: 12 | - id: black 13 | language_version: python3 14 | -------------------------------------------------------------------------------- /config/index.yml: -------------------------------------------------------------------------------- 1 | path: sentence-transformers/all-MiniLM-L6-v2 2 | content: True 3 | objects: True 4 | quantize: True 5 | functions: 6 | - name: graph 7 | function: graph.attribute 8 | expressions: 9 | - name: topic 10 | expression: graph(indexid, 'topic') 11 | - name: topicrank 12 | expression: graph(indexid, 'topicrank') 13 | graph: 14 | limit: 100 15 | minscore: 0.2 16 | topics: 17 | stopwords: 18 | - stackoverflow 19 | -------------------------------------------------------------------------------- /.pylintrc: 
-------------------------------------------------------------------------------- 1 | [BASIC] 2 | module-rgx=[a-z_][a-zA-Z0-9_]{2,30}$ 3 | method-rgx=[a-z_][a-zA-Z0-9_]{2,30}$ 4 | function-rgx=[a-z_][a-zA-Z0-9_]{2,30}$ 5 | argument-rgx=[a-z_][a-zA-Z0-9_]{0,30}$ 6 | variable-rgx=[a-z_][a-zA-Z0-9_]{0,30}$ 7 | attr-rgx=[a-z_][a-zA-Z0-9_]{0,30}$ 8 | 9 | [DESIGN] 10 | max-args=10 11 | max-locals=40 12 | max-returns=10 13 | max-attributes=20 14 | min-public-methods=0 15 | 16 | [FORMAT] 17 | max-line-length=150 18 | 19 | [MESSAGES CONTROL] 20 | disable=R0201,W0621 21 | -------------------------------------------------------------------------------- /test/python/testdownload.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download module tests 3 | """ 4 | 5 | import os 6 | import unittest 7 | 8 | from codequestion.download import Download 9 | 10 | # pylint: disable=C0411 11 | from utils import Utils 12 | 13 | 14 | class TestDownload(unittest.TestCase): 15 | """ 16 | Download tests. 17 | """ 18 | 19 | def testDownload(self): 20 | """ 21 | Test download 22 | """ 23 | 24 | download = Download() 25 | download( 26 | "https://github.com/neuml/codequestion/archive/refs/heads/master.zip", 27 | Utils.PATH, 28 | ) 29 | 30 | # Check archive uncompressed successfully 31 | self.assertTrue(os.path.exists(Utils.PATH + "/codequestion-master/setup.py")) 32 | -------------------------------------------------------------------------------- /src/python/codequestion/path.py: -------------------------------------------------------------------------------- 1 | """ 2 | Path module 3 | """ 4 | 5 | from rich.console import Console 6 | 7 | 8 | class Path: 9 | """ 10 | Traverse semantic graphs. 11 | """ 12 | 13 | def __init__(self, embeddings): 14 | """ 15 | Creates a new path action. 
class Decompress:
    """
    Runs a 7zip extract command via an external process.
    """

    def __call__(self, path):
        """
        Extracts Posts.xml from the 7z archives in path into that same directory.

        Uses the 7za binary when it is found on the PATH, otherwise falls back to 7z.
        Output of the external process is echoed to stdout line by line.

        Args:
            path: input directory path with 7z files
        """

        # Check for 7za, default to 7z
        binary = "7za" if shutil.which("7za") else "7z"

        # Normalize Windows path separators
        path = path.replace("\\", "/")

        # Build the argument list directly instead of formatting a string and re-splitting it,
        # so paths with spaces or quotes stay a single argument. No shell is involved, the
        # *.7z wildcard is passed to the binary as-is.
        command = [binary, "e", f"{path}/*.7z", "Posts.xml", "-y", f"-o{path}"]

        # Show a shell-quoted form of the command being run
        print(shlex.join(command))

        # Start command and stream output until the process closes stdout.
        # The context manager waits for the process to finish on exit.
        with subprocess.Popen(
            command, stdout=subprocess.PIPE, universal_newlines=True
        ) as process:
            for output in process.stdout:
                print(output.strip())
/tmp/codequestion/Stsbenchmark.tar.gz 25 | 26 | # Unit tests 27 | test: 28 | ${PYTHON} -m unittest discover -v -s ${TEST_DIR} 29 | 30 | # Run tests while calculating code coverage 31 | coverage: 32 | coverage run -m unittest discover -v -s ${TEST_DIR} 33 | coverage combine 34 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # GitHub Actions build workflow 2 | name: build 3 | 4 | on: ["push", "pull_request"] 5 | 6 | jobs: 7 | build: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: [ubuntu-latest, macos-latest, windows-latest] 12 | 13 | timeout-minutes: 60 14 | steps: 15 | - name: Checkout code 16 | uses: actions/checkout@v3 17 | 18 | - name: Install Python - Linux 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: 3.8 22 | if: matrix.os == 'ubuntu-latest' 23 | 24 | - name: Install Python - macOS/Windows 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: 3.9 28 | if: matrix.os != 'ubuntu-latest' 29 | 30 | - name: Install dependencies - macOS 31 | run: | 32 | echo "OMP_NUM_THREADS=1" >> $GITHUB_ENV 33 | if: matrix.os == 'macos-latest' 34 | 35 | - name: Install dependencies - Windows 36 | run: choco install wget 37 | if: matrix.os == 'windows-latest' 38 | 39 | - name: Build 40 | run: | 41 | pip install -U pip 42 | pip install -U wheel coverage coveralls 43 | pip install . 
class Topics:
    """
    Query topic models.
    """

    def __init__(self, embeddings):
        """
        Creates a new topics action.

        Args:
            embeddings: embeddings instance
        """

        self.embeddings = embeddings
        self.topics = embeddings.graph.topics

        # Build on-the-fly topics index. Each topic name is indexed under its position in
        # self.topics, __call__ uses that position to map a search result back to a name.
        self.topicembed = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2"})
        self.topicembed.index((x, topic, None) for x, topic in enumerate(self.topics))

    def __call__(self, query=None):
        """
        Runs a topics action. Prints each matching topic along with an example question.

        Args:
            query: optional query to filter topics, otherwise the first 10 topics are shown
        """

        console = Console()

        topics = list(self.topics.keys())
        if query:
            results = self.topicembed.search(query, 10)
        else:
            # Never reference more topics than exist, small graphs can have fewer than 10
            results = [(x, 1.0) for x in range(min(10, len(topics)))]

        for uid, score in results:
            if score >= 0.1:
                topic = topics[uid]
                console.print(f"[bright_green]{topic}[/bright_green]")

                # Print example question
                # NOTE(review): topic is interpolated into the SQL string unescaped - confirm
                # topic names can never contain a single quote
                sql = f"select id, question from txtai where similar('{topic}')"
                rows = self.embeddings.search(sql, 1)

                # Skip the example when the search has no match instead of failing on [0]
                if rows:
                    result = rows[0]
                    console.print(f"{result['question']} ({result['id']})\n")
28 | 29 | Args: 30 | text: input text 31 | 32 | Returns: 33 | list of tokens 34 | """ 35 | 36 | # Convert to all lowercase, split on whitespace, strip punctuation 37 | tokens = [token.strip(Tokenizer.PUNCTUATION) for token in text.lower().split()] 38 | 39 | # Filter tokens that are numbers or a valid string at least 2 characters long. Remove stop words. 40 | # Assume tokens already are uncased (all lowercase) 41 | return [ 42 | token 43 | for token in tokens 44 | if (re.match(r"^[#*+\-.0-9:@_a-z]{2,}$", token) or token.isdigit()) 45 | and token not in Tokenizer.STOP_WORDS 46 | ] 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # pylint: disable = C0111 2 | from setuptools import find_packages, setup 3 | 4 | with open("README.md", "r", encoding="utf-8") as f: 5 | # Remove GitHub dark mode images 6 | DESCRIPTION = "".join([line for line in f if "gh-dark-mode-only" not in line]) 7 | 8 | setup( 9 | name="codequestion", 10 | version="2.2.0", 11 | author="NeuML", 12 | description="Ask coding questions directly from the terminal", 13 | long_description=DESCRIPTION, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/neuml/codequestion", 16 | project_urls={ 17 | "Documentation": "https://github.com/neuml/codequestion", 18 | "Issue Tracker": "https://github.com/neuml/codequestion/issues", 19 | "Source Code": "https://github.com/neuml/codequestion", 20 | }, 21 | license="Apache 2.0: http://www.apache.org/licenses/LICENSE-2.0", 22 | packages=find_packages(where="src/python"), 23 | package_dir={"": "src/python"}, 24 | keywords="search embedding machine-learning nlp", 25 | python_requires=">=3.8", 26 | entry_points={ 27 | "console_scripts": [ 28 | "codequestion = codequestion.console:main", 29 | ], 30 | }, 31 | install_requires=[ 32 | "html2markdown>=0.1.7", 33 | "rich>=12.0.1", 34 | "scipy>=1.4.1", 35 | 
class Models:
    """
    Helper methods that resolve local data paths.
    """

    @staticmethod
    def basePath(create=False):
        """
        Resolves the base data directory.

        The CODEQUESTION_HOME environment variable is used when it is set and not empty.
        Otherwise, the path is ~/.codequestion.

        Args:
            create: if directory should be created

        Returns:
            path
        """

        # Environment variable takes priority, fall back to a directory under the user home
        home = os.environ.get("CODEQUESTION_HOME") or os.path.join(
            os.path.expanduser("~"), ".codequestion"
        )

        if create:
            os.makedirs(home, exist_ok=True)

        return home

    @staticmethod
    def modelPath(name, create=False):
        """
        Resolves the directory for a named model.

        Args:
            name: model name
            create: if directory should be created

        Returns:
            path
        """

        # Models are stored at <base>/models/<name>
        directory = os.path.join(Models.basePath(), "models", name)

        if create:
            os.makedirs(directory, exist_ok=True)

        return directory

    @staticmethod
    def vectorPath(name, create=False):
        """
        Resolves the file path for a named vectors file.

        Args:
            name: vectors name
            create: if parent directory should be created

        Returns:
            path
        """

        # Vectors are stored at <base>/vectors/<name>, only the parent directory is created
        directory = os.path.join(Models.basePath(), "vectors")

        if create:
            os.makedirs(directory, exist_ok=True)

        return os.path.join(directory, name)
class Sift:
    """
    Filters a raw posts.xml file for matching results. Uses raw text processing to avoid overhead of parsing xml.
    """

    # Attribute patterns. The leading word boundary keeps Id from matching inside
    # longer attribute names such as PostTypeId or ParentId.
    ANSWER = re.compile(r'\bAcceptedAnswerId="([0-9]+)"')
    SCORE = re.compile(r'\bScore="([0-9]+)"')
    POSTID = re.compile(r'\bId="([0-9]+)"')

    def __call__(self, infile, outfile):
        """
        Processes a raw Posts.xml file. The Posts dump is in Id order ascending.

        Keeps questions that have an accepted answer and a score of 10+, along with the
        accepted answers for those questions. Matching rows are wrapped in a posts root
        element.

        Args:
            infile: path to input file
            outfile: path to output file
        """

        print(f"Converting {infile} to {outfile}")

        # Set of accepted answer ids still to be found
        ids = set()

        with open(infile, encoding="utf-8") as xml, open(
            outfile, "w", encoding="utf-8"
        ) as output:
            # Write xml start
            output.write("<posts>\n")

            for line in xml:
                # PostTypeId = 1 (Question) with accepted answer.
                if "AcceptedAnswerId" in line:
                    # Parse answer id and score
                    answer = self.parse(Sift.ANSWER, line)
                    score = self.parse(Sift.SCORE, line)

                    # Require a score of 10+. Negative scores do not match the
                    # pattern and parse as -1.
                    if score >= 10:
                        # Add answer id to ids list
                        ids.add(answer)

                        # Write accepted line
                        output.write(line)

                # PostTypeId = 2 (Answer)
                elif 'PostTypeId="2"' in line:
                    # Parse post id
                    pid = self.parse(Sift.POSTID, line)

                    if pid in ids:
                        # Write output line and remove from ids list
                        output.write(line)
                        ids.remove(pid)

            # Write xml end
            output.write("</posts>\n")

    def parse(self, pattern, line):
        """
        Parses an int field and returns the value if found. Returns -1 if no value found.

        Args:
            pattern: regex pattern (string or compiled) with the value as group 1
            line: input line

        Return:
            field value
        """

        field = re.search(pattern, line)
        return int(field.group(1)) if field else -1
27 | """ 28 | 29 | os.environ["CODEQUESTION_HOME"] = Utils.STACKEXCHANGE + ".console" 30 | 31 | # Run etl process 32 | Execute.SOURCES = ["ai"] 33 | 34 | execute = Execute() 35 | execute(Utils.STACKEXCHANGE) 36 | 37 | # Create embeddings index 38 | index = Index() 39 | index(Utils.PATH + "/index.yml", Utils.QUESTIONS) 40 | 41 | cls.console = Console() 42 | cls.console.preloop() 43 | 44 | def testHelp(self): 45 | """ 46 | Test help command 47 | """ 48 | 49 | self.assertIn(".limit", self.command("help")) 50 | self.assertIn(".limit", self.command("help .limit")) 51 | 52 | def testLimit(self): 53 | """ 54 | Test .limit command 55 | """ 56 | 57 | self.assertEqual(self.command(".limit 1"), "") 58 | 59 | def testPath(self): 60 | """ 61 | Test .path command 62 | """ 63 | 64 | self.assertIn("1. ", self.command(".path 0 1")) 65 | 66 | def testSearch(self): 67 | """ 68 | Test search 69 | """ 70 | 71 | self.assertIn("Question", self.command("ai")) 72 | 73 | def testShow(self): 74 | """ 75 | Test .show command 76 | """ 77 | 78 | self.assertIn("Question", self.command(".show 0")) 79 | 80 | def testtopics(self): 81 | """ 82 | Test .topics command 83 | """ 84 | 85 | self.assertNotIn("ERROR", self.command(".topics")) 86 | self.assertNotIn("ERROR", self.command(".topics ai")) 87 | 88 | def command(self, command): 89 | """ 90 | Runs a console command. 
91 | 92 | Args: 93 | command: command to run 94 | 95 | Returns: 96 | command output 97 | """ 98 | 99 | # Run info 100 | output = io.StringIO() 101 | with contextlib.redirect_stdout(output): 102 | self.console.onecmd(command) 103 | 104 | return output.getvalue() 105 | -------------------------------------------------------------------------------- /test/python/testindex.py: -------------------------------------------------------------------------------- 1 | """ 2 | Index module tests 3 | """ 4 | 5 | import contextlib 6 | import io 7 | import os 8 | import unittest 9 | 10 | from codequestion.evaluate import StackExchange, STS 11 | from codequestion.index import Index 12 | from codequestion.search import Search 13 | from codequestion.vectors import Vectors 14 | 15 | # pylint: disable=C0411 16 | from utils import Utils 17 | 18 | 19 | class TestIndex(unittest.TestCase): 20 | """ 21 | Index tests. 22 | """ 23 | 24 | def testTransformers(self): 25 | """ 26 | Test transformers-backed index 27 | """ 28 | 29 | os.environ["CODEQUESTION_HOME"] = Utils.STACKEXCHANGE + ".transformers" 30 | 31 | # Create embeddings index 32 | index = Index() 33 | index(Utils.PATH + "/index.yml", Utils.QUESTIONS) 34 | 35 | # Run tests 36 | self.runTests() 37 | 38 | def testWordVectors(self): 39 | """ 40 | Test word vector-backed index 41 | """ 42 | 43 | os.environ["CODEQUESTION_HOME"] = Utils.STACKEXCHANGE + ".wv" 44 | 45 | # Build word vectors 46 | vectors = Vectors() 47 | vectors(Utils.QUESTIONS, 300, 3) 48 | 49 | # Create embeddings index 50 | index = Index() 51 | index(Utils.PATH + "/index.v1.yml", Utils.QUESTIONS) 52 | 53 | # Run tests 54 | self.runTests() 55 | 56 | def runTests(self): 57 | """ 58 | Run index tests. 59 | """ 60 | 61 | self.search() 62 | self.stackexchange() 63 | self.sts() 64 | 65 | def search(self): 66 | """ 67 | Run search test. 
class Execute:
    """
    Main execution method to build a consolidated questions.db file from Stack Exchange Data Dumps.
    """

    # List of sources. Each name is a subdirectory of the base path passed to __call__.
    SOURCES = [
        "ai",
        "android",
        "apple",
        "arduino",
        "askubuntu",
        "avp",
        "codereview",
        "cs",
        "datascience",
        "dba",
        "devops",
        "dsp",
        "raspberrypi",
        "reverseengineering",
        "scicomp",
        "serverfault",
        "security",
        "stackoverflow",
        "stats",
        "superuser",
        "unix",
        "vi",
        "wordpress",
    ]

    def __call__(self, path):
        """
        Converts a directory of raw sources to a single output questions database.

        The consolidated database is written to questions.db under path.

        Args:
            path: base directory path
        """

        # Iterates through a directory of raw sources and builds staging databases
        databases = self.process(path)

        # Output database file
        qafile = os.path.join(path, "questions.db")

        # Build consolidated SQLite questions database
        db2qa = DB2QA()
        db2qa(databases, qafile)

    def process(self, path):
        """
        Iterates through each source and converts raw xml to SQLite databases. Returns a list of
        output databases.

        For each source directory, this extracts Posts.xml, filters it to Filtered.xml and
        loads the filtered file into a database named after the source.

        Args:
            path: input directory path with raw source data directories

        Returns:
            paths to output databases
        """

        databases = []

        # Extract filtered content and build source databases to process
        for source in Execute.SOURCES:
            spath = os.path.join(path, source)

            # Extract Posts.xml from 7za file
            decompress = Decompress()
            decompress(spath)

            posts = os.path.join(spath, "Posts.xml")
            filtered = os.path.join(spath, "Filtered.xml")

            # Filter Posts.xml file for matching questions
            sift = Sift()
            sift(posts, filtered)

            dbfile = os.path.join(spath, f"{source}.db")

            # Convert filtered Posts.xml file to SQLite db file
            xml2db = XML2DB()
            xml2db(filtered, dbfile)

            # Track database path as it is built, in source order
            databases.append(dbfile)

        # List of all databases to consolidate
        return databases


if __name__ == "__main__":
    # Input data directory is a required argument
    if len(sys.argv) < 2:
        sys.exit("Data directory argument is required, exiting")

    path = sys.argv[1]
    if not os.path.exists(path):
        # Message goes to stderr with a non-zero exit status
        sys.exit("Data directory does not exist, exiting")

    # Run ETL process
    execute = Execute()
    execute(path)
22 | """ 23 | 24 | super().__init__() 25 | 26 | # Display configuration 27 | self.intro = "codequestion console" 28 | self.prompt = ">>> " 29 | self.console = RichConsole() 30 | 31 | # Search parameters 32 | self.search = None 33 | self.embeddings = None 34 | self.limit = 1 35 | 36 | # Topics action 37 | self.topics = None 38 | 39 | # Path traversal action 40 | self.path = None 41 | 42 | def preloop(self): 43 | """ 44 | Loads initial configuration. 45 | """ 46 | 47 | # Load query and embeddings 48 | self.search = Search() 49 | self.embeddings = self.search.embeddings 50 | 51 | # Load graph-based actions, if necessary 52 | if self.embeddings.graph: 53 | if self.embeddings.graph.topics: 54 | self.topics = Topics(self.embeddings) 55 | 56 | self.path = Path(self.embeddings) 57 | 58 | def default(self, line): 59 | """ 60 | Default event loop. 61 | 62 | Args: 63 | line: command line 64 | """ 65 | 66 | # pylint: disable=W0703 67 | try: 68 | command = line.lower() 69 | if command.startswith(".limit"): 70 | command = self.split(line) 71 | self.limit = int(command[1]) 72 | elif command.startswith(".path") and self.path: 73 | command = self.split(line) 74 | start, end = command[1].split() 75 | self.path(int(start), int(end)) 76 | elif command.startswith(".show"): 77 | command = self.split(line) 78 | self.search(uid=command[1]) 79 | elif command.startswith(".topics") and self.topics: 80 | command = self.split(line) 81 | self.topics(command[1] if len(command) > 1 else None) 82 | else: 83 | # Search is default action 84 | self.search(line, self.limit) 85 | except Exception: 86 | self.console.print_exception() 87 | 88 | def do_help(self, arg): 89 | """ 90 | Shows a help message. 
91 | 92 | Args: 93 | arg: optional help message argument 94 | """ 95 | 96 | commands = { 97 | ".limit": "(number)\t\tset the maximum number of query rows to return", 98 | ".path": "(start) (end)\tprints a semantic path between questions", 99 | ".show": "(id)\t\tprint question with specified id", 100 | ".topics": "(query)\t\tshows topics best matching query. if query is empty, top topics are shown", 101 | } 102 | 103 | if arg in commands: 104 | self.console.print(f"{arg} {commands[arg]}") 105 | else: 106 | for command, message in commands.items(): 107 | self.console.print(f"{command} {message}") 108 | 109 | self.console.print("\nDefault mode runs a search query") 110 | 111 | def split(self, command, default=None): 112 | """ 113 | Splits command by whitespace. 114 | 115 | Args: 116 | command: command line 117 | default: default command action 118 | 119 | Returns: 120 | command action 121 | """ 122 | 123 | values = command.split(" ", 1) 124 | return values if len(values) > 1 else (command, default) 125 | 126 | 127 | def main(): 128 | """ 129 | Console execution loop. 130 | """ 131 | 132 | Console().cmdloop() 133 | 134 | 135 | if __name__ == "__main__": 136 | main() 137 | -------------------------------------------------------------------------------- /src/python/codequestion/index.py: -------------------------------------------------------------------------------- 1 | """ 2 | Index module 3 | """ 4 | 5 | import os.path 6 | import sqlite3 7 | import sys 8 | 9 | from tqdm import tqdm 10 | from txtai.app import Application 11 | from txtai.embeddings import Embeddings 12 | 13 | from .models import Models 14 | from .tokenizer import Tokenizer 15 | 16 | 17 | class Index: 18 | """ 19 | Builds a new embeddings index. 20 | """ 21 | 22 | def __call__(self, config, dbfile): 23 | """ 24 | Builds and saves an embeddings index. 
def build(self, config, dbfile):
    """
    Reads the index configuration, then creates and populates an embeddings index
    from the questions database.

    Args:
        config: input configuration file
        dbfile: input SQLite file

    Returns:
        embeddings index
    """

    # Load index settings
    settings = Application.read(config)

    # Only configurations with a scoring section have their path resolved as a vectors file
    if settings.get("scoring"):
        vectors = Models.vectorPath(settings["path"])
        settings["path"] = os.path.join(vectors)

    index = Embeddings(settings)

    # The scoring index needs its own pass over the data before the main indexing pass
    if index.isweighted():
        scoring = self.stream(dbfile, index, "Building scoring index")
        index.score(scoring)

    documents = self.stream(dbfile, index, "Building embeddings index")
    index.index(documents)

    return index
# pylint: disable=C0103
if __name__ == "__main__":
    # Path to index configuration file (first argument)
    config = sys.argv[1] if len(sys.argv) > 1 else None
    if not config or not os.path.exists(config):
        print("Path to index configuration file does not exist, exiting")
        sys.exit()

    # Path to questions.db file (second argument). sys.argv[2] only exists when
    # at least three entries are present, so the guard must be > 2.
    dbfile = sys.argv[2] if len(sys.argv) > 2 else None
    if not dbfile or not os.path.exists(dbfile):
        print("Path to questions.db file does not exist, exiting")
        sys.exit()

    # Build index
    index = Index()
    index(config, dbfile)
class RowIterator:
    """
    Iterates over rows in a database query. Allows for multiple iterations.

    Each call to iter() starts a fresh pass over the database.
    """

    def __init__(self, dbfile):
        """
        Initializes RowIterator.

        Args:
            dbfile: path to SQLite file
        """

        # Store database file
        self.dbfile = dbfile

        self.rows = self.stream(self.dbfile)

    def __iter__(self):
        """
        Creates a database query generator.

        Returns:
            generator
        """

        # Close the previous generator so its database connection is released
        # before a new pass starts
        self.rows.close()

        # reset the generator
        self.rows = self.stream(self.dbfile)
        return self

    def __next__(self):
        """
        Gets the next result in the current generator.

        Returns:
            tokens

        Raises:
            StopIteration: when the current generator is exhausted
        """

        # next() raises StopIteration itself; stream only yields non-empty tokens
        return next(self.rows)

    def stream(self, dbfile):
        """
        Connects to SQLite file at dbfile and yields parsed tokens for each row.

        Args:
            dbfile: path to SQLite file
        """

        # Connection to database file
        db = sqlite3.connect(dbfile)

        try:
            cur = db.cursor()

            # Get total number of questions, used for the progress bar
            cur.execute("SELECT count(*) from Questions")
            total = cur.fetchone()[0]

            # Query for iterating over questions.db rows
            cur.execute("SELECT Question, Source, Tags FROM questions")

            for question in tqdm(cur, total=total, desc="Tokenizing input"):
                # Tokenize question, source and tags
                tokens = Tokenizer.tokenize(
                    question[0] + " " + question[1] + " " + question[2]
                )

                # Skip documents with no tokens parsed
                if tokens:
                    yield tokens
        finally:
            # Free database resources, also when iteration stops early or fails
            db.close()
def __call__(self, query=None, limit=1, uid=None):
    """
    Runs a search action and renders each result.

    Args:
        query: query string
        limit: number of results to return
        uid: id to show, takes precedence over query when set
    """

    # Query prefix
    prefix = "select id, score, questionuser, question, tags, date, answeruser, object answer, reference from txtai where"

    # NOTE(review): uid and query are interpolated into the SQL text as-is, so a value
    # containing a single quote will break the statement - confirm the escaping or
    # parameter binding supported by the installed txtai version before changing this.
    if uid is not None:
        # ID query
        query = f"{prefix} id = '{uid}'"
    elif self.embeddings.isweighted():
        # Use custom tokenizer for word vector models
        query = Tokenizer.tokenize(query)

        # Run search, an empty token list or an empty result list means nothing to show
        results = self.embeddings.search(query, 1) if query else []
        if not results:
            self.console.print()
            return

        # Build id query for the best match, carrying over its score
        result = results[0]
        query = f"""
            select id, {result.get('score')} score, questionuser, question, tags, date, answeruser, object answer, reference
            from txtai
            where id = '{result.get('id')}'
        """
    else:
        # Default similar clause query
        query = f"{prefix} similar('{query}')"

    # Render results
    for result in self.embeddings.search(query, limit):
        # Show result
        self.result(result, limit)

    self.console.print()
class XML2DB:
    """
    Converts a filtered posts xml file to a staging SQLite database for processing.

    Elements with PostTypeId="1" are stored in the questions table, every other element
    that has a PostTypeId attribute is stored in the answers table.
    """

    # Questions schema (column name -> SQLite column type)
    QUESTIONS = {
        "Id": "INTEGER PRIMARY KEY",
        "AcceptedAnswerId": "INTEGER",
        "CreationDate": "DATETIME",
        "LastActivityDate": "DATETIME",
        "Score": "INTEGER",
        "ViewCount": "INTEGER",
        "OwnerUserId": "INTEGER",
        "OwnerDisplayName": "TEXT",
        "Title": "TEXT",
        "Tags": "TEXT",
        "AnswerCount": "INTEGER",
        "CommentCount": "INTEGER",
        "FavoriteCount": "INTEGER",
        "ClosedDate": "DATETIME",
    }

    # Answers schema (column name -> SQLite column type)
    ANSWERS = {
        "Id": "INTEGER PRIMARY KEY",
        "ParentId": "INTEGER",
        "CreationDate": "DATETIME",
        "Score": "INTEGER",
        "Body": "TEXT",
        "OwnerUserId": "INTEGER",
        "OwnerDisplayName": "TEXT",
    }

    # SQL statements
    CREATE_TABLE = "CREATE TABLE IF NOT EXISTS {table} ({fields})"
    INSERT_ROW = "INSERT INTO {table} ({columns}) VALUES ({values})"

    def __call__(self, infile, dbfile):
        """
        Converts xml infile to SQLite dbfile. An existing dbfile is deleted first.

        Args:
            infile: input xml file
            dbfile: output sqlite file
        """

        print(f"Converting {infile} to {dbfile}")

        # Delete existing file
        if os.path.exists(dbfile):
            os.remove(dbfile)

        # Create new database
        db = sqlite3.connect(dbfile)

        try:
            # Create database tables if necessary
            self.create(db, XML2DB.QUESTIONS, "questions")
            self.create(db, XML2DB.ANSWERS, "answers")

            count = 0
            with open(infile, encoding="utf-8") as xml:
                context, root = self.xmlstream(xml)

                for event, row in context:
                    if event == "end":
                        # Only count elements that were actually inserted, the closing
                        # root element also produces an end event
                        if self.insert(db, row):
                            count += 1
                            if count % 10000 == 0:
                                print(f"Inserted {count} rows")

                        # Free memory
                        root.clear()

            print(f"Total rows inserted: {count}")

            # Commit changes
            db.commit()
        finally:
            # Always release the database file handle
            db.close()

    def create(self, db, table, name):
        """
        Creates a SQLite table. Failures are printed and not raised.

        Args:
            db: database connection
            table: table schema
            name: table name
        """

        columns = [f"{column} {ctype}" for column, ctype in table.items()]
        create = XML2DB.CREATE_TABLE.format(table=name, fields=", ".join(columns))

        # pylint: disable=W0703
        try:
            db.execute(create)
        except Exception as e:
            print(create)

            # Format the exception, str + Exception concatenation raises a TypeError
            print(f"Failed to create table: {e}")

    def xmlstream(self, xml):
        """
        Creates a xml stream for iterative parsing.

        Args:
            xml: input file

        Returns:
            context, root
        """

        # Parse the tree
        context = etree.iterparse(xml, events=("start", "end"))

        # turn it into an iterator
        context = iter(context)

        # get the root element, it is the element of the first event
        _, root = next(context)

        return context, root

    def insert(self, db, row):
        """
        Inserts row into database. Elements without a PostTypeId attribute are skipped.

        Args:
            db: database connection
            row: xml element

        Returns:
            True if a row was inserted, False otherwise
        """

        if "PostTypeId" not in row.attrib:
            return False

        # PostType="1" - Question, PostType="2" - Answer
        question = row.attrib["PostTypeId"] == "1"
        table = XML2DB.QUESTIONS if question else XML2DB.ANSWERS
        name = "questions" if question else "answers"

        # Build insert prepared statement
        columns = list(table)
        insert = XML2DB.INSERT_ROW.format(
            table=name,
            columns=", ".join(columns),
            values=", ".join("?" * len(columns)),
        )

        # Execute insert statement
        db.execute(insert, self.values(table, row, columns))
        return True

    def values(self, table, row, columns):
        """
        Formats and converts row into database types based on table schema.

        Args:
            table: table schema
            row: xml element
            columns: column names

        Returns:
            Database schema formatted row values
        """

        values = []
        for column in columns:
            # Get column value, missing attributes are stored as None
            value = row.attrib.get(column)

            if table[column].startswith("INTEGER"):
                # Missing or empty integer values default to 0
                values.append(int(value) if value else 0)
            elif table[column] == "BOOLEAN":
                values.append(1 if value == "TRUE" else 0)
            else:
                values.append(value)

        return values
def load(self):
    """
    Creates an embeddings instance and loads the stackexchange model into it.

    Returns:
        embeddings
    """

    model = Embeddings()

    # Model location is resolved by Models
    location = Models.modelPath("stackexchange")
    model.load(location)

    return model
def search(self, embeddings, query):
    """
    Runs query against embeddings and returns the source id and source of the top 10 matches.

    Args:
        embeddings: embeddings instance
        query: query to run

    Returns:
        search results
    """

    if not embeddings.isweighted():
        # Select source id + source with standard similar clause
        return embeddings.search(
            f"select sourceid, source from txtai where similar('{query}') limit 10"
        )

    # Word vector models: search with tokens produced by the custom tokenizer
    matches = embeddings.search(Tokenizer.tokenize(query), 10)
    uids = [match["id"] for match in matches]

    # Look up source id + source for each matched id
    return [
        embeddings.search(f"select sourceid, source from txtai where id = {uid}")[0]
        for uid in uids
    ]
def test(self, embeddings, path, method):
    """
    Scores each STS sentence pair with embeddings and prints the Pearson and Spearman
    correlation against the ground truth labels.

    Args:
        embeddings: embeddings instance
        path: path to tests
        method: run method, "dev" selects the dev file, anything else the test file
    """

    # Test file path
    name = "dev" if method == "dev" else "test"
    path = os.path.join(path, "stsbenchmark", f"sts-{name}.csv")

    # Calculated scores and ground truth labels
    scores, labels = [], []

    for row in self.read(path):
        label, first, second = row[1], row[2], row[3]

        # Use custom tokenizer for word vector models
        if embeddings.isweighted():
            first = Tokenizer.tokenize(first)
            second = Tokenizer.tokenize(second)

        # Skip pairs where either side is empty
        if not (first and second):
            continue

        scores.append(embeddings.similarity(first, [second])[0][1])

        # Ground truth score normalized between 0 - 1
        labels.append(label)

    print("Pearson score =", pearsonr(scores, labels))
    print("Spearman score =", spearmanr(scores, labels))
class DB2QA:
    """
    Converts multiple staging SQLite database (questions, answers in separate tables per source) into a consolidated SQLite database
    with a single questions table.

    Only questions whose accepted answer exists and has a body are kept.
    """

    # Questions schema (column name -> SQLite column type)
    QUESTIONS = {
        "Id": "INTEGER PRIMARY KEY",
        "Source": "TEXT",
        "SourceId": "INTEGER",
        "Date": "DATETIME",
        "Tags": "TEXT",
        "Question": "TEXT",
        "QuestionUser": "TEXT",
        "Answer": "TEXT",
        "AnswerUser": "TEXT",
        "Reference": "TEXT",
    }

    # List of sources (source name -> base url used to build question references)
    SOURCES = {
        "ai": "https://ai.stackexchange.com",
        "android": "https://android.stackexchange.com",
        "apple": "https://apple.stackexchange.com",
        "arduino": "https://arduino.stackexchange.com",
        "askubuntu": "https://askubuntu.com",
        "avp": "https://avp.stackexchange.com",
        "codereview": "https://codereview.stackexchange.com",
        "cs": "https://cs.stackexchange.com",
        "datascience": "http://datascience.stackexchange.com",
        "dba": "https://dba.stackexchange.com",
        "devops": "https://devops.stackexchange.com",
        "dsp": "https://dsp.stackexchange.com",
        "raspberrypi": "https://raspberrypi.stackexchange.com",
        "reverseengineering": "https://reverseengineering.stackexchange.com",
        "scicomp": "https://scicomp.stackexchange.com",
        "security": "https://security.stackexchange.com",
        "serverfault": "https://serverfault.com",
        "stackoverflow": "https://stackoverflow.com",
        "stats": "https://stats.stackexchange.com",
        "superuser": "https://superuser.com",
        "unix": "https://unix.stackexchange.com",
        "vi": "https://vi.stackexchange.com",
        "wordpress": "https://wordpress.stackexchange.com",
    }

    # SQL statements
    CREATE_TABLE = "CREATE TABLE IF NOT EXISTS {table} ({fields})"
    INSERT_ROW = "INSERT INTO {table} ({columns}) VALUES ({values})"
    CREATE_SOURCE_INDEX = "CREATE INDEX source ON questions(Source, SourceId)"
    CREATE_TEXT_INDEX = "CREATE VIRTUAL TABLE search USING fts5(Id, Question, Tags)"
    INSERT_TEXT_ROWS = "INSERT INTO search SELECT Id, Question, Tags from questions"

    def __call__(self, databases, qafile):
        """
        Executes a run to convert a list of databases to a single consolidated questions db file.
        An existing qafile is deleted first.

        Args:
            databases: paths to input databases
            qafile: output database path
        """

        print(f"Converting {databases} to {qafile}")

        # Delete existing file
        if os.path.exists(qafile):
            os.remove(qafile)

        # Create output database
        qa = sqlite3.connect(qafile)

        try:
            # Create questions table
            self.create(qa, DB2QA.QUESTIONS, "questions")

            # Row index, used as the Id of consolidated rows
            index = 0

            for dbfile in databases:
                print("Processing " + dbfile)

                # Create source name from the database file name
                source = os.path.splitext(os.path.basename(dbfile))[0].lower()

                # Input database
                db = sqlite3.connect(dbfile)

                try:
                    cur = db.cursor()

                    cur.execute(
                        "SELECT Id, AcceptedAnswerId, OwnerUserId, OwnerDisplayName, LastActivityDate, Title, Tags FROM questions"
                    )

                    # Need to select all rows up front, find() reuses this cursor
                    for question in cur.fetchall():
                        # Find accepted answer
                        answer = self.find(question, cur)
                        if answer:
                            # Combine into single question row
                            self.insert(qa, index, source, question, answer)

                            index += 1
                            if index % 10000 == 0:
                                print(f"Inserted {index} rows")
                finally:
                    # Release the input database even if processing fails
                    db.close()

            print(f"Total rows inserted: {index}")

            # Create indices
            for statement in [
                DB2QA.CREATE_SOURCE_INDEX,
                DB2QA.CREATE_TEXT_INDEX,
                DB2QA.INSERT_TEXT_ROWS,
            ]:
                qa.execute(statement)

            # Commit changes
            qa.commit()
        finally:
            # Always close the output database
            qa.close()

    def create(self, db, table, name):
        """
        Creates a SQLite table. Failures are printed and not raised.

        Args:
            db: database connection
            table: table schema
            name: table name
        """

        columns = [f"{column} {ctype}" for column, ctype in table.items()]
        create = DB2QA.CREATE_TABLE.format(table=name, fields=", ".join(columns))

        # pylint: disable=W0703
        try:
            db.execute(create)
        except Exception as e:
            print(create)

            # Format the exception, str + Exception concatenation raises a TypeError
            print(f"Failed to create table: {e}")

    def find(self, question, cur):
        """
        Finds a corresponding answer for the input question.

        Args:
            question: input question row
            cur: database cursor

        Returns:
            Answer row if found, None otherwise
        """

        # Query for accepted answer, question[1] is AcceptedAnswerId
        cur.execute(
            "SELECT Body, OwnerUserId, OwnerDisplayName from answers where Id = ?",
            [question[1]],
        )
        answer = cur.fetchone()

        # Only answers with a message body are used
        if answer and answer[0]:
            return answer

        return None

    def insert(self, db, index, source, question, answer):
        """
        Builds and inserts a consolidated question.

        Args:
            db: database connection
            index: row index
            source: question source
            question: question row
            answer: answer row
        """

        table = DB2QA.QUESTIONS

        # Build insert prepared statement
        columns = list(table)
        insert = DB2QA.INSERT_ROW.format(
            table="questions",
            columns=", ".join(columns),
            values=", ".join("?" * len(columns)),
        )

        # Build row of insert values
        row = self.build(index, source, question, answer)

        # Execute insert statement
        db.execute(insert, self.values(table, row, columns))

    def build(self, index, source, question, answer):
        """
        Builds a consolidated question row.

        Args:
            index: row index
            source: question source, must be a key of SOURCES
            question: question row
            answer: answer row

        Returns:
            row tuple
        """

        # Parse tags into list of tags, questions without tags get an empty list
        tags = re.sub(r"[<>]", " ", question[6]).split() if question[6] else []

        # Get user display name, fallback to user id
        quser = question[3] if question[3] else str(question[2])
        auser = answer[2] if answer[2] else str(answer[1])

        # Create URL reference
        reference = f"{DB2QA.SOURCES[source]}/questions/{question[0]}"

        # Id, Source, SourceId, Date, Tags, Question, QuestionUser, Answer, AnswerUser, Reference
        return (
            index,
            source,
            question[0],
            question[4],
            " ".join(tags),
            question[5],
            quser,
            answer[0],
            auser,
            reference,
        )

    def values(self, table, row, columns):
        """
        Formats and converts row into database types based on table schema.

        Args:
            table: table schema
            row: row tuple
            columns: column names

        Returns:
            Database schema formatted row values
        """

        values = []
        for value, column in zip(row, columns):
            if table[column].startswith("INTEGER"):
                # Missing or empty integer values default to 0
                values.append(int(value) if value else 0)
            elif table[column] == "BOOLEAN":
                values.append(1 if value == "TRUE" else 0)
            else:
                values.append(value)

        return values
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | Copyright 2020- NeuML LLC 179 | 180 | Licensed under the Apache License, Version 2.0 (the "License"); 181 | you may not use this file except in compliance with the License. 
182 | You may obtain a copy of the License at 183 | 184 | http://www.apache.org/licenses/LICENSE-2.0 185 | 186 | Unless required by applicable law or agreed to in writing, software 187 | distributed under the License is distributed on an "AS IS" BASIS, 188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 189 | See the License for the specific language governing permissions and 190 | limitations under the License. -------------------------------------------------------------------------------- /test/stackexchange/query.txt: -------------------------------------------------------------------------------- 1 | android get screen dimensions|4743116|stackoverflow|Get screen width and height in Android|android 2 | android unique device id|2785485|stackoverflow|Is there a unique Android device ID?|android uniqueidentifier 3 | apt list all installed packages|17823|askubuntu|How to list all installed packages|apt package-management 4 | assembly pop stack|4584089|stackoverflow|What is the function of the push / pop instructions used on registers in x86 assembly?|assembly x86 stack terminology 5 | bash create uuid|103359|serverfault|How to create a UUID in bash?|bash uuid 6 | bash current time milliseconds|151109|serverfault|How do I get the current Unix time in milliseconds in Bash?|bash unix time 7 | bash list files|7265272|stackoverflow|How to list files in directory using bash?|bash 8 | bash sort du by output size|62411|serverfault|How can I sort du -h output by size|linux bash du gnu 9 | c cast malloc|605845|stackoverflow|Do I cast the result of malloc?|c malloc casting 10 | c++ convert string lower case|313970|stackoverflow|How to convert std::string to lower case?|c++ string c++-standard-library tolower 11 | centos print current version|54987|unix|How to determine CentOS version?|centos version 12 | centos upgrade 5 to 6|309053|superuser|How to upgrade CentOS 5.6 to 6.0?|linux centos 13 | centos when does cron.daily run|135906|serverfault|When 
does `cron.daily` run?|linux centos redhat cron 14 | c++ read file into ascii string|2602013|stackoverflow|Read whole ASCII file into C++ std::string|c++ string caching file-io standard-library 15 | c++ sleep|4184468|stackoverflow|Sleep for milliseconds|c++ linux sleep 16 | cuda gcc version|6622454|stackoverflow|CUDA incompatible with my gcc version|gcc cuda debian 17 | cuda get version|9727688|stackoverflow|How to get the cuda version?|cuda 18 | curl show http status|272265|superuser|Getting curl to output HTTP status code?|http curl status 19 | database inner join vs outer join|38549|stackoverflow|What is the difference between "INNER JOIN" and "OUTER JOIN"?|sql database join inner-join outer-join 20 | docker vs virtual machine|16047306|stackoverflow|How is Docker different from a virtual machine?|docker containers virtual-machine virtualization 21 | dpkg install deb file|159094|unix|How to install a deb file, by dpkg -i or by apt?|software-installation apt dpkg deb 22 | ec2 delete terminated instance|393417|serverfault|Delete Amazon EC2 terminated instance|amazon-ec2 23 | fedora print current version|540603|stackoverflow|How can I find the version of the Fedora I use?|linux fedora 24 | fedora test fstab|174181|serverfault|How do you validate fstab without rebooting?|fedora mount fstab 25 | gcc get assembly output|137038|stackoverflow|How do you get assembler output from C/C++ source in gcc?|c++ c debugging gcc assembly 26 | gcc vs g++|172587|stackoverflow|What is the difference between g++ and gcc?|c++ gcc g++ 27 | git add empty directory|115983|stackoverflow|How can I add an empty directory to a Git repository?|git directory git-add 28 | git discard unstaged changes|52704|stackoverflow|How do I discard unstaged changes in Git?|git version-control 29 | git pull vs fetch|292357|stackoverflow|What is the difference between 'git pull' and 'git fetch'?|git version-control git-pull git-fetch 30 | git undo most recent local commit|927358|stackoverflow|How do I undo 
the most recent local commits in Git?|git version-control git-commit undo pre-commit 31 | java fix nullpointerexception|218384|stackoverflow|What is a NullPointerException, and how do I fix it?|java nullpointerexception 32 | java get random number|5887709|stackoverflow|Getting random numbers in Java|java random 33 | java hashmap vs hashtable|40471|stackoverflow|What are the differences between a HashMap and a Hashtable in Java?|java collections hashmap hashtable 34 | java heap space error|37335|stackoverflow|How to deal with "java.lang.OutOfMemoryError: Java heap space" error?|java java-ee jvm out-of-memory heap-memory 35 | java outofmemoryerror poi|6069847|stackoverflow|java.lang.OutOfMemoryError: Java heap space while reading excel with Apache POI|java apache-poi 36 | java print date|26717733|stackoverflow|print current date in java|java time 37 | java round decimal|11701399|stackoverflow|round up to 2 decimal places in java?|java 38 | javascript check if string contains substring|1789945|stackoverflow|How to check whether a string contains a substring in JavaScript?|javascript string substring string-matching 39 | javascript create timestamp|221294|stackoverflow|How do you get a timestamp in JavaScript?|javascript date datetime timestamp unix-timestamp 40 | javascript encode url|332872|stackoverflow|Encode URL in JavaScript?|javascript url encoding 41 | javascript for-each in array|9329446|stackoverflow|For-each over an array in JavaScript?|javascript arrays loops foreach iteration 42 | javascript generate uuid|105034|stackoverflow|Create GUID / UUID in JavaScript?|javascript guid uuid 43 | javascript html5 local storage add object|2010892|stackoverflow|How to store objects in HTML5 localStorage/sessionStorage|javascript html local-storage 44 | javascript include file in another file|950087|stackoverflow|How do I include a JavaScript file in another JavaScript file?|javascript file import include 45 | javascript redirect to another 
website|503093|stackoverflow|How do I redirect to another webpage?|javascript jquery redirect 46 | javascript remove element from array|5767325|stackoverflow|How can I remove a specific item from an array?|javascript arrays 47 | javascript replace all occurrences in string|1144783|stackoverflow|How do I replace all occurrences of a string in JavaScript?|javascript string replace 48 | javascript validate email address|46155|stackoverflow|How can I validate an email address in JavaScript?|javascrit html regex email-validation 49 | java array sort|8938235|stackoverflow|Sort an array in Java|java arrays 50 | java split string|3481828|stackoverflow|How do I split a string in Java?|java string split 51 | java stringbuilder vs stringbuffer|355089|stackoverflow|Difference between StringBuilder and StringBuffer|java stringbuilder stringbuffer 52 | java string to int|5585779|stackoverflow|How do I convert a String to an int in Java?|java string int type-conversion 53 | java wait vs sleep|1036754|stackoverflow|Difference between "wait()" and "sleep()" in Java|java multithreading wait sleep java-threads 54 | java ways to iterate list|18410035|stackoverflow|Ways to iterate over a list in Java|java loops collections iteration 55 | java write file|2885173|stackoverflow|How do I create a file and write to it in Java?|java file-io 56 | json comments|244777|stackoverflow|Can comments be used in JSON?|json comments 57 | json proper content type|477816|stackoverflow|Which JSON content type do I use?|json mime-types content-type 58 | linux check if port open|309052|serverfault|Check if port is open or closed on a Linux server?|linux port telnet 59 | linux find files with specific text|16956810|stackoverflow|How to find all files containing specific text (string) on Linux?|linux text grep directory find 60 | ping specific port|309357|serverfault|Ping a Specific Port|ping 61 | psql list all databases and tables|1285|dba|How do I list all databases and tables using psql?|postgresql tools 
psql command-line 62 | python add new keys to dictionary|1024847|stackoverflow|How can I add new keys to a dictionary?|python dictionary lookup 63 | python aes decryption|12524994|stackoverflow|Encrypt & Decrypt using PyCrypto AES 256|python encryption padding pycrypto initialization-vector 64 | python call external script|9318581|stackoverflow|Python - how do I call external python programs?|python call external 65 | python check if list empty|53513|stackoverflow|How do I check if a list is empty?|python list 66 | python clone list|2612802|stackoverflow|How do I clone a list so that it doesn't change unexpectedly after assignment?|python list reference copy clone 67 | python contains string|3437059|stackoverflow|Does Python have a string 'contains' substring method?|python string substring contains 68 | python convert int to bytes|21017698|stackoverflow|Converting int to bytes in Python 3|python python-3.x 69 | python google sheets api|56084171|stackoverflow|Accessing Google Sheets Api with Python|stackoverflow python google-sheets-api 70 | python opencv draw rectangle|23720875|stackoverflow|How to draw a rectangle around a region of interest in python|python opencv computer-vision draw 71 | python parse csv|12296585|stackoverflow|Python Parse CSV Correctly|python parsing csv 72 | python parse float|379906|stackoverflow|How do I parse a string to a float or int?|python parsing floating-point type-conversion integer 73 | python parse json|7771011|stackoverflow|How to parse data in JSON format?|python json parsing 74 | python pdf extract text|15583535|stackoverflow|How to extract text from a PDF file in Python?|python pypdf 75 | python read wav file|2060628|stackoverflow|Reading *.wav files in Python|python audio wav wave 76 | python sort dictionary by value|613183|stackoverflow|How do I sort a dictionary by value?|python sorting dictionary 77 | python staticmethod vs classmethod|136097|stackoverflow|Difference between @staticmethod and @classmethod|python oop 
methods python-decorators 78 | python utc to localtime|4770297|stackoverflow|Convert UTC datetime string to local datetime|python datetime utc localtime 79 | python yield generator|41136410|stackoverflow|Python `yield from`, or return a generator?|python function return generator 80 | restful programming|671118|stackoverflow|What is RESTful programming?|rest http architecture definition 81 | rest put vs post|630453|stackoverflow|What is the difference between POST and PUT in HTTP?|http rest post put 82 | ruby shell command|2232|stackoverflow|How to call shell commands from Ruby|ruby shell interop 83 | ruby switch statement|948135|stackoverflow|How to write a switch statement in Ruby|ruby switch-statement conditional-statements 84 | scp recursive copy directories|264595|serverfault|Can scp copy directories recursively?|linux scp 85 | server room identify burning smell|496139|serverfault|Something is burning in the server room; how can I quickly identify what it is?|hardware 86 | sftp port|74176|serverfault|What port does SFTP use?|sftp 87 | ssh automate script with password|241588|serverfault|How to automate SSH login with password?|ssh password automation 88 | ssh diff|59140|serverfault|How do diff over ssh?|diff 89 | svd for pca|134282|stats|Relationship between SVD and PCA. 
How to use SVD to perform PCA?|pca dimensionality-reduction matrix svd faq 90 | tensorflow install ubuntu 14.04|41875915|stackoverflow|Install tensorflow on Ubuntu 14.04|python python-2.7 ubuntu tensorflow pip 91 | ubuntu show current version|12493|askubuntu|How can I find the version of Ubuntu that is installed?|versions release-management 92 | ubuntu install xfce|116602|askubuntu|How to install Xfce desktop environment?|xfce 93 | unzip zip file terminal|86849|askubuntu|How to unzip a zip file from the Terminal?|command-line zip 94 | vi copy text clipboard|84|vi|How can I copy text to the system clipboard from Vim?|cut-copy-paste os-clipboard 95 | windows check cpu temperature|395434|superuser|How can I check the temperature of my CPU in Windows?|windows cpu temperature 96 | windows compare pdf files|46123|superuser|How to compare the differences between two PDF files on Windows?|windows pdf file-comparison 97 | windows ctrl+alt+delete remote desktop|57222|serverfault|How to send ctrl+alt+del using Remote Desktop?|windows remote-desktop 98 | windows list running processes command line|914782|superuser|How do you list all processes on the command line in Windows?|windows command-line 99 | windows sudo|9652720|stackoverflow|How to run 'sudo' command in windows|windows 100 | windows wireless keyboard toaster|792607|superuser|Why does Windows think that my wireless keyboard is a toaster?|windows-7 device-manager 101 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 |

6 | Semantic search for developers 7 |

8 | 9 |

10 | 11 | Version 12 | 13 | 14 | GitHub last commit 15 | 16 | 17 | GitHub issues 18 | 19 | 20 | Build Status 21 | 22 | 23 | Coverage Status 24 | 25 |

26 | 27 | ------------------------------------------------------------------------------------------------------------------------------------------------------- 28 | 29 | codequestion is a semantic search application for developer questions. 30 | 31 | ![demo](https://raw.githubusercontent.com/neuml/codequestion/master/demo.gif) 32 | 33 | Developers typically have a web browser window open while they work and run web searches as questions arise. With codequestion, this can be done from a local context. This application executes similarity queries to find similar questions to the input query. 34 | 35 | The default model for codequestion is built off the [Stack Exchange Dumps on archive.org](https://archive.org/details/stackexchange). Once a model is installed, codequestion runs locally, no network connection is required. 36 | 37 | ![architecture](https://raw.githubusercontent.com/neuml/codequestion/master/images/architecture.png#gh-light-mode-only) 38 | ![architecture](https://raw.githubusercontent.com/neuml/codequestion/master/images/architecture-dark.png#gh-dark-mode-only) 39 | 40 | codequestion is built with Python 3.8+ and [txtai](https://github.com/neuml/txtai). 41 | 42 | ## Installation 43 | 44 | The easiest way to install is via pip and PyPI 45 | 46 | ``` 47 | pip install codequestion 48 | ``` 49 | 50 | Python 3.8+ is supported. Using a Python [virtual environment](https://docs.python.org/3/library/venv.html) is recommended. 51 | 52 | codequestion can also be installed directly from GitHub to access the latest, unreleased features. 53 | 54 | ``` 55 | pip install git+https://github.com/neuml/codequestion 56 | ``` 57 | 58 | See [this link](https://neuml.github.io/txtai/install/#environment-specific-prerequisites) for environment-specific troubleshooting. 59 | 60 | ## Download a model 61 | 62 | Once codequestion is installed, a model needs to be downloaded. 
63 | 64 | ``` 65 | python -m codequestion.download 66 | ``` 67 | 68 | The model will be stored in ~/.codequestion/ 69 | 70 | The model can also be manually installed if the machine doesn't have direct internet access. The default model is pulled from the [GitHub release page](https://github.com/neuml/codequestion/releases) 71 | 72 | ``` 73 | unzip cqmodel.zip -d ~/.codequestion 74 | ``` 75 | 76 | ## Search 77 | 78 | Start up a codequestion shell to get started. 79 | 80 | ``` 81 | codequestion 82 | ``` 83 | 84 | A prompt will appear. Queries can be typed into the console. Type `help` to see all available commands. 85 | 86 | ![demo](https://raw.githubusercontent.com/neuml/codequestion/master/demo.gif) 87 | 88 | ## Topics 89 | 90 | The latest release integrates [txtai 5.0](https://medium.com/neuml/whats-new-in-txtai-5-0-e5c75a13b101), which has support for semantic graphs. 91 | 92 | Semantic graphs add support for topic modeling and path traversal. Topics organize questions into groups with similar concepts. Path traversal uses the semantic graph to show how two potentially disparate entries are connected. An example covering both topic and path traversal is shown below. 93 | 94 | ![topics](https://raw.githubusercontent.com/neuml/codequestion/master/images/topics.gif) 95 | 96 | ## VS Code 97 | 98 | A codequestion prompt can be started within Visual Studio Code. This enables asking coding questions right from your IDE. 99 | 100 | Run `` Ctrl+` `` to open a new terminal then type `codequestion`. 101 | 102 | ![vscode](https://raw.githubusercontent.com/neuml/codequestion/master/images/vscode.png) 103 | 104 | ## API service 105 | 106 | codequestion builds a standard txtai embeddings index. As such, it supports hosting the index via a [txtai API service](https://neuml.github.io/txtai/api). 
107 | 108 | Running the following: 109 | 110 | _app.yml_ 111 | ```yaml 112 | path: /home/user/.codequestion/models/stackexchange/ 113 | embeddings: 114 | ``` 115 | 116 | ``` 117 | # Install API extra 118 | pip install txtai[api] 119 | 120 | # Start API 121 | CONFIG=app.yml uvicorn "txtai.api:app" 122 | 123 | # Test API 124 | curl "http://127.0.0.1:8000/search?query=python+query+sqlite&limit=1" 125 | ``` 126 | 127 | Outputs: 128 | ```json 129 | [{ 130 | "id":"616429", 131 | "text":"How to fetch data from sqlite using python? stackoverflow python sqlite", 132 | "score":0.8401689529418945 133 | }] 134 | ``` 135 | 136 | Additional metadata fields can be pulled back with SQL statements. 137 | 138 | ``` 139 | curl \ 140 | --get \ 141 | --data-urlencode "query=select id, date, tags, question, score from txtai where similar('python query sqlite')" \ 142 | --data-urlencode "limit=1" \ 143 | "http://127.0.0.1:8000/search" 144 | ``` 145 | 146 | ```json 147 | [{ 148 | "id":"616429", 149 | "date":"2022-05-23T10:45:40.397", 150 | "tags":"python sqlite", 151 | "question":"How to fetch data from sqlite using python?", 152 | "score":0.8401689529418945 153 | }] 154 | ``` 155 | 156 | ## Tech overview 157 | The following is an overview covering how this project works. 158 | 159 | ### Process the raw data dumps 160 | The raw 7z XML dumps from Stack Exchange are processed through a series of steps (see [building a model](#build-a-model)). Only highly scored questions with accepted answers are retrieved for storage in the model. Questions and answers are consolidated into a single SQLite file called questions.db. The schema for questions.db is below. 161 | 162 | *questions.db schema* 163 | 164 | Id INTEGER PRIMARY KEY 165 | Source TEXT 166 | SourceId INTEGER 167 | Date DATETIME 168 | Tags TEXT 169 | Question TEXT 170 | QuestionUser TEXT 171 | Answer TEXT 172 | AnswerUser TEXT 173 | Reference TEXT 174 | 175 | ### Index 176 | codequestion builds a txtai embeddings index for questions.db. 
Each question in the questions.db schema is vectorized with a sentence-transformers model. Once questions.db is converted to a collection of sentence embeddings, the embeddings are normalized and stored in Faiss, which enables fast similarity searches. 177 | 178 | ### Query 179 | codequestion tokenizes each query using the same method as during indexing. Those tokens are used to build a sentence embedding. That embedding is queried against the Faiss index to find the most similar questions. 180 | 181 | ## Build a model 182 | The following steps show how to build a codequestion model using Stack Exchange archives. 183 | 184 | _This is not necessary if using the default model from the [GitHub release page](https://github.com/neuml/codequestion/releases)_ 185 | 186 | 1.) Download files from Stack Exchange: https://archive.org/details/stackexchange 187 | 188 | 2.) Place selected files into a directory structure as shown below (the current process requires all of these files). 189 | 190 | - stackexchange/ai/ai.stackexchange.com.7z 191 | - stackexchange/android/android.stackexchange.com.7z 192 | - stackexchange/apple/apple.stackexchange.com.7z 193 | - stackexchange/arduino/arduino.stackexchange.com.7z 194 | - stackexchange/askubuntu/askubuntu.com.7z 195 | - stackexchange/avp/avp.stackexchange.com.7z 196 | - stackexchange/codereview/codereview.stackexchange.com.7z 197 | - stackexchange/cs/cs.stackexchange.com.7z 198 | - stackexchange/datascience/datascience.stackexchange.com.7z 199 | - stackexchange/dba/dba.stackexchange.com.7z 200 | - stackexchange/devops/devops.stackexchange.com.7z 201 | - stackexchange/dsp/dsp.stackexchange.com.7z 202 | - stackexchange/raspberrypi/raspberrypi.stackexchange.com.7z 203 | - stackexchange/reverseengineering/reverseengineering.stackexchange.com.7z 204 | - stackexchange/scicomp/scicomp.stackexchange.com.7z 205 | - stackexchange/security/security.stackexchange.com.7z 206 | - stackexchange/serverfault/serverfault.com.7z 207 | - 
stackexchange/stackoverflow/stackoverflow.com-Posts.7z 208 | - stackexchange/stats/stats.stackexchange.com.7z 209 | - stackexchange/superuser/superuser.com.7z 210 | - stackexchange/unix/unix.stackexchange.com.7z 211 | - stackexchange/vi/vi.stackexchange.com.7z 212 | - stackexchange/wordpress/wordpress.stackexchange.com.7z 213 | 214 | 3.) Run the ETL process 215 | 216 | ``` 217 | python -m codequestion.etl.stackexchange.execute stackexchange 218 | ``` 219 | 220 | This will create the file stackexchange/questions.db 221 | 222 | 4.) __OPTIONAL:__ Build word vectors - only necessary if using a word vectors model. If using word vector models, make sure to run `pip install txtai[similarity]` 223 | 224 | ``` 225 | python -m codequestion.vectors stackexchange/questions.db 226 | ``` 227 | 228 | This will create the file ~/.codequestion/vectors/stackexchange-300d.magnitude 229 | 230 | 5.) Build embeddings index 231 | 232 | ``` 233 | python -m codequestion.index index.yml stackexchange/questions.db 234 | ``` 235 | 236 | The [default index.yml](https://raw.githubusercontent.com/neuml/codequestion/master/config/index.yml) file is found on GitHub. Settings can be changed to customize how the index is built. 237 | 238 | After this step, the index is created and all necessary files are ready to query. 239 | 240 | ## Model accuracy 241 | The following sections show test results for codequestion v2 and codequestion v1 using the latest Stack Exchange dumps. Version 2 uses a sentence-transformers model. Version 1 uses a word vectors model with BM25 weighting. BM25 and TF-IDF are shown to establish a baseline score. 242 | 243 | **StackExchange Query** 244 | 245 | Models are scored using [Mean Reciprocal Rank (MRR)](https://en.wikipedia.org/wiki/Mean_reciprocal_rank). 
246 | 247 | | Model | MRR | 248 | | ------------------- | :---: | 249 | | all-MiniLM-L6-v2 | 85.0 | 250 | | SE 300d - BM25 | 77.1 | 251 | | BM25 | 67.7 | 252 | | TF-IDF | 61.7 | 253 | 254 | **STS Benchmark** 255 | 256 | Models are scored using [Pearson Correlation](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). Note that the word vectors model is only trained on Stack Exchange data, so it isn't expected to generalize as well against the STS dataset. 257 | 258 | | Model | Supervision | Dev | Test | 259 | | ---------------- | :-----------: | :---: | :---: | 260 | | all-MiniLM-L6-v2 | Train | 87.0 | 82.7 | 261 | | SE 300d - BM25 | Train | 74.0 | 67.4 | 262 | 263 | ## Tests 264 | To reproduce the tests above, run the following. Substitute $TEST_PATH with any local path. 265 | 266 | mkdir -p $TEST_PATH 267 | wget https://raw.githubusercontent.com/neuml/codequestion/master/test/stackexchange/query.txt -P $TEST_PATH/stackexchange 268 | wget http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz 269 | tar -C $TEST_PATH -xvzf Stsbenchmark.tar.gz 270 | python -m codequestion.evaluate -s test -p $TEST_PATH 271 | 272 | ## Further reading 273 | 274 | - [Find answers with codequestion 2.0](https://medium.com/neuml/find-answers-with-codequestion-2-0-50b2cfd8c8fe) 275 | - [Building a sentence embedding index with fastText and BM25 (codequestion 1.0)](https://towardsdatascience.com/building-a-sentence-embedding-index-with-fasttext-and-bm25-f07e7148d240) 276 | -------------------------------------------------------------------------------- /images/architecture.excalidraw: -------------------------------------------------------------------------------- 1 | { 2 | "type": "excalidraw", 3 | "version": 2, 4 | "source": "https://excalidraw.com", 5 | "elements": [ 6 | { 7 | "type": "text", 8 | "version": 780, 9 | "versionNonce": 881355380, 10 | "isDeleted": false, 11 | "id": "Buic2Lx427wuSIW8P_Rw5", 12 | "fillStyle": "hachure", 13 | "strokeWidth": 1, 14 | 
"strokeStyle": "solid", 15 | "roughness": 1, 16 | "opacity": 100, 17 | "angle": 0, 18 | "x": 736, 19 | "y": 179, 20 | "strokeColor": "#000000", 21 | "backgroundColor": "#228be6", 22 | "width": 658, 23 | "height": 46, 24 | "seed": 373648901, 25 | "groupIds": [], 26 | "roundness": null, 27 | "boundElements": [], 28 | "updated": 1674310362127, 29 | "link": null, 30 | "locked": false, 31 | "fontSize": 36, 32 | "fontFamily": 1, 33 | "text": "Semantic search for coding questions", 34 | "baseline": 32, 35 | "textAlign": "left", 36 | "verticalAlign": "top", 37 | "containerId": null, 38 | "originalText": "Semantic search for coding questions" 39 | }, 40 | { 41 | "type": "rectangle", 42 | "version": 2366, 43 | "versionNonce": 106997452, 44 | "isDeleted": false, 45 | "id": "U2NgEIEiFpAlwmv5Xnyzr", 46 | "fillStyle": "hachure", 47 | "strokeWidth": 1, 48 | "strokeStyle": "solid", 49 | "roughness": 1, 50 | "opacity": 40, 51 | "angle": 0, 52 | "x": 536.6352719532887, 53 | "y": 425.4438888888891, 54 | "strokeColor": "#000000", 55 | "backgroundColor": "#000000", 56 | "width": 1018.3567583077031, 57 | "height": 410.1764927948917, 58 | "seed": 1946478225, 59 | "groupIds": [], 60 | "roundness": null, 61 | "boundElements": [], 62 | "updated": 1674310362127, 63 | "link": "", 64 | "locked": false 65 | }, 66 | { 67 | "type": "rectangle", 68 | "version": 1576, 69 | "versionNonce": 175832052, 70 | "isDeleted": false, 71 | "id": "UO6MS3wSDu7yg2421__LI", 72 | "fillStyle": "hachure", 73 | "strokeWidth": 1, 74 | "strokeStyle": "solid", 75 | "roughness": 1, 76 | "opacity": 100, 77 | "angle": 0, 78 | "x": 934.1111111111111, 79 | "y": 267, 80 | "strokeColor": "#ffeb3b", 81 | "backgroundColor": "#ffeb3b", 82 | "width": 214, 83 | "height": 49, 84 | "seed": 1629565989, 85 | "groupIds": [ 86 | "3sURMvhuRfR0M-Q3VRPbg" 87 | ], 88 | "roundness": null, 89 | "boundElements": [ 90 | { 91 | "type": "text", 92 | "id": "8sp7H8ijWBlh6aMgZ0XTP" 93 | }, 94 | { 95 | "id": "Qzp41i_jzQIBlAB_qFKFH", 96 | "type": 
"arrow" 97 | }, 98 | { 99 | "id": "SJ0F0Y81z9hir5qQWAJjk", 100 | "type": "arrow" 101 | } 102 | ], 103 | "updated": 1674310362127, 104 | "link": null, 105 | "locked": false 106 | }, 107 | { 108 | "type": "rectangle", 109 | "version": 2351, 110 | "versionNonce": 733605196, 111 | "isDeleted": false, 112 | "id": "qYd3q0Vjks7VOHUC9RR51", 113 | "fillStyle": "hachure", 114 | "strokeWidth": 1, 115 | "strokeStyle": "solid", 116 | "roughness": 1, 117 | "opacity": 100, 118 | "angle": 0, 119 | "x": 550.1111111111111, 120 | "y": 267.5, 121 | "strokeColor": "#03a9f4", 122 | "backgroundColor": "#03a9f4", 123 | "width": 219, 124 | "height": 52, 125 | "seed": 1441952427, 126 | "groupIds": [ 127 | "3sURMvhuRfR0M-Q3VRPbg" 128 | ], 129 | "roundness": null, 130 | "boundElements": [ 131 | { 132 | "type": "text", 133 | "id": "WPeWn6N4rCHf0jY16N9Ge" 134 | }, 135 | { 136 | "id": "Qzp41i_jzQIBlAB_qFKFH", 137 | "type": "arrow" 138 | } 139 | ], 140 | "updated": 1674310362127, 141 | "link": null, 142 | "locked": false 143 | }, 144 | { 145 | "type": "text", 146 | "version": 2088, 147 | "versionNonce": 1998544244, 148 | "isDeleted": false, 149 | "id": "WPeWn6N4rCHf0jY16N9Ge", 150 | "fillStyle": "hachure", 151 | "strokeWidth": 1, 152 | "strokeStyle": "solid", 153 | "roughness": 1, 154 | "opacity": 100, 155 | "angle": 0, 156 | "x": 629.6111111111111, 157 | "y": 274, 158 | "strokeColor": "#000", 159 | "backgroundColor": "#fa5252", 160 | "width": 60, 161 | "height": 39, 162 | "seed": 870516459, 163 | "groupIds": [ 164 | "3sURMvhuRfR0M-Q3VRPbg" 165 | ], 166 | "roundness": null, 167 | "boundElements": [], 168 | "updated": 1674310362127, 169 | "link": null, 170 | "locked": false, 171 | "fontSize": 28, 172 | "fontFamily": 1, 173 | "text": "ETL", 174 | "baseline": 27, 175 | "textAlign": "center", 176 | "verticalAlign": "middle", 177 | "containerId": "qYd3q0Vjks7VOHUC9RR51", 178 | "originalText": "ETL" 179 | }, 180 | { 181 | "type": "rectangle", 182 | "version": 1785, 183 | "versionNonce": 321283020, 184 
| "isDeleted": false, 185 | "id": "5VuUdI_BsJ5pyE1nTqJUI", 186 | "fillStyle": "hachure", 187 | "strokeWidth": 1, 188 | "strokeStyle": "solid", 189 | "roughness": 1, 190 | "opacity": 100, 191 | "angle": 0, 192 | "x": 1333.111111111111, 193 | "y": 268, 194 | "strokeColor": "#00e676", 195 | "backgroundColor": "#00e676", 196 | "width": 218, 197 | "height": 49, 198 | "seed": 1044404613, 199 | "groupIds": [ 200 | "3sURMvhuRfR0M-Q3VRPbg" 201 | ], 202 | "roundness": null, 203 | "boundElements": [ 204 | { 205 | "id": "bJJ9SGsJsvT071qBBH0w5", 206 | "type": "text" 207 | }, 208 | { 209 | "id": "SJ0F0Y81z9hir5qQWAJjk", 210 | "type": "arrow" 211 | } 212 | ], 213 | "updated": 1674310362127, 214 | "link": null, 215 | "locked": false 216 | }, 217 | { 218 | "type": "text", 219 | "version": 1985, 220 | "versionNonce": 1673236212, 221 | "isDeleted": false, 222 | "id": "bJJ9SGsJsvT071qBBH0w5", 223 | "fillStyle": "hachure", 224 | "strokeWidth": 1, 225 | "strokeStyle": "solid", 226 | "roughness": 1, 227 | "opacity": 100, 228 | "angle": 0, 229 | "x": 1338.111111111111, 230 | "y": 274.5, 231 | "strokeColor": "#000", 232 | "backgroundColor": "#fa5252", 233 | "width": 208, 234 | "height": 36, 235 | "seed": 128953675, 236 | "groupIds": [ 237 | "3sURMvhuRfR0M-Q3VRPbg" 238 | ], 239 | "roundness": null, 240 | "boundElements": [], 241 | "updated": 1674310362127, 242 | "link": null, 243 | "locked": false, 244 | "fontSize": 28, 245 | "fontFamily": 1, 246 | "text": "Search", 247 | "baseline": 25, 248 | "textAlign": "center", 249 | "verticalAlign": "middle", 250 | "containerId": "5VuUdI_BsJ5pyE1nTqJUI", 251 | "originalText": "Search" 252 | }, 253 | { 254 | "type": "text", 255 | "version": 1602, 256 | "versionNonce": 1925188172, 257 | "isDeleted": false, 258 | "id": "8sp7H8ijWBlh6aMgZ0XTP", 259 | "fillStyle": "hachure", 260 | "strokeWidth": 1, 261 | "strokeStyle": "solid", 262 | "roughness": 1, 263 | "opacity": 100, 264 | "angle": 0, 265 | "x": 939.1111111111111, 266 | "y": 273.5, 267 | "strokeColor": 
"#000", 268 | "backgroundColor": "transparent", 269 | "width": 204, 270 | "height": 36, 271 | "seed": 1854823263, 272 | "groupIds": [ 273 | "3sURMvhuRfR0M-Q3VRPbg" 274 | ], 275 | "roundness": null, 276 | "boundElements": [], 277 | "updated": 1674310362127, 278 | "link": null, 279 | "locked": false, 280 | "fontSize": 28, 281 | "fontFamily": 1, 282 | "text": "Index", 283 | "baseline": 25, 284 | "textAlign": "center", 285 | "verticalAlign": "middle", 286 | "containerId": "UO6MS3wSDu7yg2421__LI", 287 | "originalText": "Index" 288 | }, 289 | { 290 | "type": "text", 291 | "version": 1134, 292 | "versionNonce": 550867060, 293 | "isDeleted": false, 294 | "id": "jWJpSXHkTCzRTCA4tbAgv", 295 | "fillStyle": "hachure", 296 | "strokeWidth": 1, 297 | "strokeStyle": "solid", 298 | "roughness": 1, 299 | "opacity": 100, 300 | "angle": 0, 301 | "x": 549.6111111111111, 302 | "y": 347.30499999999995, 303 | "strokeColor": "#000", 304 | "backgroundColor": "#03a9f4", 305 | "width": 270, 306 | "height": 42, 307 | "seed": 1241563487, 308 | "groupIds": [ 309 | "3sURMvhuRfR0M-Q3VRPbg" 310 | ], 311 | "roundness": null, 312 | "boundElements": [], 313 | "updated": 1674310362127, 314 | "link": null, 315 | "locked": false, 316 | "fontSize": 16, 317 | "fontFamily": 1, 318 | "text": "- Parse and transform input\n- Filter down to \"popular\" answers", 319 | "baseline": 36, 320 | "textAlign": "left", 321 | "verticalAlign": "top", 322 | "containerId": null, 323 | "originalText": "- Parse and transform input\n- Filter down to \"popular\" answers" 324 | }, 325 | { 326 | "type": "text", 327 | "version": 1121, 328 | "versionNonce": 761323724, 329 | "isDeleted": false, 330 | "id": "qEnmXs0P_MQE8r4c4OWGh", 331 | "fillStyle": "hachure", 332 | "strokeWidth": 1, 333 | "strokeStyle": "solid", 334 | "roughness": 1, 335 | "opacity": 100, 336 | "angle": 0, 337 | "x": 932.6111111111111, 338 | "y": 346.2074999999999, 339 | "strokeColor": "#000", 340 | "backgroundColor": "#f44336", 341 | "width": 245, 342 | "height": 
42, 343 | "seed": 1038536465, 344 | "groupIds": [ 345 | "3sURMvhuRfR0M-Q3VRPbg" 346 | ], 347 | "roundness": null, 348 | "boundElements": [], 349 | "updated": 1674310362127, 350 | "link": null, 351 | "locked": false, 352 | "fontSize": 16, 353 | "fontFamily": 1, 354 | "text": "- Transform input into numbers\n- Store content with vectors", 355 | "baseline": 36, 356 | "textAlign": "left", 357 | "verticalAlign": "top", 358 | "containerId": null, 359 | "originalText": "- Transform input into numbers\n- Store content with vectors" 360 | }, 361 | { 362 | "type": "text", 363 | "version": 1185, 364 | "versionNonce": 2117296628, 365 | "isDeleted": false, 366 | "id": "1q8bzjK8lnKUZj8_A9v7D", 367 | "fillStyle": "hachure", 368 | "strokeWidth": 1, 369 | "strokeStyle": "solid", 370 | "roughness": 1, 371 | "opacity": 100, 372 | "angle": 0, 373 | "x": 1245.111111111111, 374 | "y": 349.2074999999999, 375 | "strokeColor": "#000", 376 | "backgroundColor": "#f44336", 377 | "width": 322, 378 | "height": 42, 379 | "seed": 304472945, 380 | "groupIds": [ 381 | "3sURMvhuRfR0M-Q3VRPbg" 382 | ], 383 | "roundness": null, 384 | "boundElements": [], 385 | "updated": 1674310362127, 386 | "link": null, 387 | "locked": false, 388 | "fontSize": 16, 389 | "fontFamily": 1, 390 | "text": "- Find similar content with vector search\n- Explore topics and relationships", 391 | "baseline": 36, 392 | "textAlign": "left", 393 | "verticalAlign": "top", 394 | "containerId": null, 395 | "originalText": "- Find similar content with vector search\n- Explore topics and relationships" 396 | }, 397 | { 398 | "type": "arrow", 399 | "version": 3387, 400 | "versionNonce": 983754572, 401 | "isDeleted": false, 402 | "id": "Qzp41i_jzQIBlAB_qFKFH", 403 | "fillStyle": "hachure", 404 | "strokeWidth": 1, 405 | "strokeStyle": "solid", 406 | "roughness": 1, 407 | "opacity": 100, 408 | "angle": 0, 409 | "x": 771.6111111111111, 410 | "y": 289.8470411964629, 411 | "strokeColor": "#000", 412 | "backgroundColor": "#f44336", 413 | 
"width": 158.1310513485223, 414 | "height": 0.5692601572380909, 415 | "seed": 660786897, 416 | "groupIds": [ 417 | "3sURMvhuRfR0M-Q3VRPbg" 418 | ], 419 | "roundness": { 420 | "type": 2 421 | }, 422 | "boundElements": [], 423 | "updated": 1674310362127, 424 | "link": null, 425 | "locked": false, 426 | "startBinding": { 427 | "elementId": "qYd3q0Vjks7VOHUC9RR51", 428 | "focus": -0.15367587596362536, 429 | "gap": 2.5 430 | }, 431 | "endBinding": { 432 | "elementId": "UO6MS3wSDu7yg2421__LI", 433 | "focus": 0.027437144815141, 434 | "gap": 4.3689486514776945 435 | }, 436 | "lastCommittedPoint": null, 437 | "startArrowhead": null, 438 | "endArrowhead": "arrow", 439 | "points": [ 440 | [ 441 | 0, 442 | 0 443 | ], 444 | [ 445 | 158.1310513485223, 446 | 0.5692601572380909 447 | ] 448 | ] 449 | }, 450 | { 451 | "type": "arrow", 452 | "version": 3907, 453 | "versionNonce": 1658520436, 454 | "isDeleted": false, 455 | "id": "SJ0F0Y81z9hir5qQWAJjk", 456 | "fillStyle": "hachure", 457 | "strokeWidth": 1, 458 | "strokeStyle": "solid", 459 | "roughness": 1, 460 | "opacity": 100, 461 | "angle": 0, 462 | "x": 1150.611111111111, 463 | "y": 292.6790761701911, 464 | "strokeColor": "#000", 465 | "backgroundColor": "#f44336", 466 | "width": 181.5, 467 | "height": 1.5898915058209013, 468 | "seed": 899541905, 469 | "groupIds": [ 470 | "3sURMvhuRfR0M-Q3VRPbg" 471 | ], 472 | "roundness": { 473 | "type": 2 474 | }, 475 | "boundElements": [], 476 | "updated": 1674310362127, 477 | "link": null, 478 | "locked": false, 479 | "startBinding": { 480 | "elementId": "UO6MS3wSDu7yg2421__LI", 481 | "focus": 0.08406032225724415, 482 | "gap": 2.5 483 | }, 484 | "endBinding": { 485 | "elementId": "5VuUdI_BsJ5pyE1nTqJUI", 486 | "focus": 0.09327847520504394, 487 | "gap": 1 488 | }, 489 | "lastCommittedPoint": null, 490 | "startArrowhead": null, 491 | "endArrowhead": "arrow", 492 | "points": [ 493 | [ 494 | 0, 495 | 0 496 | ], 497 | [ 498 | 181.5, 499 | -1.5898915058209013 500 | ] 501 | ] 502 | }, 503 | { 504 | 
"type": "text", 505 | "version": 121, 506 | "versionNonce": 1369352592, 507 | "isDeleted": false, 508 | "id": "0S4gs8k1Aw_EE3epHrlwi", 509 | "fillStyle": "hachure", 510 | "strokeWidth": 1, 511 | "strokeStyle": "solid", 512 | "roughness": 1, 513 | "opacity": 100, 514 | "angle": 0, 515 | "x": 558.9603174603171, 516 | "y": 440.1485317460317, 517 | "strokeColor": "#000000", 518 | "backgroundColor": "transparent", 519 | "width": 520, 520 | "height": 52, 521 | "seed": 1521084272, 522 | "groupIds": [], 523 | "roundness": null, 524 | "boundElements": [], 525 | "updated": 1665016874534, 526 | "link": null, 527 | "locked": false, 528 | "fontSize": 20, 529 | "fontFamily": 1, 530 | "text": ">>> python build pdf\n---------------------------------------------------------------", 531 | "baseline": 44, 532 | "textAlign": "left", 533 | "verticalAlign": "top", 534 | "containerId": null, 535 | "originalText": ">>> python build pdf\n---------------------------------------------------------------" 536 | }, 537 | { 538 | "type": "text", 539 | "version": 948, 540 | "versionNonce": 249568716, 541 | "isDeleted": false, 542 | "id": "hnqGO83Op144jMURaGlCf", 543 | "fillStyle": "hachure", 544 | "strokeWidth": 1, 545 | "strokeStyle": "solid", 546 | "roughness": 1, 547 | "opacity": 100, 548 | "angle": 0, 549 | "x": 567.8492063492062, 550 | "y": 489.03742063492075, 551 | "strokeColor": "#000000", 552 | "backgroundColor": "transparent", 553 | "width": 397, 554 | "height": 156, 555 | "seed": 1108820368, 556 | "groupIds": [], 557 | "roundness": null, 558 | "boundElements": [ 559 | { 560 | "id": "f3vLDOpOTtgvPlvxSLtb6", 561 | "type": "arrow" 562 | } 563 | ], 564 | "updated": 1674310362127, 565 | "link": null, 566 | "locked": false, 567 | "fontSize": 20, 568 | "fontFamily": 1, 569 | "text": "\n\nId: 219570\nLast Activity: 2016-11-22T09:07:49.983\nTags: python pdf pdf-generation\nAnswer (by 772200):", 570 | "baseline": 148, 571 | "textAlign": "left", 572 | "verticalAlign": "top", 573 | "containerId": 
null, 574 | "originalText": "\n\nId: 219570\nLast Activity: 2016-11-22T09:07:49.983\nTags: python pdf pdf-generation\nAnswer (by 772200):" 575 | }, 576 | { 577 | "type": "text", 578 | "version": 173, 579 | "versionNonce": 1895751536, 580 | "isDeleted": false, 581 | "id": "AKQSU-yPRu9JKbNgVRi9v", 582 | "fillStyle": "hachure", 583 | "strokeWidth": 1, 584 | "strokeStyle": "solid", 585 | "roughness": 1, 586 | "opacity": 100, 587 | "angle": 0, 588 | "x": 566.7380952380952, 589 | "y": 630.1485317460314, 590 | "strokeColor": "#000000", 591 | "backgroundColor": "transparent", 592 | "width": 322, 593 | "height": 130, 594 | "seed": 1679216, 595 | "groupIds": [], 596 | "roundness": null, 597 | "boundElements": [], 598 | "updated": 1665016981495, 599 | "link": null, 600 | "locked": false, 601 | "fontSize": 20.069228106611277, 602 | "fontFamily": 1, 603 | "text": "\nThe two that come to mind are:\n\n • pyPdf2\n • PDFMiner", 604 | "baseline": 122, 605 | "textAlign": "left", 606 | "verticalAlign": "top", 607 | "containerId": null, 608 | "originalText": "\nThe two that come to mind are:\n\n • pyPdf2\n • PDFMiner" 609 | }, 610 | { 611 | "type": "text", 612 | "version": 85, 613 | "versionNonce": 1520293264, 614 | "isDeleted": false, 615 | "id": "oWhhEqzBly2k2HiDjxdzq", 616 | "fillStyle": "hachure", 617 | "strokeWidth": 1, 618 | "strokeStyle": "solid", 619 | "roughness": 1, 620 | "opacity": 100, 621 | "angle": 0, 622 | "x": 568.9603174603171, 623 | "y": 779.0374206349206, 624 | "strokeColor": "#03a9f4", 625 | "backgroundColor": "transparent", 626 | "width": 550, 627 | "height": 26, 628 | "seed": 1958330768, 629 | "groupIds": [], 630 | "roundness": null, 631 | "boundElements": [], 632 | "updated": 1665016908431, 633 | "link": null, 634 | "locked": false, 635 | "fontSize": 20, 636 | "fontFamily": 1, 637 | "text": "Reference: https://stackoverflow.com/questions/6413441", 638 | "baseline": 18, 639 | "textAlign": "left", 640 | "verticalAlign": "top", 641 | "containerId": null, 642 | 
"originalText": "Reference: https://stackoverflow.com/questions/6413441" 643 | }, 644 | { 645 | "type": "text", 646 | "version": 584, 647 | "versionNonce": 1695708404, 648 | "isDeleted": false, 649 | "id": "E-1wDKL8ZmPnn0ONPMDti", 650 | "fillStyle": "hachure", 651 | "strokeWidth": 1, 652 | "strokeStyle": "solid", 653 | "roughness": 1, 654 | "opacity": 100, 655 | "angle": 0, 656 | "x": 566.738095238095, 657 | "y": 502.3707539682539, 658 | "strokeColor": "#000000", 659 | "backgroundColor": "transparent", 660 | "width": 517, 661 | "height": 26, 662 | "seed": 2028071280, 663 | "groupIds": [], 664 | "roundness": null, 665 | "boundElements": [ 666 | { 667 | "id": "Nb5_4C9PyVmUjcNmwmtTf", 668 | "type": "arrow" 669 | } 670 | ], 671 | "updated": 1674310362127, 672 | "link": null, 673 | "locked": false, 674 | "fontSize": 20, 675 | "fontFamily": 1, 676 | "text": "Question (by 312251): Python PDF library [0.801761]", 677 | "baseline": 18, 678 | "textAlign": "left", 679 | "verticalAlign": "top", 680 | "containerId": null, 681 | "originalText": "Question (by 312251): Python PDF library [0.801761]" 682 | }, 683 | { 684 | "type": "arrow", 685 | "version": 502, 686 | "versionNonce": 978951056, 687 | "isDeleted": false, 688 | "id": "Nb5_4C9PyVmUjcNmwmtTf", 689 | "fillStyle": "hachure", 690 | "strokeWidth": 1, 691 | "strokeStyle": "solid", 692 | "roughness": 1, 693 | "opacity": 100, 694 | "angle": 0, 695 | "x": 1307.8492063492063, 696 | "y": 480.8848231593677, 697 | "strokeColor": "#000000", 698 | "backgroundColor": "#000000", 699 | "width": 217.77777777777783, 700 | "height": 31.597041919997253, 701 | "seed": 1504292752, 702 | "groupIds": [], 703 | "roundness": { 704 | "type": 2 705 | }, 706 | "boundElements": [], 707 | "updated": 1665017411344, 708 | "link": null, 709 | "locked": false, 710 | "startBinding": { 711 | "elementId": "bxrIQaIPIjEN65QVVyKjd", 712 | "focus": 0.6279335165098477, 713 | "gap": 3.8333333333332575 714 | }, 715 | "endBinding": { 716 | "elementId": 
"E-1wDKL8ZmPnn0ONPMDti", 717 | "focus": 0.3581099106309167, 718 | "gap": 6.333333333333485 719 | }, 720 | "lastCommittedPoint": null, 721 | "startArrowhead": null, 722 | "endArrowhead": "arrow", 723 | "points": [ 724 | [ 725 | 0, 726 | 0 727 | ], 728 | [ 729 | -65.55555555555566, 730 | 24.930375253330624 731 | ], 732 | [ 733 | -217.77777777777783, 734 | 31.597041919997253 735 | ] 736 | ] 737 | }, 738 | { 739 | "type": "rectangle", 740 | "version": 2726, 741 | "versionNonce": 53642316, 742 | "isDeleted": false, 743 | "id": "bxrIQaIPIjEN65QVVyKjd", 744 | "fillStyle": "hachure", 745 | "strokeWidth": 1, 746 | "strokeStyle": "solid", 747 | "roughness": 1, 748 | "opacity": 100, 749 | "angle": 0, 750 | "x": 1311.6825396825395, 751 | "y": 454.2596428571427, 752 | "strokeColor": "#5f3dc4", 753 | "backgroundColor": "#5f3dc4", 754 | "width": 219, 755 | "height": 52, 756 | "seed": 1176432016, 757 | "groupIds": [ 758 | "XHWKg8UL3ErDF5KJBC0TT" 759 | ], 760 | "roundness": null, 761 | "boundElements": [ 762 | { 763 | "id": "Y6VADemG1rq3Yf1_X3Rkw", 764 | "type": "text" 765 | }, 766 | { 767 | "id": "Qzp41i_jzQIBlAB_qFKFH", 768 | "type": "arrow" 769 | }, 770 | { 771 | "id": "Nb5_4C9PyVmUjcNmwmtTf", 772 | "type": "arrow" 773 | } 774 | ], 775 | "updated": 1674310362127, 776 | "link": null, 777 | "locked": false 778 | }, 779 | { 780 | "type": "text", 781 | "version": 1543, 782 | "versionNonce": 1775569264, 783 | "isDeleted": false, 784 | "id": "Y6VADemG1rq3Yf1_X3Rkw", 785 | "fillStyle": "hachure", 786 | "strokeWidth": 1, 787 | "strokeStyle": "solid", 788 | "roughness": 1, 789 | "opacity": 100, 790 | "angle": 0, 791 | "x": 1329.6825396825395, 792 | "y": 460.7596428571427, 793 | "strokeColor": "#000000", 794 | "backgroundColor": "#fa5252", 795 | "width": 183, 796 | "height": 39, 797 | "seed": 2097855376, 798 | "groupIds": [ 799 | "XHWKg8UL3ErDF5KJBC0TT" 800 | ], 801 | "roundness": null, 802 | "boundElements": [], 803 | "updated": 1665017413674, 804 | "link": null, 805 | "locked": false, 
806 | "fontSize": 28, 807 | "fontFamily": 1, 808 | "text": "Vector match", 809 | "baseline": 27, 810 | "textAlign": "center", 811 | "verticalAlign": "middle", 812 | "containerId": "bxrIQaIPIjEN65QVVyKjd", 813 | "originalText": "Vector match" 814 | }, 815 | { 816 | "type": "rectangle", 817 | "version": 1089, 818 | "versionNonce": 691460496, 819 | "isDeleted": false, 820 | "id": "G-qwb8bQ8dozMQZmcq1Gl", 821 | "fillStyle": "hachure", 822 | "strokeWidth": 1, 823 | "strokeStyle": "solid", 824 | "roughness": 1, 825 | "opacity": 100, 826 | "angle": 0, 827 | "x": 1319.738095238095, 828 | "y": 685.7596428571427, 829 | "strokeColor": "#03a9f4", 830 | "backgroundColor": "#03a9f4", 831 | "width": 214, 832 | "height": 49, 833 | "seed": 1905111440, 834 | "groupIds": [ 835 | "wf5G07CYJirX_YOSAFeDv" 836 | ], 837 | "roundness": null, 838 | "boundElements": [ 839 | { 840 | "id": "T7Dd9_CUf4IoQLKnCySvb", 841 | "type": "text" 842 | }, 843 | { 844 | "id": "Qzp41i_jzQIBlAB_qFKFH", 845 | "type": "arrow" 846 | }, 847 | { 848 | "id": "SJ0F0Y81z9hir5qQWAJjk", 849 | "type": "arrow" 850 | }, 851 | { 852 | "id": "6CxseRtxEY_xN1wGA8ahy", 853 | "type": "arrow" 854 | } 855 | ], 856 | "updated": 1665017719529, 857 | "link": null, 858 | "locked": false 859 | }, 860 | { 861 | "type": "text", 862 | "version": 1104, 863 | "versionNonce": 1279521680, 864 | "isDeleted": false, 865 | "id": "T7Dd9_CUf4IoQLKnCySvb", 866 | "fillStyle": "hachure", 867 | "strokeWidth": 1, 868 | "strokeStyle": "solid", 869 | "roughness": 1, 870 | "opacity": 100, 871 | "angle": 0, 872 | "x": 1380.238095238095, 873 | "y": 690.7596428571427, 874 | "strokeColor": "#000000", 875 | "backgroundColor": "#03a9f4", 876 | "width": 93, 877 | "height": 39, 878 | "seed": 2089545584, 879 | "groupIds": [ 880 | "wf5G07CYJirX_YOSAFeDv" 881 | ], 882 | "roundness": null, 883 | "boundElements": [], 884 | "updated": 1665017732426, 885 | "link": null, 886 | "locked": false, 887 | "fontSize": 28, 888 | "fontFamily": 1, 889 | "text": "Answer", 890 | 
"baseline": 27, 891 | "textAlign": "center", 892 | "verticalAlign": "middle", 893 | "containerId": "G-qwb8bQ8dozMQZmcq1Gl", 894 | "originalText": "Answer" 895 | }, 896 | { 897 | "type": "arrow", 898 | "version": 616, 899 | "versionNonce": 1298745200, 900 | "isDeleted": false, 901 | "id": "6CxseRtxEY_xN1wGA8ahy", 902 | "fillStyle": "hachure", 903 | "strokeWidth": 1, 904 | "strokeStyle": "solid", 905 | "roughness": 1, 906 | "opacity": 100, 907 | "angle": 0, 908 | "x": 1303.9603174603174, 909 | "y": 708.4075937236478, 910 | "strokeColor": "#000000", 911 | "backgroundColor": "#000000", 912 | "width": 377.7777777777778, 913 | "height": 6.2911737786592425, 914 | "seed": 1277086096, 915 | "groupIds": [], 916 | "roundness": { 917 | "type": 2 918 | }, 919 | "boundElements": [], 920 | "updated": 1665017719530, 921 | "link": null, 922 | "locked": false, 923 | "startBinding": { 924 | "elementId": "G-qwb8bQ8dozMQZmcq1Gl", 925 | "focus": 0.17330116418533134, 926 | "gap": 15.777777777777601 927 | }, 928 | "endBinding": null, 929 | "lastCommittedPoint": null, 930 | "startArrowhead": null, 931 | "endArrowhead": "arrow", 932 | "points": [ 933 | [ 934 | 0, 935 | 0 936 | ], 937 | [ 938 | -225.55555555555566, 939 | 5.1800626675482135 940 | ], 941 | [ 942 | -377.7777777777778, 943 | 6.2911737786592425 944 | ] 945 | ] 946 | }, 947 | { 948 | "type": "rectangle", 949 | "version": 1744, 950 | "versionNonce": 394840052, 951 | "isDeleted": false, 952 | "id": "mskx8L2KXgKOLHKahrjQI", 953 | "fillStyle": "hachure", 954 | "strokeWidth": 1, 955 | "strokeStyle": "solid", 956 | "roughness": 1, 957 | "opacity": 100, 958 | "angle": 0, 959 | "x": 1315.5158730158726, 960 | "y": 571.3151984126984, 961 | "strokeColor": "#fa5252", 962 | "backgroundColor": "#ff7043", 963 | "width": 218, 964 | "height": 49, 965 | "seed": 1444977008, 966 | "groupIds": [ 967 | "Ow3OuCkl-1gnPf96uZhoJ" 968 | ], 969 | "roundness": null, 970 | "boundElements": [ 971 | { 972 | "id": "dADm_k9Od8a9ANLvFiLsB", 973 | "type": "text" 974 
| }, 975 | { 976 | "id": "SJ0F0Y81z9hir5qQWAJjk", 977 | "type": "arrow" 978 | }, 979 | { 980 | "id": "f3vLDOpOTtgvPlvxSLtb6", 981 | "type": "arrow" 982 | } 983 | ], 984 | "updated": 1674310379960, 985 | "link": null, 986 | "locked": false 987 | }, 988 | { 989 | "type": "text", 990 | "version": 1496, 991 | "versionNonce": 406774092, 992 | "isDeleted": false, 993 | "id": "dADm_k9Od8a9ANLvFiLsB", 994 | "fillStyle": "hachure", 995 | "strokeWidth": 1, 996 | "strokeStyle": "solid", 997 | "roughness": 1, 998 | "opacity": 100, 999 | "angle": 0, 1000 | "x": 1353.5158730158726, 1001 | "y": 576.3151984126984, 1002 | "strokeColor": "#000000", 1003 | "backgroundColor": "#ff7043", 1004 | "width": 142, 1005 | "height": 39, 1006 | "seed": 464251760, 1007 | "groupIds": [ 1008 | "Ow3OuCkl-1gnPf96uZhoJ" 1009 | ], 1010 | "roundness": null, 1011 | "boundElements": [], 1012 | "updated": 1674310379960, 1013 | "link": null, 1014 | "locked": false, 1015 | "fontSize": 28, 1016 | "fontFamily": 1, 1017 | "text": "Metadata", 1018 | "baseline": 27, 1019 | "textAlign": "center", 1020 | "verticalAlign": "middle", 1021 | "containerId": "mskx8L2KXgKOLHKahrjQI", 1022 | "originalText": "Metadata" 1023 | }, 1024 | { 1025 | "type": "arrow", 1026 | "version": 583, 1027 | "versionNonce": 129907088, 1028 | "isDeleted": false, 1029 | "id": "f3vLDOpOTtgvPlvxSLtb6", 1030 | "fillStyle": "hachure", 1031 | "strokeWidth": 1, 1032 | "strokeStyle": "solid", 1033 | "roughness": 1, 1034 | "opacity": 100, 1035 | "angle": 0, 1036 | "x": 1305.0714285714284, 1037 | "y": 592.5642974832429, 1038 | "strokeColor": "#000000", 1039 | "backgroundColor": "#000000", 1040 | "width": 321.1111111111113, 1041 | "height": 15.467803352397482, 1042 | "seed": 449970544, 1043 | "groupIds": [], 1044 | "roundness": { 1045 | "type": 2 1046 | }, 1047 | "boundElements": [], 1048 | "updated": 1665017397537, 1049 | "link": null, 1050 | "locked": false, 1051 | "startBinding": { 1052 | "elementId": "mskx8L2KXgKOLHKahrjQI", 1053 | "focus": 
0.2891539308183631, 1054 | "gap": 10.444444444444116 1055 | }, 1056 | "endBinding": { 1057 | "elementId": "hnqGO83Op144jMURaGlCf", 1058 | "focus": 0.5894787674956782, 1059 | "gap": 19.111111111110972 1060 | }, 1061 | "lastCommittedPoint": null, 1062 | "startArrowhead": null, 1063 | "endArrowhead": "arrow", 1064 | "points": [ 1065 | [ 1066 | 0, 1067 | 0 1068 | ], 1069 | [ 1070 | -74.44444444444457, 1071 | 3.2455811301751964 1072 | ], 1073 | [ 1074 | -321.1111111111113, 1075 | 15.467803352397482 1076 | ] 1077 | ] 1078 | } 1079 | ], 1080 | "appState": { 1081 | "gridSize": null, 1082 | "viewBackgroundColor": "#fff" 1083 | }, 1084 | "files": {} 1085 | } --------------------------------------------------------------------------------