├── .gitignore
├── requirements.txt
├── run_demo.py
├── LICENSE
├── README.md
├── data
│   └── raw_data.json
└── analitika.py

/.gitignore:
--------------------------------------------------------------------------------

.DS_Store
/myenv

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
gensim==4.3.2
h5py==3.11.0
nltk==3.8.1
numpy==2.0.0
pandas==2.2.2
transformers==4.40.2

--------------------------------------------------------------------------------
/run_demo.py:
--------------------------------------------------------------------------------
import os
import sys
from config import get_config
import analitika


def run_demo():
    print("Running demo with default configuration...")
    analitika.process_data()

    print("\nRunning demo with custom tokenizer...")
    config = get_config()
    config['TOKENIZER'] = 'custom'
    # Overriding the module-level CONFIG works for keys analitika reads at call
    # time (TOKENIZER, ENABLE_AUGMENTATION); values bound at import time
    # (WHITELIST, VOCAB_SIZE, limit) keep their original settings.
    analitika.CONFIG = config
    analitika.process_data()

    print("\nRunning demo with data augmentation disabled...")
    config = get_config()
    config['ENABLE_AUGMENTATION'] = False
    analitika.CONFIG = config
    analitika.process_data()


if __name__ == '__main__':
    run_demo()

--------------------------------------------------------------------------------
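Note: `run_demo.py` (and `analitika.py` below) imports `get_config` from a `config` module that does not appear in the tree above. A minimal sketch of what such a module might look like; the key names are the ones the scripts actually read, but the default values are purely illustrative assumptions:

```python
# config.py (hypothetical sketch; only the key names come from the scripts
# above, every default value here is an illustrative assumption)
import string


def get_config():
    return {
        'TOKENIZER': 'nltk',                   # 'nltk' or 'custom'
        'ENABLE_AUGMENTATION': True,
        'USE_PRETRAINED_EMBEDDINGS': False,
        'WHITELIST': string.ascii_letters + string.digits + " .,!?'-",
        'VOCAB_SIZE': 20000,
        'MAX_DESCRIPTION_LENGTH': 500,
        'MIN_DESCRIPTION_LENGTH': 10,
        'MAX_HEADING_LENGTH': 25,
        'RAW_DATA_FILE': 'data/raw_data.json',
        'PROCESSED_DATA_FILE': 'data/processed_data.h5',
        'METADATA_FILE': 'data/metadata.pkl',
    }
```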
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Andrew Stepin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 🚀 Article Data Processing

[MIT License](https://opensource.org/licenses/MIT) · [Python 3.7](https://www.python.org/downloads/release/python-370/) · [Contributing](CONTRIBUTING.md)

This is a really old script from my IBM Watson days that I'm trying to keep fresh :)

## 🌟 So, what does it do?

- 🎯 **Efficient Processing**: Tokenize and filter articles (now ***a little bit*** faster)
- 🧠 **Pre-trained Embeddings**: Optionally load word2vec-format vectors via Gensim
- 🔮 **Data Augmentation**: Expand your dataset with shuffled and reversed variants
- 💾 **Storage**: Read and write processed data as HDF5 (plus pickled metadata)
- 🛠 **Customizable**: Hopefully! ;)

## 🚀 Quick Start

1. Clone the repo:
   ```
   git clone https://github.com/0101011/analitika.git
   ```

2. Install dependencies:
   ```
   pip install -r requirements.txt
   ```

3. Run the script:
   ```
   python analitika.py
   ```

## 📚 Table of Contents

- [Quick Start](#-quick-start)
- [Usage](#-usage)
- [Configuration](#-configuration)
- [Contributing](#-contributing)
- [License](#-license)

## 🎮 Usage

1. Place your `raw_data.json` in the `data/` directory
2. (Optional) Add pre-trained embeddings to `data/`
3. Run the script:
   ```
   python analitika.py
   ```
4. Find processed data in `data/` as HDF5 and pickle files (a quick read-back example is sketched right below)
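Once the script has run, the HDF5 output can be inspected with `h5py`. The dataset and attribute names below are the ones `save_data()` writes; the file path is an assumption, use whatever your `PROCESSED_DATA_FILE` setting points to:

```python
import h5py

# Assumed output path; substitute the value of PROCESSED_DATA_FILE from your config.
with h5py.File('data/processed_data.h5', 'r') as hf:
    idx_headings = hf['idx_headings'][:]          # shape: (num_articles, max_heading_length)
    idx_descriptions = hf['idx_descriptions'][:]  # shape: (num_articles, max_description_length)
    meta = dict(hf['metadata'].attrs)             # vocab_size, max_heading_length, max_description_length
    print(idx_headings.shape, idx_descriptions.shape, meta)
```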
## ⚙ Configuration

Customize the script through the values returned by `get_config()` (imported as `CONFIG` in `analitika.py`):

- `WHITELIST`: Characters allowed to survive filtering
- `VOCAB_SIZE`: Maximum vocabulary size
- `limit`: Minimum/maximum lengths (in words) for headings and descriptions

## 🤝 Contributing

Here are some ways you can contribute:

- 💡 My goal was to develop a package or CLI tool out of it. Maybe we'll come up with something.

## 📜 License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## 🙌 Acknowledgements

- [NLTK](https://www.nltk.org/) for natural language processing
- [Gensim](https://radimrehurek.com/gensim/) for word embeddings
- [HDF5 for Python](https://www.h5py.org/) for efficient data storage

---

Made with ❤️ by [Your Name]
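Note: the `custom` tokenizer option exercised in `run_demo.py` makes `analitika.tokenize_sentence()` run `from custom_tokenizer import custom_tokenize`, but no `custom_tokenizer` module ships with the repo. The expected interface is simply a function that takes a string and returns a list of tokens; the implementation below is only an illustrative assumption:

```python
# custom_tokenizer.py (hypothetical sketch; only the module and function names
# come from analitika.py, the regex-based implementation is an assumption)
import re
from typing import List

# Runs of alphanumerics become word tokens; any other non-space character
# becomes its own punctuation token.
_TOKEN_RE = re.compile(r"[A-Za-z0-9]+|[^\sA-Za-z0-9]")


def custom_tokenize(sentence: str) -> List[str]:
    """Split a sentence into word and punctuation tokens."""
    return _TOKEN_RE.findall(sentence)
```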
--------------------------------------------------------------------------------
/data/raw_data.json:
--------------------------------------------------------------------------------
[
  {
    "abstract": "Introduction to Machine Learning",
    "article": "Machine learning is the study of computer algorithms that improve automatically through experience. It is used across various domains, from image recognition to natural language processing."
  },
  {
    "abstract": "Web Development Trends in 2023",
    "article": "Web development continues to evolve with new trends in frameworks, tools, and best practices. This article explores upcoming trends in frontend and backend development for 2023."
  },
  {
    "abstract": "Blockchain Technology and Its Applications",
    "article": "Blockchain is a decentralized and distributed digital ledger technology that records transactions across multiple computers. This paper discusses blockchain's applications beyond cryptocurrencies, such as supply chain management and voting systems."
  },
  {
    "abstract": "Cybersecurity Best Practices for Small Businesses",
    "article": "Small businesses face increasing cyber threats. This guide provides best practices for securing networks, data, and customer information, emphasizing proactive measures and employee training."
  },
  {
    "abstract": "Data Science: Predictive Analytics with Python",
    "article": "Predictive analytics leverages historical data to predict future outcomes. This tutorial uses Python libraries like pandas and scikit-learn to build predictive models for business applications."
  },
  {
    "abstract": "Artificial Intelligence in Healthcare",
    "article": "AI is transforming healthcare with applications in medical imaging, drug discovery, and personalized medicine. This review highlights AI technologies and their potential to improve patient outcomes."
  },
  {
    "abstract": "DevOps Automation: CI/CD Pipelines",
    "article": "Continuous Integration and Continuous Deployment (CI/CD) pipelines automate software delivery processes. This article discusses CI/CD pipeline best practices and tools like Jenkins and GitLab."
  },
  {
    "abstract": "Open Source Contribution Guidelines",
    "article": "Contributing to open source projects can be rewarding. This guide outlines contribution guidelines, including code standards, issue triaging, and pull request etiquette, fostering collaboration and community growth."
  },
  {
    "abstract": "Cloud Computing: AWS vs Azure vs Google Cloud",
    "article": "AWS, Azure, and Google Cloud are leading providers of cloud computing services. This comparative analysis examines their features, pricing models, and use cases to help businesses choose the right cloud platform."
  },
  {
    "abstract": "Modern JavaScript Frameworks: React vs Vue vs Angular",
    "article": "React, Vue, and Angular are popular JavaScript frameworks for building interactive web applications. This article compares their architecture, performance, and ecosystem to aid developers in framework selection."
  },
  {
    "abstract": "Quantum Computing: Principles and Applications",
    "article": "Quantum computing harnesses quantum phenomena to process information, promising exponential speedup over classical computers. This paper explores quantum computing principles, algorithms, and potential applications."
  },
  {
    "abstract": "Deep Learning for Computer Vision",
    "article": "Deep learning has revolutionized computer vision tasks like object detection and image classification. This tutorial covers deep learning models, datasets, and training techniques for computer vision applications."
  }
]

--------------------------------------------------------------------------------
/analitika.py:
--------------------------------------------------------------------------------
import json
import itertools
import ssl
import warnings
from os import path
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from transformers import AutoTokenizer, AutoModel
import h5py

from config import get_config

CONFIG = get_config()


def download_nltk_data():
    """Make sure the NLTK 'punkt' tokenizer models are available."""
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        print("NLTK 'punkt' resource not found. Attempting to download...")
        try:
            nltk.download('punkt', quiet=True)
        except ssl.SSLError:
            warnings.warn(
                "SSL certificate verification failed. Attempting to download NLTK data without verification. "
                "This is not secure and should only be used for testing purposes.",
                UserWarning
            )
            try:
                _create_unverified_https_context = ssl._create_unverified_context
            except AttributeError:
                pass
            else:
                ssl._create_default_https_context = _create_unverified_https_context
            nltk.download('punkt', quiet=True)


# Download necessary NLTK data
download_nltk_data()

WHITELIST = CONFIG['WHITELIST']
VOCAB_SIZE = CONFIG['VOCAB_SIZE']
UNK = 'unk'

limit = {
    'max_descriptions': CONFIG['MAX_DESCRIPTION_LENGTH'],
    'min_descriptions': CONFIG['MIN_DESCRIPTION_LENGTH'],
    'max_headings': CONFIG['MAX_HEADING_LENGTH'],
    'min_headings': 0,
}


def load_raw_data(filename: str) -> List[Dict]:
    """Load the list of raw articles from a JSON file."""
    with open(filename, 'r') as fp:
        raw_data = json.load(fp)
    print(f'Loaded {len(raw_data):,} articles from {filename}')
    return raw_data


def tokenize_sentence(sentence: str) -> str:
    """Tokenize a sentence with the configured tokenizer and re-join it with spaces."""
    if CONFIG['TOKENIZER'] == 'nltk':
        return ' '.join(word_tokenize(sentence))
    elif CONFIG['TOKENIZER'] == 'custom':
        from custom_tokenizer import custom_tokenize
        return ' '.join(custom_tokenize(sentence))
    else:
        raise ValueError(f"Unsupported tokenizer: {CONFIG['TOKENIZER']}")


def article_is_complete(article: Dict) -> bool:
    """Check if an article has both heading and description."""
    return ('abstract' in article and 'article' in article
            and article['abstract'] is not None and article['article'] is not None)


def tokenize_articles(raw_data: List[Dict]) -> Tuple[List[str], List[str]]:
    """Tokenize articles and create lists of headings and descriptions."""
    headings, descriptions = [], []

    for i, a in enumerate(raw_data):
        if article_is_complete(a):
            headings.append(tokenize_sentence(a['abstract']))
            descriptions.append(tokenize_sentence(a['article']))
        if i % 1000 == 0:  # Print progress every 1000 articles
            print(f'Tokenized {i:,} / {len(raw_data):,} articles')

    return headings, descriptions


def filter_text(text: str) -> str:
    """Filter out characters not in the whitelist."""
    return ''.join(ch for ch in text if ch in WHITELIST)


def filter_length(headings: List[str], descriptions: List[str]) -> Tuple[List[str], List[str]]:
    """Filter articles based on length constraints."""
    if len(headings) != len(descriptions):
        raise ValueError('Number of headings does not match number of descriptions!')

    filtered_data = [
        (h, d) for h, d in zip(headings, descriptions)
        if (limit['min_descriptions'] <= len(d.split()) <= limit['max_descriptions'] and
            limit['min_headings'] <= len(h.split()) <= limit['max_headings'])
    ]

    if not filtered_data:
        print('No articles survived the length filter.')
        return [], []

    filtered_headings, filtered_descriptions = zip(*filtered_data)

    print(f'Length of filtered headings: {len(filtered_headings):,}')
    print(f'Length of filtered descriptions: {len(filtered_descriptions):,}')

    return list(filtered_headings), list(filtered_descriptions)


def index_data(tokenized_sentences: List[List[str]], vocab_size: int) -> Tuple[List[str], Dict[str, int], nltk.FreqDist]:
    """Form vocabulary, idx2word and word2idx dictionaries."""
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    vocab = freq_dist.most_common(vocab_size)
    print(f'Vocab length: {len(vocab):,}')

    # Index 0 is reserved for padding ('_') and index 1 for unknown words.
    idx2word = ['_', UNK] + [x[0] for x in vocab]
    word2idx = {w: i for i, w in enumerate(idx2word)}

    return idx2word, word2idx, freq_dist


def pad_seq(seq: List[str], lookup: Dict[str, int], max_length: int) -> List[int]:
    """Map words to indices and pad the sequence with zeros up to max_length."""
    indices = [lookup.get(word, lookup[UNK]) for word in seq]
    return indices + [0] * (max_length - len(seq))


def zero_pad(tokenized_headings: List[List[str]], tokenized_descriptions: List[List[str]], word2idx: Dict[str, int]) -> Tuple[np.ndarray, np.ndarray]:
    """Store indices in numpy arrays and create zero padding where required."""
    data_length = len(tokenized_descriptions)

    idx_descriptions = np.zeros((data_length, limit['max_descriptions']), dtype=np.int32)
    idx_headings = np.zeros((data_length, limit['max_headings']), dtype=np.int32)

    for i, (heading, description) in enumerate(zip(tokenized_headings, tokenized_descriptions)):
        idx_descriptions[i] = pad_seq(description, word2idx, limit['max_descriptions'])
        idx_headings[i] = pad_seq(heading, word2idx, limit['max_headings'])

    return idx_headings, idx_descriptions


def load_pretrained_embeddings(word2idx: Dict[str, int], embedding_dim: int = 300) -> np.ndarray:
    """Load pre-trained word embeddings."""
    # NOTE: replace 'path_to_pretrained_embeddings' with the path to your
    # binary word2vec file (e.g. the one placed under data/).
    model = KeyedVectors.load_word2vec_format('path_to_pretrained_embeddings', binary=True)
    embedding_matrix = np.zeros((len(word2idx), embedding_dim))

    for word, i in word2idx.items():
        if word in model.key_to_index:
            embedding_matrix[i] = model[word]

    return embedding_matrix


def augment_data(descriptions: List[str]) -> List[str]:
    """Perform data augmentation on descriptions: keep the original and add
    a randomly shuffled and a reversed variant of each one."""
    augmented_descriptions = []
    for description in descriptions:
        augmented_descriptions.append(description)
        # Add simple augmentation techniques
        augmented_descriptions.append(' '.join(np.random.permutation(description.split())))
        augmented_descriptions.append(' '.join(description.split()[::-1]))
    return augmented_descriptions


def process_data() -> Tuple[np.ndarray, np.ndarray]:
    """Process the data and prepare it for model training."""
    filename = CONFIG['RAW_DATA_FILE']
    raw_data = load_raw_data(filename)

    headings, descriptions = tokenize_articles(raw_data)

    headings = [filter_text(heading) for heading in headings]
    descriptions = [filter_text(sentence) for sentence in descriptions]
    headings, descriptions = filter_length(headings, descriptions)

    # Data augmentation
    if CONFIG['ENABLE_AUGMENTATION']:
        augmented_descriptions = augment_data(descriptions)
        # augment_data() emits several variants per description, so repeat each
        # heading accordingly to keep heading/description pairs aligned.
        factor = len(augmented_descriptions) // max(len(descriptions), 1)
        augmented_headings = [h for h in headings for _ in range(factor)]
    else:
        augmented_descriptions = descriptions
        augmented_headings = headings

    word_tokenized_headings = [word_list.split() for word_list in augmented_headings]
    word_tokenized_descriptions = [word_list.split() for word_list in augmented_descriptions]

    idx2word, word2idx, freq_dist = index_data(word_tokenized_headings + word_tokenized_descriptions, VOCAB_SIZE)

    idx_headings, idx_descriptions = zero_pad(word_tokenized_headings, word_tokenized_descriptions, word2idx)

    unk_percentage = calculate_unk_percentage(idx_headings, idx_descriptions, word2idx)
    print(f"UNK percentage: {unk_percentage:.2f}%")

    # Load pre-trained embeddings
    if CONFIG['USE_PRETRAINED_EMBEDDINGS']:
        embedding_matrix = load_pretrained_embeddings(word2idx)
    else:
        embedding_matrix = None

    article_data = {
        'word2idx': word2idx,
        'idx2word': idx2word,
        'limit': limit,
        'freq_dist': freq_dist,
        'embedding_matrix': embedding_matrix
    }

    save_data(article_data, idx_headings, idx_descriptions)

    return idx_headings, idx_descriptions


def save_data(article_data: Dict, idx_headings: np.ndarray, idx_descriptions: np.ndarray):
    """Save processed data to disk using HDF5 format."""
    with h5py.File(CONFIG['PROCESSED_DATA_FILE'], 'w') as hf:
        hf.create_dataset('idx_headings', data=idx_headings)
        hf.create_dataset('idx_descriptions', data=idx_descriptions)
        if article_data['embedding_matrix'] is not None:
            hf.create_dataset('embedding_matrix', data=article_data['embedding_matrix'])

        # Save metadata
        metadata = hf.create_group('metadata')
        metadata.attrs['vocab_size'] = len(article_data['word2idx'])
        metadata.attrs['max_heading_length'] = limit['max_headings']
        metadata.attrs['max_description_length'] = limit['max_descriptions']

    # Save other data using pickle
    with open(CONFIG['METADATA_FILE'], 'wb') as fp:
        pickle.dump({k: v for k, v in article_data.items() if k != 'embedding_matrix'}, fp)


def load_processed_data() -> Tuple[Dict, np.ndarray, np.ndarray]:
    """Load processed data from disk."""
    with h5py.File(CONFIG['PROCESSED_DATA_FILE'], 'r') as hf:
        idx_headings = hf['idx_headings'][:]
        idx_descriptions = hf['idx_descriptions'][:]
        embedding_matrix = hf['embedding_matrix'][:] if 'embedding_matrix' in hf else None

    with open(CONFIG['METADATA_FILE'], 'rb') as fp:
        article_data = pickle.load(fp)

    article_data['embedding_matrix'] = embedding_matrix
    return article_data, idx_headings, idx_descriptions


def calculate_unk_percentage(idx_headings: np.ndarray, idx_descriptions: np.ndarray,
                             word2idx: Dict[str, int]) -> float:
    """Calculate the percentage of unknown words among all non-padding tokens."""
    num_unk = np.sum(idx_headings == word2idx[UNK]) + np.sum(idx_descriptions == word2idx[UNK])
    # Index 0 is padding, so every index greater than 0 is an actual token.
    num_words = np.sum(idx_headings > 0) + np.sum(idx_descriptions > 0)
    return (num_unk / num_words) * 100


def main():
    process_data()


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
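For reference, a short programmatic usage sketch of the functions above; it assumes a `config` module like the one sketched after `run_demo.py` and a `data/raw_data.json` in place:

```python
import analitika

# Run the full pipeline: tokenize, filter, (optionally) augment, index, pad, save.
idx_headings, idx_descriptions = analitika.process_data()

# Reload what process_data() wrote to disk.
article_data, idx_headings, idx_descriptions = analitika.load_processed_data()
print(len(article_data['idx2word']), idx_headings.shape, idx_descriptions.shape)
```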