├── .gitignore
├── requirements.txt
├── run_demo.py
├── LICENSE
├── README.md
├── data
│   └── raw_data.json
└── analitika.py

/.gitignore:
--------------------------------------------------------------------------------

.DS_Store
/myenv

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
gensim==4.3.2
h5py==3.11.0
nltk==3.8.1
numpy==2.0.0
pandas==2.2.2
transformers==4.40.2

--------------------------------------------------------------------------------
/run_demo.py:
--------------------------------------------------------------------------------
import os
import sys
from config import get_config
import analitika


def run_demo():
    print("Running demo with default configuration...")
    analitika.process_data()

    print("\nRunning demo with custom tokenizer...")
    config = get_config()
    config['TOKENIZER'] = 'custom'
    # Overriding the module-level CONFIG works for keys analitika reads at call
    # time (TOKENIZER, ENABLE_AUGMENTATION); values bound at import time
    # (WHITELIST, VOCAB_SIZE, limit) keep their original settings.
    analitika.CONFIG = config
    analitika.process_data()

    print("\nRunning demo with data augmentation disabled...")
    config = get_config()
    config['ENABLE_AUGMENTATION'] = False
    analitika.CONFIG = config
    analitika.process_data()


if __name__ == '__main__':
    run_demo()

--------------------------------------------------------------------------------
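Note: `run_demo.py` (and `analitika.py` below) imports `get_config` from a `config` module that does not appear in the tree above. A minimal sketch of what such a module might look like; the key names are the ones the scripts actually read, but the default values are purely illustrative assumptions:

```python
# config.py (hypothetical sketch; only the key names come from the scripts
# above, every default value here is an illustrative assumption)
import string


def get_config():
    return {
        'TOKENIZER': 'nltk',                   # 'nltk' or 'custom'
        'ENABLE_AUGMENTATION': True,
        'USE_PRETRAINED_EMBEDDINGS': False,
        'WHITELIST': string.ascii_letters + string.digits + " .,!?'-",
        'VOCAB_SIZE': 20000,
        'MAX_DESCRIPTION_LENGTH': 500,
        'MIN_DESCRIPTION_LENGTH': 10,
        'MAX_HEADING_LENGTH': 25,
        'RAW_DATA_FILE': 'data/raw_data.json',
        'PROCESSED_DATA_FILE': 'data/processed_data.h5',
        'METADATA_FILE': 'data/metadata.pkl',
    }
```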
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Andrew Stepin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 🚀 Article Data Processing

[MIT License](https://opensource.org/licenses/MIT) · [Python 3.7](https://www.python.org/downloads/release/python-370/) · [Contributing](CONTRIBUTING.md)

This is a really old script from my IBM Watson days that I'm trying to keep fresh :)

## 🌟 So, what does it do?

- 🎯 **Efficient Processing**: Tokenize and filter articles (now ***a little bit*** faster)
- 🧠 **Pre-trained Embeddings**: Optionally load word2vec-format vectors via Gensim
- 🔮 **Data Augmentation**: Expand your dataset with shuffled and reversed variants
- 💾 **Storage**: Read and write processed data as HDF5 (plus pickled metadata)
- 🛠 **Customizable**: Hopefully! ;)

## 🚀 Quick Start

1. Clone the repo:
   ```
   git clone https://github.com/0101011/analitika.git
   ```

2. Install dependencies:
   ```
   pip install -r requirements.txt
   ```

3. Run the script:
   ```
   python analitika.py
   ```

## 📚 Table of Contents

- [Quick Start](#-quick-start)
- [Usage](#-usage)
- [Configuration](#-configuration)
- [Contributing](#-contributing)
- [License](#-license)

## 🎮 Usage

1. Place your `raw_data.json` in the `data/` directory
2. (Optional) Add pre-trained embeddings to `data/`
3. Run the script:
   ```
   python analitika.py
   ```
4. Find processed data in `data/` as HDF5 and pickle files (a quick read-back example is sketched right below)
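Once the script has run, the HDF5 output can be inspected with `h5py`. The dataset and attribute names below are the ones `save_data()` writes; the file path is an assumption, use whatever your `PROCESSED_DATA_FILE` setting points to:

```python
import h5py

# Assumed output path; substitute the value of PROCESSED_DATA_FILE from your config.
with h5py.File('data/processed_data.h5', 'r') as hf:
    idx_headings = hf['idx_headings'][:]          # shape: (num_articles, max_heading_length)
    idx_descriptions = hf['idx_descriptions'][:]  # shape: (num_articles, max_description_length)
    meta = dict(hf['metadata'].attrs)             # vocab_size, max_heading_length, max_description_length
    print(idx_headings.shape, idx_descriptions.shape, meta)
```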
## ⚙ Configuration

Customize the script through the values returned by `get_config()` (imported as `CONFIG` in `analitika.py`):

- `WHITELIST`: Characters allowed to survive filtering
- `VOCAB_SIZE`: Maximum vocabulary size
- `limit`: Minimum/maximum lengths (in words) for headings and descriptions

## 🤝 Contributing

Here are some ways you can contribute:

- 💡 My goal was to develop a package or CLI tool out of it. Maybe we'll come up with something.

## 📜 License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## 🙌 Acknowledgements

- [NLTK](https://www.nltk.org/) for natural language processing
- [Gensim](https://radimrehurek.com/gensim/) for word embeddings
- [HDF5 for Python](https://www.h5py.org/) for efficient data storage

---

Made with ❤️ by [Your Name]
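Note: the `custom` tokenizer option exercised in `run_demo.py` makes `analitika.tokenize_sentence()` run `from custom_tokenizer import custom_tokenize`, but no `custom_tokenizer` module ships with the repo. The expected interface is simply a function that takes a string and returns a list of tokens; the implementation below is only an illustrative assumption:

```python
# custom_tokenizer.py (hypothetical sketch; only the module and function names
# come from analitika.py, the regex-based implementation is an assumption)
import re
from typing import List

# Runs of alphanumerics become word tokens; any other non-space character
# becomes its own punctuation token.
_TOKEN_RE = re.compile(r"[A-Za-z0-9]+|[^\sA-Za-z0-9]")


def custom_tokenize(sentence: str) -> List[str]:
    """Split a sentence into word and punctuation tokens."""
    return _TOKEN_RE.findall(sentence)
```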
--------------------------------------------------------------------------------
/data/raw_data.json:
--------------------------------------------------------------------------------
[
  {
    "abstract": "Introduction to Machine Learning",
    "article": "Machine learning is the study of computer algorithms that improve automatically through experience. It is used across various domains, from image recognition to natural language processing."
  },
  {
    "abstract": "Web Development Trends in 2023",
    "article": "Web development continues to evolve with new trends in frameworks, tools, and best practices. This article explores upcoming trends in frontend and backend development for 2023."
  },
  {
    "abstract": "Blockchain Technology and Its Applications",
    "article": "Blockchain is a decentralized and distributed digital ledger technology that records transactions across multiple computers. This paper discusses blockchain's applications beyond cryptocurrencies, such as supply chain management and voting systems."
  },
  {
    "abstract": "Cybersecurity Best Practices for Small Businesses",
    "article": "Small businesses face increasing cyber threats. This guide provides best practices for securing networks, data, and customer information, emphasizing proactive measures and employee training."
  },
  {
    "abstract": "Data Science: Predictive Analytics with Python",
    "article": "Predictive analytics leverages historical data to predict future outcomes. This tutorial uses Python libraries like pandas and scikit-learn to build predictive models for business applications."
  },
  {
    "abstract": "Artificial Intelligence in Healthcare",
    "article": "AI is transforming healthcare with applications in medical imaging, drug discovery, and personalized medicine. This review highlights AI technologies and their potential to improve patient outcomes."
  },
  {
    "abstract": "DevOps Automation: CI/CD Pipelines",
    "article": "Continuous Integration and Continuous Deployment (CI/CD) pipelines automate software delivery processes. This article discusses CI/CD pipeline best practices and tools like Jenkins and GitLab."
  },
  {
    "abstract": "Open Source Contribution Guidelines",
    "article": "Contributing to open source projects can be rewarding. This guide outlines contribution guidelines, including code standards, issue triaging, and pull request etiquette, fostering collaboration and community growth."
  },
  {
    "abstract": "Cloud Computing: AWS vs Azure vs Google Cloud",
    "article": "AWS, Azure, and Google Cloud are leading providers of cloud computing services. This comparative analysis examines their features, pricing models, and use cases to help businesses choose the right cloud platform."
  },
  {
    "abstract": "Modern JavaScript Frameworks: React vs Vue vs Angular",
    "article": "React, Vue, and Angular are popular JavaScript frameworks for building interactive web applications. This article compares their architecture, performance, and ecosystem to aid developers in framework selection."
  },
  {
    "abstract": "Quantum Computing: Principles and Applications",
    "article": "Quantum computing harnesses quantum phenomena to process information, promising exponential speedup over classical computers. This paper explores quantum computing principles, algorithms, and potential applications."
  },
  {
    "abstract": "Deep Learning for Computer Vision",
    "article": "Deep learning has revolutionized computer vision tasks like object detection and image classification. This tutorial covers deep learning models, datasets, and training techniques for computer vision applications."
  }
]

--------------------------------------------------------------------------------
/analitika.py:
--------------------------------------------------------------------------------
import json
import itertools
import ssl
import warnings
from os import path
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from transformers import AutoTokenizer, AutoModel
import h5py

from config import get_config

CONFIG = get_config()


def download_nltk_data():
    """Make sure the NLTK 'punkt' tokenizer models are available."""
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        print("NLTK 'punkt' resource not found. Attempting to download...")
        try:
            nltk.download('punkt', quiet=True)
        except ssl.SSLError:
            warnings.warn(
                "SSL certificate verification failed. Attempting to download NLTK data without verification. "
                "This is not secure and should only be used for testing purposes.",
                UserWarning
            )
            try:
                _create_unverified_https_context = ssl._create_unverified_context
            except AttributeError:
                pass
            else:
                ssl._create_default_https_context = _create_unverified_https_context
            nltk.download('punkt', quiet=True)


# Download necessary NLTK data
download_nltk_data()

WHITELIST = CONFIG['WHITELIST']
VOCAB_SIZE = CONFIG['VOCAB_SIZE']
UNK = 'unk'

limit = {
    'max_descriptions': CONFIG['MAX_DESCRIPTION_LENGTH'],
    'min_descriptions': CONFIG['MIN_DESCRIPTION_LENGTH'],
    'max_headings': CONFIG['MAX_HEADING_LENGTH'],
    'min_headings': 0,
}


def load_raw_data(filename: str) -> List[Dict]:
    """Load the list of raw articles from a JSON file."""
    with open(filename, 'r') as fp:
        raw_data = json.load(fp)
    print(f'Loaded {len(raw_data):,} articles from {filename}')
    return raw_data


def tokenize_sentence(sentence: str) -> str:
    """Tokenize a sentence with the configured tokenizer and re-join it with spaces."""
    if CONFIG['TOKENIZER'] == 'nltk':
        return ' '.join(word_tokenize(sentence))
    elif CONFIG['TOKENIZER'] == 'custom':
        from custom_tokenizer import custom_tokenize
        return ' '.join(custom_tokenize(sentence))
    else:
        raise ValueError(f"Unsupported tokenizer: {CONFIG['TOKENIZER']}")


def article_is_complete(article: Dict) -> bool:
    """Check if an article has both heading and description."""
    return ('abstract' in article and 'article' in article
            and article['abstract'] is not None and article['article'] is not None)


def tokenize_articles(raw_data: List[Dict]) -> Tuple[List[str], List[str]]:
    """Tokenize articles and create lists of headings and descriptions."""
    headings, descriptions = [], []

    for i, a in enumerate(raw_data):
        if article_is_complete(a):
            headings.append(tokenize_sentence(a['abstract']))
            descriptions.append(tokenize_sentence(a['article']))
        if i % 1000 == 0:  # Print progress every 1000 articles
            print(f'Tokenized {i:,} / {len(raw_data):,} articles')

    return headings, descriptions


def filter_text(text: str) -> str:
    """Filter out characters not in the whitelist."""
    return ''.join(ch for ch in text if ch in WHITELIST)


def filter_length(headings: List[str], descriptions: List[str]) -> Tuple[List[str], List[str]]:
    """Filter articles based on length constraints."""
    if len(headings) != len(descriptions):
        raise ValueError('Number of headings does not match number of descriptions!')

    filtered_data = [
        (h, d) for h, d in zip(headings, descriptions)
        if (limit['min_descriptions'] <= len(d.split()) <= limit['max_descriptions'] and
            limit['min_headings'] <= len(h.split()) <= limit['max_headings'])
    ]

    if not filtered_data:
        print('No articles survived the length filter.')
        return [], []

    filtered_headings, filtered_descriptions = zip(*filtered_data)

    print(f'Length of filtered headings: {len(filtered_headings):,}')
    print(f'Length of filtered descriptions: {len(filtered_descriptions):,}')

    return list(filtered_headings), list(filtered_descriptions)


def index_data(tokenized_sentences: List[List[str]], vocab_size: int) -> Tuple[List[str], Dict[str, int], nltk.FreqDist]:
    """Form vocabulary, idx2word and word2idx dictionaries."""
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    vocab = freq_dist.most_common(vocab_size)
    print(f'Vocab length: {len(vocab):,}')

    # Index 0 is reserved for padding ('_') and index 1 for unknown words.
    idx2word = ['_', UNK] + [x[0] for x in vocab]
    word2idx = {w: i for i, w in enumerate(idx2word)}

    return idx2word, word2idx, freq_dist


def pad_seq(seq: List[str], lookup: Dict[str, int], max_length: int) -> List[int]:
    """Map words to indices and pad the sequence with zeros up to max_length."""
    indices = [lookup.get(word, lookup[UNK]) for word in seq]
    return indices + [0] * (max_length - len(seq))


def zero_pad(tokenized_headings: List[List[str]], tokenized_descriptions: List[List[str]], word2idx: Dict[str, int]) -> Tuple[np.ndarray, np.ndarray]:
    """Store indices in numpy arrays and create zero padding where required."""
    data_length = len(tokenized_descriptions)

    idx_descriptions = np.zeros((data_length, limit['max_descriptions']), dtype=np.int32)
    idx_headings = np.zeros((data_length, limit['max_headings']), dtype=np.int32)

    for i, (heading, description) in enumerate(zip(tokenized_headings, tokenized_descriptions)):
        idx_descriptions[i] = pad_seq(description, word2idx, limit['max_descriptions'])
        idx_headings[i] = pad_seq(heading, word2idx, limit['max_headings'])

    return idx_headings, idx_descriptions


def load_pretrained_embeddings(word2idx: Dict[str, int], embedding_dim: int = 300) -> np.ndarray:
    """Load pre-trained word embeddings."""
    # NOTE: replace 'path_to_pretrained_embeddings' with the path to your
    # binary word2vec file (e.g. the one placed under data/).
    model = KeyedVectors.load_word2vec_format('path_to_pretrained_embeddings', binary=True)
    embedding_matrix = np.zeros((len(word2idx), embedding_dim))

    for word, i in word2idx.items():
        if word in model.key_to_index:
            embedding_matrix[i] = model[word]

    return embedding_matrix


def augment_data(descriptions: List[str]) -> List[str]:
    """Perform data augmentation on descriptions: keep the original and add
    a randomly shuffled and a reversed variant of each one."""
    augmented_descriptions = []
    for description in descriptions:
        augmented_descriptions.append(description)
        # Add simple augmentation techniques
        augmented_descriptions.append(' '.join(np.random.permutation(description.split())))
        augmented_descriptions.append(' '.join(description.split()[::-1]))
    return augmented_descriptions


def process_data() -> Tuple[np.ndarray, np.ndarray]:
    """Process the data and prepare it for model training."""
    filename = CONFIG['RAW_DATA_FILE']
    raw_data = load_raw_data(filename)

    headings, descriptions = tokenize_articles(raw_data)

    headings = [filter_text(heading) for heading in headings]
    descriptions = [filter_text(sentence) for sentence in descriptions]
    headings, descriptions = filter_length(headings, descriptions)

    # Data augmentation
    if CONFIG['ENABLE_AUGMENTATION']:
        augmented_descriptions = augment_data(descriptions)
        # augment_data() emits several variants per description, so repeat each
        # heading accordingly to keep heading/description pairs aligned.
        factor = len(augmented_descriptions) // max(len(descriptions), 1)
        augmented_headings = [h for h in headings for _ in range(factor)]
    else:
        augmented_descriptions = descriptions
        augmented_headings = headings

    word_tokenized_headings = [word_list.split() for word_list in augmented_headings]
    word_tokenized_descriptions = [word_list.split() for word_list in augmented_descriptions]

    idx2word, word2idx, freq_dist = index_data(word_tokenized_headings + word_tokenized_descriptions, VOCAB_SIZE)

    idx_headings, idx_descriptions = zero_pad(word_tokenized_headings, word_tokenized_descriptions, word2idx)

    unk_percentage = calculate_unk_percentage(idx_headings, idx_descriptions, word2idx)
    print(f"UNK percentage: {unk_percentage:.2f}%")

    # Load pre-trained embeddings
    if CONFIG['USE_PRETRAINED_EMBEDDINGS']:
        embedding_matrix = load_pretrained_embeddings(word2idx)
    else:
        embedding_matrix = None

    article_data = {
        'word2idx': word2idx,
        'idx2word': idx2word,
        'limit': limit,
        'freq_dist': freq_dist,
        'embedding_matrix': embedding_matrix
    }

    save_data(article_data, idx_headings, idx_descriptions)

    return idx_headings, idx_descriptions


def save_data(article_data: Dict, idx_headings: np.ndarray, idx_descriptions: np.ndarray):
    """Save processed data to disk using HDF5 format."""
    with h5py.File(CONFIG['PROCESSED_DATA_FILE'], 'w') as hf:
        hf.create_dataset('idx_headings', data=idx_headings)
        hf.create_dataset('idx_descriptions', data=idx_descriptions)
        if article_data['embedding_matrix'] is not None:
            hf.create_dataset('embedding_matrix', data=article_data['embedding_matrix'])

        # Save metadata
        metadata = hf.create_group('metadata')
        metadata.attrs['vocab_size'] = len(article_data['word2idx'])
        metadata.attrs['max_heading_length'] = limit['max_headings']
        metadata.attrs['max_description_length'] = limit['max_descriptions']

    # Save other data using pickle
    with open(CONFIG['METADATA_FILE'], 'wb') as fp:
        pickle.dump({k: v for k, v in article_data.items() if k != 'embedding_matrix'}, fp)


def load_processed_data() -> Tuple[Dict, np.ndarray, np.ndarray]:
    """Load processed data from disk."""
    with h5py.File(CONFIG['PROCESSED_DATA_FILE'], 'r') as hf:
        idx_headings = hf['idx_headings'][:]
        idx_descriptions = hf['idx_descriptions'][:]
        embedding_matrix = hf['embedding_matrix'][:] if 'embedding_matrix' in hf else None

    with open(CONFIG['METADATA_FILE'], 'rb') as fp:
        article_data = pickle.load(fp)

    article_data['embedding_matrix'] = embedding_matrix
    return article_data, idx_headings, idx_descriptions


def calculate_unk_percentage(idx_headings: np.ndarray, idx_descriptions: np.ndarray,
                             word2idx: Dict[str, int]) -> float:
    """Calculate the percentage of unknown words among all non-padding tokens."""
    num_unk = np.sum(idx_headings == word2idx[UNK]) + np.sum(idx_descriptions == word2idx[UNK])
    # Index 0 is padding, so every index greater than 0 is an actual token.
    num_words = np.sum(idx_headings > 0) + np.sum(idx_descriptions > 0)
    return (num_unk / num_words) * 100


def main():
    process_data()


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
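For reference, a short programmatic usage sketch of the functions above; it assumes a `config` module like the one sketched after `run_demo.py` and a `data/raw_data.json` in place:

```python
import analitika

# Run the full pipeline: tokenize, filter, (optionally) augment, index, pad, save.
idx_headings, idx_descriptions = analitika.process_data()

# Reload what process_data() wrote to disk.
article_data, idx_headings, idx_descriptions = analitika.load_processed_data()
print(len(article_data['idx2word']), idx_headings.shape, idx_descriptions.shape)
```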