├── pyviralcontent.egg-info ├── dependency_links.txt ├── top_level.txt ├── requires.txt ├── SOURCES.txt └── PKG-INFO ├── UML.png ├── Sample.JPG ├── dist ├── pyviralcontent-0.1.4.tar.gz └── pyviralcontent-0.1.4-py3-none-any.whl ├── pyviralcontent ├── __init__.py ├── visualizer.py ├── syllable_counter.py ├── content_analyzer.py ├── text_analyzer.py ├── likert_scale.py └── readability_calculator.py ├── setup.py ├── .github └── workflows │ └── python-publish.yml ├── main.py └── README.md /pyviralcontent.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pyviralcontent.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | pyviralcontent 2 | -------------------------------------------------------------------------------- /UML.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bhaskatripathi/pyviralcontent/HEAD/UML.png -------------------------------------------------------------------------------- /Sample.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bhaskatripathi/pyviralcontent/HEAD/Sample.JPG -------------------------------------------------------------------------------- /pyviralcontent.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | matplotlib 4 | seaborn 5 | -------------------------------------------------------------------------------- /dist/pyviralcontent-0.1.4.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bhaskatripathi/pyviralcontent/HEAD/dist/pyviralcontent-0.1.4.tar.gz -------------------------------------------------------------------------------- /dist/pyviralcontent-0.1.4-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bhaskatripathi/pyviralcontent/HEAD/dist/pyviralcontent-0.1.4-py3-none-any.whl -------------------------------------------------------------------------------- /pyviralcontent/__init__.py: -------------------------------------------------------------------------------- 1 | from .syllable_counter import SyllableCounter 2 | from .text_analyzer import TextAnalyzer 3 | from .likert_scale import LikertScale 4 | from .readability_calculator import ReadabilityCalculator 5 | from .visualizer import Visualizer 6 | from .content_analyzer import ContentAnalyzer 7 | 8 | __all__ = [ 9 | 'SyllableCounter', 'TextAnalyzer', 'LikertScale', 10 | 'ReadabilityCalculator', 'Visualizer', 'ContentAnalyzer' 11 | ] 12 | -------------------------------------------------------------------------------- /pyviralcontent.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | pyviralcontent/__init__.py 4 | pyviralcontent/content_analyzer.py 5 | pyviralcontent/likert_scale.py 6 | pyviralcontent/readability_calculator.py 7 | pyviralcontent/syllable_counter.py 8 | pyviralcontent/text_analyzer.py 9 | pyviralcontent/visualizer.py 10 | pyviralcontent.egg-info/PKG-INFO 11 | pyviralcontent.egg-info/SOURCES.txt 12 | pyviralcontent.egg-info/dependency_links.txt 13 | pyviralcontent.egg-info/requires.txt 14 | pyviralcontent.egg-info/top_level.txt -------------------------------------------------------------------------------- /pyviralcontent/visualizer.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import pandas as pd 4 | import numpy as np 5 | 6 | # This class provides visualization functionality. It can plot a heatmap of readability scores for each test, providing a visual representation of the analysis results. 7 | class Visualizer: 8 | @staticmethod 9 | def plot_scores_heatmap(df, content_type): 10 | fig, ax = plt.subplots(figsize=(8, 1), dpi=300) 11 | heatmap_data = pd.pivot_table(data=df, values='Score', index=['Test'], aggfunc=np.mean) 12 | sns.heatmap(heatmap_data, annot=True, fmt=".1f", linewidths=.5, ax=ax) 13 | plt.title(f'Readability Scores for {content_type.capitalize()} Content') 14 | plt.show() -------------------------------------------------------------------------------- /pyviralcontent/syllable_counter.py: -------------------------------------------------------------------------------- 1 | 2 | # This class is responsible for counting the syllables in a word. It identifies vowels in a word and applies rules to count syllables, 3 | # which is essential for readability analysis. 4 | class SyllableCounter: 5 | def __init__(self): 6 | self.vowels = "aeiouy" 7 | 8 | def count(self, word): 9 | word = word.lower() 10 | syllable_count = 0 11 | if word[0] in self.vowels: 12 | syllable_count += 1 13 | for index in range(1, len(word)): 14 | if word[index] in self.vowels and word[index - 1] not in self.vowels: 15 | syllable_count += 1 16 | if word.endswith("e"): 17 | syllable_count -= 1 18 | if syllable_count == 0: 19 | syllable_count += 1 20 | return syllable_count -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import codecs 3 | import os 4 | 5 | # read the contents of your README file 6 | this_directory = os.path.abspath(os.path.dirname(__file__)) 7 | with codecs.open(os.path.join(this_directory, 'README.md'), encoding='utf-8') as f: 8 | long_description = f.read() 9 | 10 | setup( 11 | name='pyviralcontent', 12 | version='0.1.4', 13 | packages=find_packages(), 14 | install_requires=[ 15 | 'pandas', 16 | 'numpy', 17 | 'matplotlib', 18 | 'seaborn' 19 | ], 20 | author='Bhaskar Tripathi', 21 | author_email='bhaskar.tripathi@gmail.com', 22 | description='A package for analyzing content readability and virality potential.', 23 | long_description=long_description, 24 | long_description_content_type='text/markdown', 25 | keywords='readability virality content-analysis', 26 | url='https://github.com/bhaskatripathi/pyviralcontent', 27 | ) 28 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN_VIRAL_CONTENT }} 40 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #from pyviralcontent import ContentAnalyzer 2 | from pyviralcontent.content_analyzer import ContentAnalyzer 3 | 4 | def main(): 5 | # Mapping of content type numbers to content type names 6 | content_type_map = { 7 | 0: 'scientific', 8 | 1: 'blog', 9 | 2: 'video', 10 | 3: 'technical', 11 | 4: 'fictional', 12 | 5: 'legal', 13 | 6: 'educational', 14 | 7: 'news', 15 | 8: 'advertising', 16 | 9: 'social_media' 17 | } 18 | 19 | # Prompt user for a content type number 20 | content_type_number = int(input("Enter a number for the content type (0 for scientific, 1 for blog, ..., 9 for social_media): ")) 21 | 22 | # Get the content type name from the content_type_map 23 | content_type_name = content_type_map.get(content_type_number) 24 | 25 | if content_type_name is None: 26 | print("Invalid content type number.") 27 | else: 28 | # User to input the text content 29 | text_content = input("Enter the text content for analysis:\n") 30 | 31 | # Create instance of ContentAnalyzer with the chosen content type 32 | analyzer = ContentAnalyzer(text_content, content_type_name) 33 | 34 | # Perform the analysis 35 | df, viral_probability = analyzer.analyze() 36 | 37 | # Print the results 38 | print(f"Readability Scores Summary for {content_type_name.capitalize()} Content:") 39 | print(df) 40 | print(f"The probability of the content going viral is: {viral_probability * 100:.2f}%") 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /pyviralcontent/content_analyzer.py: -------------------------------------------------------------------------------- 1 | from .readability_calculator import ReadabilityCalculator 2 | from .likert_scale import LikertScale 3 | from .visualizer import Visualizer 4 | 5 | # This class acts as a high-level interface for content analysis. 6 | # It utilizes ReadabilityCalculator, LikertScale, and Visualizer to perform a comprehensive readability analysis, including calculating scores, 7 | # determining average scores, calculating virality probability, and visualizing results. 8 | class ContentAnalyzer: 9 | def __init__(self, text_content, content_type): 10 | self.text_content = text_content 11 | self.content_type = content_type 12 | self.readability_calculator = ReadabilityCalculator() 13 | self.likert_scale = LikertScale() 14 | self.visualizer = Visualizer() 15 | 16 | def analyze(self): 17 | # Calculate readability scores 18 | df = self.readability_calculator.calculate_scores_by_content_type(self.text_content, self.content_type) 19 | # Calculate the average score and determine the overall Likert scale and quality 20 | average_score, overall_likert, overall_quality = self.likert_scale.calculate_average_score(df) 21 | df.loc[df.shape[0]] = ['OVERALL SCORE', average_score, overall_likert, overall_quality] 22 | # Calculate the probability of the content going viral 23 | #viral_probability = self.readability_calculator.calculate_virality_probability(df) 24 | viral_probability = self.readability_calculator.calculate_virality_probability(df,self.content_type) 25 | # Visualize the readability scores heatmap 26 | self.visualizer.plot_scores_heatmap(df, self.content_type) 27 | return df, viral_probability -------------------------------------------------------------------------------- /pyviralcontent/text_analyzer.py: -------------------------------------------------------------------------------- 1 | from .syllable_counter import SyllableCounter 2 | 3 | # This class performs various text analysis operations. It utilizes the SyllableCounter to perform syllable-related computations 4 | # and calculates different readability scores such as Flesch Reading Ease, Gunning Fog Index, etc. 5 | class TextAnalyzer: 6 | def __init__(self): 7 | self.syllable_counter = SyllableCounter() 8 | 9 | def complex_word_count(self, text): 10 | words = text.split() 11 | complex_words = [word for word in words if self.syllable_counter.count(word) >= 3] 12 | return len(complex_words) 13 | 14 | def flesch_reading_ease(self, total_sentences, total_words, total_syllables): 15 | return 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (total_syllables / total_words) 16 | 17 | def flesch_kincaid(self, total_sentences, total_words, total_syllables): 18 | return 0.39 * (total_words / total_sentences) + 11.8 * (total_syllables / total_words) - 15.59 19 | 20 | def gunning_fog(self, total_sentences, total_words, total_complex_words): 21 | return 0.4 * ((total_words / total_sentences) + 100 * (total_complex_words / total_words)) 22 | 23 | def smog(self, total_sentences, total_complex_words): 24 | return 1.0430 * (30 * (total_complex_words / total_sentences)) ** 0.5 + 3.1291 25 | 26 | def linsear_write(self, text, total_sentences): 27 | sample = text.split()[:100] 28 | easy_word = len([word for word in sample if self.syllable_counter.count(word) < 3]) 29 | hard_word = len(sample) - easy_word 30 | return (easy_word + (hard_word * 3)) / total_sentences 31 | 32 | def coleman_liau(self, total_sentences, total_words, total_characters): 33 | return 0.0588 * (total_characters / total_words * 100) - 0.296 * (total_sentences / total_words * 100) - 15.8 34 | 35 | def ari(self, total_sentences, total_words, total_characters): 36 | return 4.71 * (total_characters / total_words) + 0.5 * (total_words / total_sentences) - 21.43 37 | 38 | def calculate_readability_scores(self, text): 39 | sentences = text.split('.') 40 | words = text.split() 41 | characters = ''.join(words) 42 | syllables = sum(self.syllable_counter.count(word) for word in words) 43 | complex_words = self.complex_word_count(text) 44 | 45 | # Total counts 46 | total_sentences = len(sentences) 47 | total_words = len(words) 48 | total_characters = len(characters) 49 | total_syllables = syllables 50 | total_complex_words = complex_words 51 | 52 | return { 53 | 'flesch_reading_ease': self.flesch_reading_ease(total_sentences, total_words, total_syllables), 54 | 'flesch_kincaid': self.flesch_kincaid(total_sentences, total_words, total_syllables), 55 | 'gunning_fog': self.gunning_fog(total_sentences, total_words, total_complex_words), 56 | 'smog': self.smog(total_sentences, total_complex_words), 57 | 'linsear_write': self.linsear_write(text, total_sentences), 58 | 'coleman_liau': self.coleman_liau(total_sentences, total_words, total_characters), 59 | 'ari': self.ari(total_sentences, total_words, total_characters) 60 | } 61 | -------------------------------------------------------------------------------- /pyviralcontent/likert_scale.py: -------------------------------------------------------------------------------- 1 | 2 | # This class manages the Likert scale interpretation and qualitative descriptors for readability scores. 3 | # It provides methods to determine the Likert scale based on scores and to get qualitative descriptors for Likert scores. 4 | class LikertScale: 5 | def __init__(self): 6 | self.likert_scale_interpretation = { 7 | 'flesch_reading_ease': [ 8 | (lambda score: self.in_range(score, 90, float('inf')), 5), # Very Easy 9 | (lambda score: self.in_range(score, 70, 90), 4), # Easy 10 | (lambda score: self.in_range(score, 50, 70), 3), # Fairly Easy 11 | (lambda score: self.in_range(score, 30, 50), 2), # Difficult 12 | (lambda score: self.in_range(score, 0, 30), 1), # Very Confusing 13 | ], 14 | 'flesch_kincaid': [ 15 | (lambda score: score <= 5, 5), # Very Easy 16 | (lambda score: self.in_range(score, 5, 6), 4), # Easy 17 | (lambda score: self.in_range(score, 6, 7), 3), # Fairly Easy 18 | (lambda score: self.in_range(score, 7, 9), 2), # Difficult 19 | (lambda score: score >= 9, 1), # Very Confusing 20 | ], 21 | 'gunning_fog': [ 22 | (lambda score: score <= 6, 5), # Very Easy 23 | (lambda score: self.in_range(score, 6, 8), 4), # Easy 24 | (lambda score: self.in_range(score, 8, 12), 3), # Fairly Easy 25 | (lambda score: self.in_range(score, 12, 17), 2), # Difficult 26 | (lambda score: score >= 17, 1), # Very Confusing 27 | ], 28 | 'smog': [ 29 | (lambda score: score <= 6, 5), # Very Easy 30 | (lambda score: self.in_range(score, 6, 8), 4), # Easy 31 | (lambda score: self.in_range(score, 8, 12), 3), # Fairly Easy 32 | (lambda score: self.in_range(score, 12, 14), 2), # Difficult 33 | (lambda score: score >= 14, 1), # Very Confusing 34 | ], 35 | 'linsear_write': [ 36 | (lambda score: score <= 5, 5), # Very Easy 37 | (lambda score: self.in_range(score, 5, 8), 4), # Easy 38 | (lambda score: self.in_range(score, 8, 12), 3), # Fairly Easy 39 | (lambda score: self.in_range(score, 12, 15), 2),# Difficult 40 | (lambda score: score >= 15, 1), # Very Confusing 41 | ], 42 | 'coleman_liau': [ 43 | (lambda score: score <= 5, 5), # Very Easy 44 | (lambda score: self.in_range(score, 5, 8), 4), # Easy 45 | (lambda score: self.in_range(score, 8, 12), 3), # Fairly Easy 46 | (lambda score: self.in_range(score, 12, 15), 2),# Difficult 47 | (lambda score: score >= 15, 1), # Very Confusing 48 | ], 49 | 'ari': [ 50 | (lambda score: score <= 2, 5), # Very Easy 51 | (lambda score: self.in_range(score, 2, 4), 4), # Easy 52 | (lambda score: self.in_range(score, 4, 7), 3), # Fairly Easy 53 | (lambda score: self.in_range(score, 7, 10), 2), # Difficult 54 | (lambda score: score >= 10, 1), # Very Confusing 55 | ], 56 | 'default': [ 57 | (lambda score: score <= 2, 5), # Very Easy 58 | (lambda score: self.in_range(score, 2, 4), 4), # Easy 59 | (lambda score: self.in_range(score, 4, 6), 3), # Fairly Easy 60 | (lambda score: self.in_range(score, 6, 8), 2), # Difficult 61 | (lambda score: score >= 8, 1), # Very Confusing 62 | ] 63 | } 64 | 65 | self.qualitative_descriptors = { 66 | 5: 'Excellent/Very Clear', 67 | 4: 'Good/Clear', 68 | 3: 'Average/Somewhat Clear', 69 | 2: 'Below Average/Confusing', 70 | 1: 'Poor/Unclear', 71 | 0: 'Very Poor/Very Unclear' 72 | } 73 | 74 | @staticmethod 75 | def in_range(score, start, end): 76 | return start <= score < end 77 | 78 | def determine_likert_scale(self, score, test_name): 79 | scale_ranges = self.likert_scale_interpretation.get(test_name, self.likert_scale_interpretation['default']) 80 | for check_func, likert_value in scale_ranges: 81 | if check_func(score): 82 | return likert_value 83 | return 0 84 | 85 | def max_scale(self, test_name): 86 | if test_name in self.likert_scale_interpretation: 87 | # Find the max Likert scale value for the test 88 | return max(value for _, value in self.likert_scale_interpretation[test_name]) 89 | else: 90 | # Default max Likert scale value 91 | return max(value for _, value in self.likert_scale_interpretation['default']) 92 | 93 | def get_qualitative_descriptor(self, likert_score): 94 | return self.qualitative_descriptors.get(likert_score, 'Undefined') 95 | 96 | def calculate_average_score(self, df): 97 | average_score = df['Score'].mean() 98 | overall_likert = self.determine_likert_scale(average_score, 'default') 99 | overall_quality = self.get_qualitative_descriptor(overall_likert) 100 | return average_score, overall_likert, overall_quality -------------------------------------------------------------------------------- /pyviralcontent/readability_calculator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from .text_analyzer import TextAnalyzer 4 | from .likert_scale import LikertScale 5 | 6 | # This class is responsible for calculating readability scores for a given text based on content type. 7 | # It uses TextAnalyzer to compute scores and LikertScale for interpreting scores. 8 | # It also provides functionality to calculate the probability of content going viral. 9 | class ReadabilityCalculator: 10 | def __init__(self): 11 | self.content_type_formulas = { 12 | 'scientific': ['gunning_fog', 'coleman_liau', 'ari'], 13 | 'blog': ['flesch_reading_ease', 'flesch_kincaid'], 14 | 'video': ['smog', 'flesch_reading_ease'], 15 | 'technical': ['linsear_write', 'ari'], 16 | 'fictional': ['flesch_kincaid', 'coleman_liau'], 17 | 'legal': ['gunning_fog', 'smog'], 18 | 'educational': ['flesch_kincaid', 'linsear_write'], 19 | 'news': ['flesch_reading_ease', 'gunning_fog'], 20 | 'advertising': ['flesch_reading_ease', 'coleman_liau'], 21 | 'social_media': ['flesch_reading_ease', 'linsear_write'] 22 | } 23 | self.text_analyzer = TextAnalyzer() 24 | self.likert_scale = LikertScale() 25 | 26 | def calculate_scores_by_content_type(self, text, content_type): 27 | scores = self.text_analyzer.calculate_readability_scores(text) 28 | selected_scores = {test: score for test, score in scores.items() if test in self.content_type_formulas[content_type]} 29 | 30 | df = pd.DataFrame(list(selected_scores.items()), columns=['Test', 'Score']) 31 | df['Likert_Scale'] = df.apply(lambda row: self.likert_scale.determine_likert_scale(row['Score'], row['Test']), axis=1) 32 | df['Qualitative_Descriptors'] = df['Likert_Scale'].apply(lambda x: self.likert_scale.get_qualitative_descriptor(x)) 33 | return df 34 | 35 | def calculate_virality_probability(self, df): 36 | virality_weights = {'flesch_reading_ease': 0.15,'flesch_kincaid': 0.15,'gunning_fog': 0.1,'smog': 0.1,'linsear_write': 0.1,'coleman_liau': 0.2,'ari': 0.2} 37 | df['Weighted_Likert'] = df['Test'].map(virality_weights).fillna(0) * df['Likert_Scale'] 38 | virality_score = df['Weighted_Likert'].sum() / df['Test'].map(virality_weights).fillna(0).sum() 39 | probability_of_going_viral = virality_score / 5 # Assuming 5 is the max Likert scale value 40 | return probability_of_going_viral 41 | 42 | def calculate_scores_by_content_type_keener(self, text, content_type): 43 | scores_dict = self.text_analyzer.calculate_readability_scores(text) 44 | selected_scores_dict = {test: score for test, score in scores_dict.items() if test in self.content_type_formulas[content_type]} 45 | 46 | scores = list(selected_scores_dict.values()) 47 | keener_scores = self.keener_method(scores) 48 | 49 | keener_df = pd.DataFrame(list(selected_scores_dict.keys()), columns=['Test']) 50 | keener_df['Score'] = keener_scores 51 | keener_df['Likert_Scale'] = keener_df['Score'].apply(lambda x: self.likert_scale.determine_likert_scale(x, 'default')) 52 | keener_df['Qualitative_Descriptors'] = keener_df['Likert_Scale'].apply(lambda x: self.likert_scale.get_qualitative_descriptor(x)) 53 | 54 | overall_score = np.dot(keener_scores, scores) 55 | overall_likert = self.likert_scale.determine_likert_scale(overall_score, 'default') 56 | overall_quality = self.likert_scale.get_qualitative_descriptor(overall_likert) 57 | 58 | overall_df = pd.DataFrame([['OVERALL SCORE', overall_score, overall_likert, overall_quality]], columns=['Test', 'Score', 'Likert_Scale', 'Qualitative_Descriptors']) 59 | 60 | return pd.concat([keener_df, overall_df], ignore_index=True) 61 | 62 | # def calculate_virality_probability(self, df): 63 | # virality_weights = { 64 | # 'flesch_reading_ease': 0.15, 65 | # 'flesch_kincaid': 0.15, 66 | # 'gunning_fog': 0.1, 67 | # 'smog': 0.1, 68 | # 'linsear_write': 0.1, 69 | # 'coleman_liau': 0.2, 70 | # 'ari': 0.2 71 | # } 72 | # df['Weighted_Likert'] = df['Test'].map(virality_weights).fillna(0) * df['Likert_Scale'] 73 | # virality_score = df['Weighted_Likert'].sum() / df['Test'].map(virality_weights).fillna(0).sum() 74 | # probability_of_going_viral = virality_score / 5 # Assuming 5 is the max Likert scale value 75 | # return probability_of_going_viral 76 | 77 | def calculate_virality_probability(self, df, content_type): 78 | virality_weights = {'flesch_reading_ease': 0.15,'flesch_kincaid': 0.15,'gunning_fog': 0.1, 79 | 'smog': 0.1,'linsear_write': 0.1,'coleman_liau': 0.2,'ari': 0.2} 80 | df['Weighted_Likert'] = df['Test'].map(virality_weights).fillna(0) * df['Likert_Scale'] 81 | virality_score = df['Weighted_Likert'].sum() 82 | # Get the tests relevant for the content type 83 | relevant_tests = self.content_type_formulas[content_type] 84 | # Calculate the sum of the maximum possible weighted scores for the relevant tests 85 | max_weighted_score = sum(virality_weights[test] * self.likert_scale.max_scale(test) for test in relevant_tests if test in virality_weights) 86 | # Normalize the virality score by the maximum possible weighted score 87 | probability_of_going_viral = virality_score / max_weighted_score if max_weighted_score else 0 88 | return probability_of_going_viral 89 | 90 | @staticmethod 91 | def kappa(x): 92 | return 0.5 + 0.5 * np.sign(x - 0.5) * np.sqrt(abs(2 * x - 1)) 93 | 94 | def keener_method(self, scores): 95 | N = len(scores) 96 | S = np.zeros((N, N)) 97 | for i in range(N): 98 | for j in range(N): 99 | if i != j: 100 | S[i, j] = 1 / scores[i] if scores[i] > 0 else 0 101 | 102 | K = np.zeros((N, N)) 103 | for i in range(N): 104 | for j in range(N): 105 | if i != j: 106 | K[i, j] = self.kappa((1 + S[i, j]) / (2 + S[i, j] + S[j, i])) 107 | else: 108 | K[i, j] = 0 109 | 110 | eigenvalues, eigenvectors = np.linalg.eig(K) 111 | max_eigenvalue_index = np.argmax(eigenvalues.real) 112 | ratings = eigenvectors[:, max_eigenvalue_index].real 113 | normalized_ratings = ratings / np.sum(ratings) 114 | 115 | return normalized_ratings -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyViralContent 2 | 3 | `pyviralcontent` is a Python package designed to assess the readability of various types of content and predict the and the probability of the content going viral. It employs multiple readability tests and translates numerical scores into qualitative descriptors based on the Likert scale. The package supports various types of content, allowing for a tailored analysis based on the specific nature of the content. 4 | 5 | ## Supported Content Types 6 | 7 | The package currently supports the following content types: 8 | 9 | - `scientific` 10 | - `blog` 11 | - `video` 12 | - `technical` 13 | - `fictional` 14 | - `legal` 15 | - `educational` 16 | - `news` 17 | - `advertising` 18 | - `social_media` 19 | 20 | ## Installation 21 | 22 | ``` 23 | pip install pyviralcontent 24 | ``` 25 | 26 | ## Usage 27 | To analyze your content, you can use the `ContentAnalyzer` class from the `pyviralcontent` package. Here's a simple example of how to use the `ContentAnalyzer` to analyze different types of content: 28 | 29 | 30 | ```python 31 | from pyviralcontent import ContentAnalyzer 32 | 33 | def test_content_analysis(content_type, text_content): 34 | """ 35 | Test the content analysis for a given type of content and text content. 36 | 37 | :param content_type: The type of the content (e.g., 'scientific', 'blog', etc.). 38 | :param text_content: The actual content to analyze. 39 | """ 40 | # Create an instance of ContentAnalyzer 41 | analyzer = ContentAnalyzer(text_content, content_type) 42 | # Perform the analysis 43 | df, viral_probability = analyzer.analyze() 44 | # Print the results 45 | print(f"\nReadability Scores Summary for {content_type.capitalize()} Content:") 46 | display(df) 47 | print(f"The probability of the content going viral is: {viral_probability * 100:.2f}%") 48 | 49 | # Example 1: Scientific content 50 | test_content_analysis( 51 | 'scientific', 52 | "Implement an accessibility-first approach in the design of the website. This includes: • High-contrast visuals for low-vision users. • Text-to-speech functionality for all text, including product descriptions and checkout processes.• Easy keyboard navigation for those unable to use a mouse." 53 | ) 54 | 55 | # Example 2: Blog content 56 | test_content_analysis( 57 | 'blog', 58 | "Today's blog post discusses the importance of user experience design. A good design ensures that users find joy and satisfaction in the interaction with the product, making it an essential aspect of product development." 59 | ) 60 | 61 | # Example 3: Technical content 62 | test_content_analysis( 63 | 'technical', 64 | "The module utilizes an advanced algorithm for data processing, ensuring high performance and reliability. It's optimized for multi-threaded environments, offering significant improvements in processing speed and efficiency." 65 | ) 66 | 67 | # Example 4: Fictional content 68 | test_content_analysis( 69 | 'fictional', 70 | "In the distant future, humanity has reached the stars. Each galaxy is a new frontier, and every planet a new adventure. Join our heroes as they navigate through cosmic dangers and discover the mysteries of the universe." 71 | ) 72 | 73 | # Example 5: Legal content 74 | test_content_analysis( 75 | 'legal', 76 | "The contract stipulates the terms and conditions of the agreement and is legally binding to both parties involved. It outlines the responsibilities, duties, and liabilities in clear, unambiguous language to prevent any misunderstandings." 77 | ) 78 | 79 | # Example 6: Educational content 80 | test_content_analysis( 81 | 'educational', 82 | "Today's lesson covers the fundamental principles of physics. We'll explore Newton's laws of motion, the concept of gravity, and the principles of energy and momentum. Each concept will be demonstrated with real-life examples and interactive experiments." 83 | ) 84 | 85 | # Example 7: News content 86 | test_content_analysis( 87 | 'news', 88 | "In today's news, the local community is coming together to support the annual food drive. Last year's drive helped over a thousand families, and this year the organizers hope to double that number with the help of generous donations and volunteer work." 89 | ) 90 | 91 | # Example 8: Advertising content 92 | test_content_analysis( 93 | 'advertising', 94 | "Introducing the latest innovation in home cleaning! Our new vacuum cleaner is equipped with advanced technology to clean your home efficiently and effortlessly. Say goodbye to dust and hello to spotless floors!" 95 | ) 96 | 97 | # Example 9: Social Media content 98 | test_content_analysis( 99 | 'social_media', 100 | "Just finished an amazing workout at the gym! 💪 Feeling energized and ready to take on the day. Remember, a healthy lifestyle is not just a goal, it's a way of living. #FitnessGoals #HealthyLiving" 101 | ) 102 | 103 | # Example 10: Video content 104 | test_content_analysis( 105 | 'video', 106 | "In this video, we'll take a closer look at the intricate ecosystem of the Amazon rainforest. Discover the diverse species that call it home, and learn about the critical role it plays in our planet's climate system." 107 | ) 108 | ``` 109 | ![Sample Image](https://github.com/bhaskatripathi/pyviralcontent/blob/main/Sample.JPG?raw=true) 110 | 111 | 112 | ## Features 113 | 114 | - Multiple readability tests for different content types. 115 | - Qualitative descriptors based on the Likert scale. 116 | - Estimation of content's virality potential. 117 | - Supported content types include: scientific, blog, video, technical, fictional, legal, educational, news, advertising, social_media. 118 | 119 | ## How it Works? 120 | 121 | The `PyViralContent` package offers a sophisticated approach to analyzing textual content by recognizing that no single readability metric is representative fits all content types. This is essential a Multi Criteria Decision Analysis Problem which is solved using Keener's method. Different types of content have unique stylistic and structural characteristics, and the package addresses this by associating specific readability formulas with each content type. This method ensures a nuanced analysis and provides a more accurate reflection of the content's readability and potential virality. 122 | 123 | ### Content Type Formulas 124 | 125 | The package defines `content_type_formulas`, a mapping of content types to the sets of readability formulas that are best suited for those types. Here's the association between content types and their corresponding readability formulas. These formulae have been integrated using Keener's MCDA method. Keener's method computes the eigenvector corresponding to the largest eigenvalue of a certain matrix derived from the pairwise comparisons. This eigenvector provides the weights or ratings of the items being compared, reflecting their relative importance or dominance in the context of the comparison. 126 | ![UML](https://github.com/bhaskatripathi/pyviralcontent/blob/main/UML.png) 127 | For a detailed explanation of Keener's method and its applications, please refer to the following resource: 128 | [Understanding Keener's Method (PDF)](https://www.dcs.bbk.ac.uk/~ale/dsta+dsat/dsta+dsat-3/lm-ch3-keener.pdf) 129 | 130 | The `PyViralContent` package integrates Keener's method in its analytical engine to enhance the robustness and depth of the content analysis, offering users a sophisticated tool for assessing the potential impact and reach of their content. 131 | 132 | | Content Type | Readability Formulas Used | 133 | |---------------|-----------------------------------------| 134 | | Scientific | Gunning Fog, Coleman Liau, ARI | 135 | | Blog | Flesch Reading Ease, Flesch Kincaid | 136 | | Video | SMOG, Flesch Reading Ease | 137 | | Technical | Linsear Write, ARI | 138 | | Fictional | Flesch Kincaid, Coleman Liau | 139 | | Legal | Gunning Fog, SMOG | 140 | | Educational | Flesch Kincaid, Linsear Write | 141 | | News | Flesch Reading Ease, Gunning Fog | 142 | | Advertising | Flesch Reading Ease, Coleman Liau | 143 | | Social Media | Flesch Reading Ease, Linsear Write | 144 | 145 | ### Interpretation with Likert Scale 146 | 147 | The results from the readability formulas are interpreted using a Likert scale, which provides a qualitative measure of the content's readability. This scale is not one-size-fits-all; it is tailored to each readability formula to accurately reflect the nuances of each metric. Here's how the Likert scale is applied for each readability formula: 148 | 149 | | Readability Formula | Likert Scale Interpretation (Score Range) | Qualitative Descriptor | 150 | |-------------------------|--------------------------------------------|----------------------------| 151 | | Flesch Reading Ease | 90-inf: 5, 70-90: 4, 50-70: 3, 30-50: 2, 0-30: 1 | Very Easy to Very Confusing | 152 | | Flesch Kincaid | <=5: 5, 5-6: 4, 6-7: 3, 7-9: 2, >=9: 1 | Very Easy to Very Confusing | 153 | | Gunning Fog | <=6: 5, 6-8: 4, 8-12: 3, 12-17: 2, >=17: 1 | Very Easy to Very Confusing | 154 | | SMOG | <=6: 5, 6-8: 4, 8-12: 3, 12-14: 2, >=14: 1 | Very Easy to Very Confusing | 155 | | Linsear Write | <=5: 5, 5-8: 4, 8-12: 3, 12-15: 2, >=15: 1 | Very Easy to Very Confusing | 156 | | Coleman Liau | <=5: 5, 5-8: 4, 8-12: 3, 12-15: 2, >=15: 1 | Very Easy to Very Confusing | 157 | | ARI | <=2: 5, 2-4: 4, 4-7: 3, 7-10: 2, >=10: 1 | Very Easy to Very Confusing | 158 | 159 | These ranges and descriptors ensure that the readability score is not just a number, but a meaningful indicator of how the content will likely be received by the intended audience. The `PyViralContent` package provides a detailed output, including both the readability scores from each formula used and the overall virality probability, offering valuable insights into the potential reach and impact of the content analyzed. 160 | 161 | 162 | ## Contributing 163 | 164 | Contributions to `pyviralcontent` are welcome! Please feel free to submit issues, fork the repository, and create pull requests. 165 | 166 | ## License 167 | 168 | This project is licensed under the MIT License - see the LICENSE file for details. 169 | 170 | ## Contact 171 | 172 | Bhaskar Tripathi - bhaskar.tripathi@gmail.com 173 | GitHub: https://github.com/bhaskatripathi/pyviralcontent 174 | -------------------------------------------------------------------------------- /pyviralcontent.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: pyviralcontent 3 | Version: 0.1.4 4 | Summary: A package for analyzing content readability and virality potential. 5 | Home-page: https://github.com/bhaskatripathi/pyviralcontent 6 | Author: Bhaskar Tripathi 7 | Author-email: bhaskar.tripathi@gmail.com 8 | Keywords: readability virality content-analysis 9 | Description-Content-Type: text/markdown 10 | 11 | # PyViralContent 12 | 13 | `pyviralcontent` is a Python package designed to assess the readability of various types of content and predict the and the probability of the content going viral. It employs multiple readability tests and translates numerical scores into qualitative descriptors based on the Likert scale. The package supports various types of content, allowing for a tailored analysis based on the specific nature of the content. 14 | 15 | ## Supported Content Types 16 | 17 | The package currently supports the following content types: 18 | 19 | - `scientific` 20 | - `blog` 21 | - `video` 22 | - `technical` 23 | - `fictional` 24 | - `legal` 25 | - `educational` 26 | - `news` 27 | - `advertising` 28 | - `social_media` 29 | 30 | ## Installation 31 | 32 | ``` 33 | pip install pyviralcontent 34 | ``` 35 | 36 | ## Usage 37 | To analyze your content, you can use the `ContentAnalyzer` class from the `pyviralcontent` package. Here's a simple example of how to use the `ContentAnalyzer` to analyze different types of content: 38 | 39 | 40 | ```python 41 | from pyviralcontent import ContentAnalyzer 42 | 43 | def test_content_analysis(content_type, text_content): 44 | """ 45 | Test the content analysis for a given type of content and text content. 46 | 47 | :param content_type: The type of the content (e.g., 'scientific', 'blog', etc.). 48 | :param text_content: The actual content to analyze. 49 | """ 50 | # Create an instance of ContentAnalyzer 51 | analyzer = ContentAnalyzer(text_content, content_type) 52 | # Perform the analysis 53 | df, viral_probability = analyzer.analyze() 54 | # Print the results 55 | print(f"\nReadability Scores Summary for {content_type.capitalize()} Content:") 56 | display(df) 57 | print(f"The probability of the content going viral is: {viral_probability * 100:.2f}%") 58 | 59 | # Example 1: Scientific content 60 | test_content_analysis( 61 | 'scientific', 62 | "Implement an accessibility-first approach in the design of the website. This includes: • High-contrast visuals for low-vision users. • Text-to-speech functionality for all text, including product descriptions and checkout processes.• Easy keyboard navigation for those unable to use a mouse." 63 | ) 64 | 65 | # Example 2: Blog content 66 | test_content_analysis( 67 | 'blog', 68 | "Today's blog post discusses the importance of user experience design. A good design ensures that users find joy and satisfaction in the interaction with the product, making it an essential aspect of product development." 69 | ) 70 | 71 | # Example 3: Technical content 72 | test_content_analysis( 73 | 'technical', 74 | "The module utilizes an advanced algorithm for data processing, ensuring high performance and reliability. It's optimized for multi-threaded environments, offering significant improvements in processing speed and efficiency." 75 | ) 76 | 77 | # Example 4: Fictional content 78 | test_content_analysis( 79 | 'fictional', 80 | "In the distant future, humanity has reached the stars. Each galaxy is a new frontier, and every planet a new adventure. Join our heroes as they navigate through cosmic dangers and discover the mysteries of the universe." 81 | ) 82 | 83 | # Example 5: Legal content 84 | test_content_analysis( 85 | 'legal', 86 | "The contract stipulates the terms and conditions of the agreement and is legally binding to both parties involved. It outlines the responsibilities, duties, and liabilities in clear, unambiguous language to prevent any misunderstandings." 87 | ) 88 | 89 | # Example 6: Educational content 90 | test_content_analysis( 91 | 'educational', 92 | "Today's lesson covers the fundamental principles of physics. We'll explore Newton's laws of motion, the concept of gravity, and the principles of energy and momentum. Each concept will be demonstrated with real-life examples and interactive experiments." 93 | ) 94 | 95 | # Example 7: News content 96 | test_content_analysis( 97 | 'news', 98 | "In today's news, the local community is coming together to support the annual food drive. Last year's drive helped over a thousand families, and this year the organizers hope to double that number with the help of generous donations and volunteer work." 99 | ) 100 | 101 | # Example 8: Advertising content 102 | test_content_analysis( 103 | 'advertising', 104 | "Introducing the latest innovation in home cleaning! Our new vacuum cleaner is equipped with advanced technology to clean your home efficiently and effortlessly. Say goodbye to dust and hello to spotless floors!" 105 | ) 106 | 107 | # Example 9: Social Media content 108 | test_content_analysis( 109 | 'social_media', 110 | "Just finished an amazing workout at the gym! 💪 Feeling energized and ready to take on the day. Remember, a healthy lifestyle is not just a goal, it's a way of living. #FitnessGoals #HealthyLiving" 111 | ) 112 | 113 | # Example 10: Video content 114 | test_content_analysis( 115 | 'video', 116 | "In this video, we'll take a closer look at the intricate ecosystem of the Amazon rainforest. Discover the diverse species that call it home, and learn about the critical role it plays in our planet's climate system." 117 | ) 118 | ``` 119 | ![Sample Image](https://github.com/bhaskatripathi/pyviralcontent/blob/main/Sample.JPG?raw=true) 120 | 121 | 122 | ## Features 123 | 124 | - Multiple readability tests for different content types. 125 | - Qualitative descriptors based on the Likert scale. 126 | - Estimation of content's virality potential. 127 | - Supported content types include: scientific, blog, video, technical, fictional, legal, educational, news, advertising, social_media. 128 | 129 | ## How it Works? 130 | 131 | The `PyViralContent` package offers a sophisticated approach to analyzing textual content by recognizing that no single readability metric is representative fits all content types. This is essential a Multi Criteria Decision Analysis Problem which is solved using Keener's method. Different types of content have unique stylistic and structural characteristics, and the package addresses this by associating specific readability formulas with each content type. This method ensures a nuanced analysis and provides a more accurate reflection of the content's readability and potential virality. 132 | 133 | ### Content Type Formulas 134 | 135 | The package defines `content_type_formulas`, a mapping of content types to the sets of readability formulas that are best suited for those types. Here's the association between content types and their corresponding readability formulas. These formulae have been integrated using Keener's MCDA method. Keener's method computes the eigenvector corresponding to the largest eigenvalue of a certain matrix derived from the pairwise comparisons. This eigenvector provides the weights or ratings of the items being compared, reflecting their relative importance or dominance in the context of the comparison. 136 | 137 | For a detailed explanation of Keener's method and its applications, please refer to the following resource: 138 | [Understanding Keener's Method (PDF)](https://www.dcs.bbk.ac.uk/~ale/dsta+dsat/dsta+dsat-3/lm-ch3-keener.pdf) 139 | 140 | The `PyViralContent` package integrates Keener's method in its analytical engine to enhance the robustness and depth of the content analysis, offering users a sophisticated tool for assessing the potential impact and reach of their content. 141 | 142 | | Content Type | Readability Formulas Used | 143 | |---------------|-----------------------------------------| 144 | | Scientific | Gunning Fog, Coleman Liau, ARI | 145 | | Blog | Flesch Reading Ease, Flesch Kincaid | 146 | | Video | SMOG, Flesch Reading Ease | 147 | | Technical | Linsear Write, ARI | 148 | | Fictional | Flesch Kincaid, Coleman Liau | 149 | | Legal | Gunning Fog, SMOG | 150 | | Educational | Flesch Kincaid, Linsear Write | 151 | | News | Flesch Reading Ease, Gunning Fog | 152 | | Advertising | Flesch Reading Ease, Coleman Liau | 153 | | Social Media | Flesch Reading Ease, Linsear Write | 154 | 155 | ### Interpretation with Likert Scale 156 | 157 | The results from the readability formulas are interpreted using a Likert scale, which provides a qualitative measure of the content's readability. This scale is not one-size-fits-all; it is tailored to each readability formula to accurately reflect the nuances of each metric. Here's how the Likert scale is applied for each readability formula: 158 | 159 | | Readability Formula | Likert Scale Interpretation (Score Range) | Qualitative Descriptor | 160 | |-------------------------|--------------------------------------------|----------------------------| 161 | | Flesch Reading Ease | 90-inf: 5, 70-90: 4, 50-70: 3, 30-50: 2, 0-30: 1 | Very Easy to Very Confusing | 162 | | Flesch Kincaid | <=5: 5, 5-6: 4, 6-7: 3, 7-9: 2, >=9: 1 | Very Easy to Very Confusing | 163 | | Gunning Fog | <=6: 5, 6-8: 4, 8-12: 3, 12-17: 2, >=17: 1 | Very Easy to Very Confusing | 164 | | SMOG | <=6: 5, 6-8: 4, 8-12: 3, 12-14: 2, >=14: 1 | Very Easy to Very Confusing | 165 | | Linsear Write | <=5: 5, 5-8: 4, 8-12: 3, 12-15: 2, >=15: 1 | Very Easy to Very Confusing | 166 | | Coleman Liau | <=5: 5, 5-8: 4, 8-12: 3, 12-15: 2, >=15: 1 | Very Easy to Very Confusing | 167 | | ARI | <=2: 5, 2-4: 4, 4-7: 3, 7-10: 2, >=10: 1 | Very Easy to Very Confusing | 168 | 169 | These ranges and descriptors ensure that the readability score is not just a number, but a meaningful indicator of how the content will likely be received by the intended audience. The `PyViralContent` package provides a detailed output, including both the readability scores from each formula used and the overall virality probability, offering valuable insights into the potential reach and impact of the content analyzed. 170 | 171 | 172 | ## Contributing 173 | 174 | Contributions to `pyviralcontent` are welcome! Please feel free to submit issues, fork the repository, and create pull requests. 175 | 176 | ## License 177 | 178 | This project is licensed under the MIT License - see the LICENSE file for details. 179 | 180 | ## Contact 181 | 182 | Bhaskar Tripathi - bhaskar.tripathi@gmail.com 183 | GitHub: https://github.com/bhaskatripathi/pyviralcontent 184 | --------------------------------------------------------------------------------