├── .github └── ISSUE_TEMPLATE │ ├── bug-report.md │ └── module-request.md ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── PULL_REQUEST_TEMPLATE.md ├── Procfile ├── README.md ├── api.py ├── bricks-test-data-project.zip ├── classifiers ├── README.md ├── __init__.py ├── _template │ └── _template_func │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ └── config.py ├── active_learner │ ├── bayesian_optimization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ ├── config.py │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── atl.py │ │ │ └── util.py │ ├── decision_tree │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── grid_search │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── logistic_regression │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── random_forest │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── random_search │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── communication_style │ └── communication_style_classifier │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── dates_and_times │ └── workday_classifier │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── llm │ ├── bert_sentiment_german │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── deberta_review_classifier │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── distilbert_stock_news_classifier │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── gpt_classifier │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── lookup_lists │ └── lookup_list │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── question_type │ └── question_type_classifier │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── reference_complexity │ ├── chunked_sentence_complexity │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── maximum_sentence_complexity │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── tiktoken_length_classifier │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── reference_quality │ ├── special_character_classifier │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── word_count_classifier │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── reference_relevance │ └── gpt_cross_encoder │ 
│ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── sentiment │ ├── README.md │ ├── textblob_sentiment │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── vader_sentiment_classifier │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── similarity │ └── cosine_similarity │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── spelling │ └── spelling_check │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backkup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── text_analysis │ ├── emotionality_detection │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── language_detection │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── profanity_detection │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── sentence_complexity │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── textblob_subjectivity │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py └── zero_shot │ └── README.md ├── cms.py ├── extractors ├── README.md ├── __init__.py ├── _template │ └── _template_func │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ └── config.py ├── active_learner │ └── crf_tagger │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── codes │ ├── color_code_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── stock_ticker_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ ├── config.py │ │ └── tickers.txt ├── dates_and_times │ ├── date_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── time_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── functions │ ├── aspect_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── gazetteer_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── regex_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── window_search_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ 
│ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── llm │ ├── bert_ner_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── deberta_ner_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── gpt_information_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── media │ └── work_of_art_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── metrics │ └── metric_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── numbers │ ├── bic_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── credit_card_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── digit_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── iban_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── ip_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── isbn_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── percentage_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── phone_number_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── price_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── paths │ ├── filepath_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── url_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── personal_identifiers │ ├── address_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── email_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── location_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── person_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── zipcode_extraction │ │ 
├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ ├── config.py │ │ └── zip_codes.json ├── symbols │ └── hashtag_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── util │ ├── __init__.py │ └── spacy.py ├── words │ ├── difficult_words_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── goodbye_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── keyword_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── noun_match_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── org_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── part_of_speech_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── quote_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── smalltalk_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── substring_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── synonym_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── verb_phrase_extraction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py └── zero_shot │ └── README.md ├── generators ├── README.md ├── __init__.py ├── _template │ └── _template_func │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ └── config.py ├── distance │ ├── euclidean_distance │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── hamming_distance │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── levenshtein_distance │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── manhattan_distance │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── lemmatizer │ └── spacy_lemmatizer │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── llm │ ├── bert_toxicity_detector │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── 
code_snippet_refinery.md │ │ └── config.py │ ├── gpt_grammar_correction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── gpt_tldr_summarization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── ngram │ └── nltk_ngram_generator │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── paths │ ├── domain_parser │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── url_keyword_parser │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── reference_chunking │ ├── newline_splitter │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── noun_splitter │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── search │ ├── bing_news_search │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── bing_search │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── google_search │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── nyt_news_search │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── sentiment │ └── vader_sentiment_scores │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── speech_to_text │ └── azure_speech_to_text │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── spelling │ ├── bing_spelling_correction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── textblob_spelling_correction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── summarization │ ├── smalltalk_truncation │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── sumy_website_summarizer │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── text_summarization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── text_analytics │ ├── most_frequent_words │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── phonetic_soundex │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py 
│ ├── reading_time │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── syllable_count │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── tiktoken_token_counter │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── text_cleaning │ ├── html_cleanser │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── html_unescape │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ ├── config.py │ │ └── config_backup.py ├── translation │ ├── deepl_translator │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── ibm_translator │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ ├── language_translator │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py │ └── microsoft_translator │ │ ├── README.md │ │ ├── __init__.py │ │ ├── code_snippet_backup.md │ │ ├── code_snippet_common.md │ │ ├── code_snippet_refinery.md │ │ └── config.py ├── util │ └── spacy.py └── zero_shot │ └── README.md ├── gunicorn.config.py ├── images ├── fastapi_testing_01.png ├── fastapi_testing_02.png ├── hero.svg ├── identifier.svg └── thumbnail-bricks.png ├── nltk.txt ├── requirements.txt ├── runtime.txt └── util ├── __init__.py ├── configs.py ├── enums.py ├── exceptions.py └── paths.py /.github/ISSUE_TEMPLATE/module-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Module request 3 | about: Suggest a module for bricks 4 | title: "[MODULE] - Your module idea" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Please describe the module you would like to add to bricks** 11 | Either go into detail, or just share a sketch of an idea here 12 | 13 | **Do you already have an implementation?** 14 | If so, please share it here. For instance: 15 | ```python 16 | from typing import Dict, Any 17 | def my_module(record: Dict[str, Any]) -> str: 18 | return record["my-text"].text.lower() 19 | ``` 20 | 21 | **Additional context** 22 | Add any other context or screenshots about the feature request here. 23 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: gunicorn --worker-tmp-dir /dev/shm --config gunicorn.config.py api:api 2 | -------------------------------------------------------------------------------- /bricks-test-data-project.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/bricks-test-data-project.zip -------------------------------------------------------------------------------- /classifiers/README.md: -------------------------------------------------------------------------------- 1 | # Classifiers 2 | Classifiers are modules that summarize a given text into a specific category.
For example, a module that classifies a text into the category `news` or `blog` would go into this folder. It can also be about enrichments, e.g. to detect languages and such. -------------------------------------------------------------------------------- /classifiers/_template/_template_func/README.md: -------------------------------------------------------------------------------- 1 | A brick module should contain a README which describes the use and functionality of a brick. This is also the place where you can provide additional information. You may also include your name here to let others know who contributed this brick! -------------------------------------------------------------------------------- /classifiers/_template/_template_func/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/classifiers/_template/_template_func/__init__.py -------------------------------------------------------------------------------- /classifiers/_template/_template_func/code_snippet_common.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/classifiers/_template/_template_func/code_snippet_common.md -------------------------------------------------------------------------------- /classifiers/active_learner/bayesian_optimization/README.md: -------------------------------------------------------------------------------- 1 | This is a Bayesian-optimised active learner. When the region of interest occupies only a very small hypervolume in a given n-dimensional search space, random search needs a very large number of samples, which can be very expensive. Bayesian optimisation is a sample-efficient alternative for algorithmic assurance. With the use of Gaussian process regression, Bayesian optimisation calculates the best hyperparameters to globally optimise a black-box function. Note that this might take a few minutes to run. 2 | -------------------------------------------------------------------------------- /classifiers/active_learner/bayesian_optimization/__init__.py: -------------------------------------------------------------------------------- 1 | def bayesian_optimization(): 2 | """Bayesian optimisation algorithm to optimise the active learner.""" 3 | pass -------------------------------------------------------------------------------- /classifiers/active_learner/bayesian_optimization/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | "Coming soon!" 
3 | ``` -------------------------------------------------------------------------------- /classifiers/active_learner/bayesian_optimization/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/classifiers/active_learner/bayesian_optimization/util/__init__.py -------------------------------------------------------------------------------- /classifiers/active_learner/bayesian_optimization/util/util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/classifiers/active_learner/bayesian_optimization/util/util.py -------------------------------------------------------------------------------- /classifiers/active_learner/decision_tree/README.md: -------------------------------------------------------------------------------- 1 | A simple decision tree model as the classification head for active transfer learning, using a pre-trained model to generate embeddings. -------------------------------------------------------------------------------- /classifiers/active_learner/decision_tree/__init__.py: -------------------------------------------------------------------------------- 1 | def decision_tree(): 2 | """A decision tree classifier head for active transfer learning.""" 3 | pass -------------------------------------------------------------------------------- /classifiers/active_learner/decision_tree/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | "Coming soon!" 3 | ``` -------------------------------------------------------------------------------- /classifiers/active_learner/decision_tree/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from sklearn.tree import DecisionTreeClassifier 3 | from typing import List 4 | # you can find further models here: https://scikit-learn.org/stable/supervised_learning.html#supervised-learning 5 | 6 | EMBEDDING: str = "text-classification-distilbert-base-uncased" 7 | MIN_CONFIDENCE: float = 0.8 8 | LABELS: List[str] = None # you can specify a list to filter the predictions (e.g. ["label-a", "label-b"]) 9 | 10 | class MyDT(LearningClassifier): 11 | 12 | def __init__(self): 13 | self.model = DecisionTreeClassifier() 14 | 15 | @params_fit( 16 | embedding_name = EMBEDDING, 17 | train_test_split = 0.5 # we have this fixed at the moment, but you'll soon be able to specify this individually! 18 | ) 19 | def fit(self, embeddings, labels): 20 | self.model.fit(embeddings, labels) 21 | 22 | @params_inference( 23 | min_confidence = MIN_CONFIDENCE, 24 | label_names = LABELS 25 | ) 26 | def predict_proba(self, embeddings): 27 | return self.model.predict_proba(embeddings) 28 | ``` -------------------------------------------------------------------------------- /classifiers/active_learner/grid_search/README.md: -------------------------------------------------------------------------------- 1 | `sklearn`-based grid search to train an active learner classification head. The grid search entails testing every unique combination of hyperparameters in the search space to determine the combination that yields the best performance. 
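As a purely illustrative sketch (not part of this brick's code), this is roughly what an exhaustive grid search over a scikit-learn classification head could look like; the toy data and parameter grid below are made-up placeholders:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# toy embeddings and labels standing in for the real ones
X, y = make_classification(n_samples=200, n_features=16, random_state=42)

# every combination in this grid is evaluated with cross-validation
param_grid = {"C": [0.01, 0.1, 1.0, 10.0], "solver": ["lbfgs", "liblinear"]}
search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=3)
search.fit(X, y)

print(search.best_params_, search.best_score_)
```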
-------------------------------------------------------------------------------- /classifiers/active_learner/grid_search/__init__.py: -------------------------------------------------------------------------------- 1 | def grid_search(): 2 | """A grid search classification head for active learning.""" 3 | pass 4 | -------------------------------------------------------------------------------- /classifiers/active_learner/grid_search/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | "Coming soon!" 3 | ``` -------------------------------------------------------------------------------- /classifiers/active_learner/logistic_regression/README.md: -------------------------------------------------------------------------------- 1 | A simple logistic regression model as the classification head for active transfer learning, using a pre-trained model to generate embeddings. -------------------------------------------------------------------------------- /classifiers/active_learner/logistic_regression/__init__.py: -------------------------------------------------------------------------------- 1 | def logistic_regression(): 2 | """A logistic regression classifier head for active transfer learning.""" 3 | pass 4 | -------------------------------------------------------------------------------- /classifiers/active_learner/logistic_regression/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | "Coming soon!" 3 | ``` -------------------------------------------------------------------------------- /classifiers/active_learner/random_forest/README.md: -------------------------------------------------------------------------------- 1 | A random forest model as the classification head for active transfer learning, using a pre-trained model to generate embeddings. -------------------------------------------------------------------------------- /classifiers/active_learner/random_forest/__init__.py: -------------------------------------------------------------------------------- 1 | def random_forest(): 2 | """A random forest classifier head for active transfer learning.""" 3 | pass 4 | -------------------------------------------------------------------------------- /classifiers/active_learner/random_forest/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | "Coming soon!" 3 | ``` -------------------------------------------------------------------------------- /classifiers/active_learner/random_search/README.md: -------------------------------------------------------------------------------- 1 | `sklearn`-based implementation of the Random Search Active Learning algorithm. Instead of testing every combination of hyperparameters in the search space, random search selects and tests random combinations. As a consequence, it is much faster and more robust than the grid search algorithm. 
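To contrast with the grid search above, here is a minimal, hypothetical sketch of randomized sampling over a comparable search space (again with placeholder data and distributions, not code taken from this brick):

```python
from scipy.stats import randint
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# toy embeddings and labels standing in for the real ones
X, y = make_classification(n_samples=200, n_features=16, random_state=42)

# only n_iter random combinations are drawn from these distributions
param_distributions = {"n_estimators": randint(50, 300), "max_depth": randint(2, 12)}
search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions,
    n_iter=10,
    cv=3,
    random_state=42,
)
search.fit(X, y)

print(search.best_params_, search.best_score_)
```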
2 | -------------------------------------------------------------------------------- /classifiers/active_learner/random_search/__init__.py: -------------------------------------------------------------------------------- 1 | def random_search(): 2 | """A random search classification head for active learning.""" 3 | pass 4 | -------------------------------------------------------------------------------- /classifiers/active_learner/random_search/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | "Coming soon!" 3 | ``` -------------------------------------------------------------------------------- /classifiers/communication_style/communication_style_classifier/README.md: -------------------------------------------------------------------------------- 1 | Uses an `intfloat/multilingual-e5-small` model, which was fine-tuned on English and German examples of different communication styles. The model is hosted on Kern AI's own infrastructure and is meant to be used to classify text sequences by the labels `Action-seeking`, `Fact-oriented`, `Information-seeking` or `Self-revealing`. -------------------------------------------------------------------------------- /classifiers/communication_style/communication_style_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | import requests 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "Change the number in row 2 and 3.", 6 | "model_name": "KernAI/multilingual-e5-communication-style", 7 | } 8 | 9 | 10 | class CommunicationStyleClassifierModel(BaseModel): 11 | text: str 12 | model_name: str 13 | 14 | class Config: 15 | schema_extra = {"example": INPUT_EXAMPLE} 16 | 17 | 18 | def communication_style_classifier(req: CommunicationStyleClassifierModel): 19 | """Uses custom E5 model to classify communication style of a text""" 20 | payload = { 21 | "model_name": req.model_name, 22 | "text": req.text 23 | } 24 | response = requests.post("https://free.api.kern.ai/inference", json=payload) 25 | if response.ok: 26 | return {"communication_style": response.json()["label"]} 27 | return response.raise_for_status() -------------------------------------------------------------------------------- /classifiers/communication_style/communication_style_classifier/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | MODEL_NAME: str = "KernAI/multilingual-e5-communication-style" 6 | REQUEST_URL: str = "https://free.api.kern.ai/inference" 7 | 8 | def communication_style_classifier(record): 9 | payload = { 10 | "model_name": MODEL_NAME, 11 | "text": record[ATTRIBUTE].text 12 | } 13 | response = requests.post(REQUEST_URL, json=payload) 14 | if response.ok: 15 | return response.json()["label"] 16 | ``` -------------------------------------------------------------------------------- /classifiers/dates_and_times/workday_classifier/README.md: -------------------------------------------------------------------------------- 1 | Uses the dateutil and holidays libraries to determine if a weekday is a national holiday, a normal workday or a weekend. Currently, this module does not work with multiple dates and will only classify the first date that it finds! Defaults to month-first dates unless the first number is above 12! 
-------------------------------------------------------------------------------- /classifiers/dates_and_times/workday_classifier/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from datetime import datetime 3 | import holidays 4 | import dateutil.parser as dparser 5 | from typing import Optional 6 | 7 | ATTRIBUTE: str = "text" # only text attributes 8 | COUNTRY_ID: str = "US" # optional, takes in an ISO country code, such as US, DE, UK. Leave empty if not required 9 | 10 | def workday_classifier(record): 11 | text = record[ATTRIBUTE].text 12 | 13 | # try to parse a date from the provided string 14 | try: 15 | date = dparser.parse(text, fuzzy=True).date() 16 | except: 17 | return "Found no date, an invalid date or multiple dates." 18 | 19 | # check if country code is specified 20 | if COUNTRY_ID: 21 | national_holidays = holidays.country_holidays(COUNTRY_ID) 22 | if date in national_holidays: 23 | return "Holiday" 24 | 25 | # check if weekday is a workday or a weekend 26 | if date.weekday() < 5: 27 | return "Working day" 28 | else: 29 | return "Weekend" 30 | ``` -------------------------------------------------------------------------------- /classifiers/llm/bert_sentiment_german/README.md: -------------------------------------------------------------------------------- 1 | Uses a BERT model for sentiment classification of German texts. Requires a HuggingFace API key. Visit https://huggingface.co/oliverguhr/german-sentiment-bert for more information. -------------------------------------------------------------------------------- /classifiers/llm/bert_sentiment_german/__init__.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from pydantic import BaseModel 3 | 4 | INPUT_EXAMPLE = { 5 | "apiToken": "", 6 | "text": "Diese Bratwurst schmeckt wirklich lecker!" 7 | } 8 | 9 | class BertSentimentGermanModel(BaseModel): 10 | apiToken: str 11 | text: str 12 | 13 | class Config: 14 | schema_extra = {"example": INPUT_EXAMPLE} 15 | 16 | def bert_sentiment_german(req: BertSentimentGermanModel): 17 | """Sentiment classification for German texts using a BERT model.""" 18 | try: 19 | headers = {"Authorization": f"Bearer {req.apiToken}"} 20 | data = {"inputs": req.text, "options": {"wait_for_model": "true"}} 21 | response = requests.post("https://api-inference.huggingface.co/models/oliverguhr/german-sentiment-bert", headers=headers, json=data) 22 | response_json = response.json() 23 | return {"sentiment": response_json[0][0]["label"]} 24 | except Exception as e: 25 | return f"That didn't work. Did you provide a valid API key? Got error: {e} and message {response_json}" -------------------------------------------------------------------------------- /classifiers/llm/bert_sentiment_german/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | 4 | ATTRIBUTE: str = "text" 5 | API_KEY: str = "" 6 | 7 | def bert_sentiment_german(record): 8 | try: 9 | headers = {"Authorization": f"Bearer {API_KEY}"} 10 | data = {"inputs": record[ATTRIBUTE].text, "options": {"wait_for_model": "true"}} 11 | response = requests.post("https://api-inference.huggingface.co/models/oliverguhr/german-sentiment-bert", headers=headers, json=data) 12 | response_json = response.json() 13 | return response_json[0][0]["label"] 14 | except Exception as e: 15 | return f"That didn't work. 
Did you provide a valid API key? Got error: {e} and message {response_json}" 16 | ``` -------------------------------------------------------------------------------- /classifiers/llm/deberta_review_classifier/README.md: -------------------------------------------------------------------------------- 1 | Uses a DeBERTa model via the HuggingFace Inference API to classify texts by their sentiment. Requires a HuggingFace API key. The model was fine-tuned with a million reviews from Amazon and therefore performs best on customer reviews. The model was built by our Discord user positive-. You can also find the model on HuggingFace here: https://huggingface.co/RashidNLP/Amazon-Deberta-Base-Sentiment?text=I+like+you.+I+love+you -------------------------------------------------------------------------------- /classifiers/llm/deberta_review_classifier/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | 4 | API_KEY: str = "" 5 | ATTRIBUTE: str = "text" # only text attributes 6 | 7 | def deberta_review_classifier(record): 8 | inputs = record[ATTRIBUTE].text 9 | headers = {"Authorization": f"Bearer {API_KEY}"} 10 | response = requests.post("https://api-inference.huggingface.co/models/RashidNLP/Amazon-Deberta-Base-Sentiment", headers=headers, json={"inputs": inputs}) 11 | json_response = response.json() 12 | while not isinstance(json_response, dict): 13 | json_response = json_response[0] 14 | if "label" not in json_response: 15 | return f"This didn't work, got: {json_response}" 16 | else: 17 | json_response = json_response["label"] 18 | return json_response 19 | ``` -------------------------------------------------------------------------------- /classifiers/llm/distilbert_stock_news_classifier/README.md: -------------------------------------------------------------------------------- 1 | This DistilBERT model was fine-tuned on 50,000 stock news articles using the HuggingFace adapter from Kern AI refinery. Each article consisted of the headline plus the abstract. For fine-tuning, a single NVIDIA K80 was used for about four hours. Requires a HuggingFace API token to use this brick. 2 | 3 | Join our Discord if you have questions about this model: https://discord.gg/MdZyqSxKbe -------------------------------------------------------------------------------- /classifiers/llm/distilbert_stock_news_classifier/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | import json 4 | 5 | API_KEY: str = "" 6 | ATTRIBUTE: str = "text" # only text attributes 7 | 8 | def distilbert_stock_news_classifier(record): 9 | api_token = API_KEY 10 | inputs = record[ATTRIBUTE].text 11 | headers = {"Authorization": f"Bearer {api_token}"} 12 | response = requests.post("https://api-inference.huggingface.co/models/KernAI/stock-news-destilbert", headers=headers, json={"inputs": inputs}) 13 | json_response = response.json() 14 | result = [{item["label"]: item["score"] for item in entry} for entry in json_response] 15 | return str(list(result[0].keys())[0]) 16 | ``` -------------------------------------------------------------------------------- /classifiers/llm/gpt_classifier/README.md: -------------------------------------------------------------------------------- 1 | Uses the `GPT-3.5-turbo` model from OpenAI to classify text prompts. The desired task as well as the text input itself are inserted via a single prompt into GPT. 
An API key can be obtained directly from OpenAI. Contact us at info@kern.ai if you require an API key or need any support from us. 2 | 3 | Check out OpenAI for example: https://beta.openai.com/examples/default-classification -------------------------------------------------------------------------------- /classifiers/lookup_lists/lookup_list/README.md: -------------------------------------------------------------------------------- 1 | Looks up whether values of a given list are in a text, and if so, returns the desired label. -------------------------------------------------------------------------------- /classifiers/lookup_lists/lookup_list/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "The mail was sent from joe@example.ai, please contact him for further information.", 6 | "lookupValues": ["joe@example.ai", "ava@example.ai"], 7 | "yourLabel": "in lookup", 8 | } 9 | 10 | 11 | class LookupListModel(BaseModel): 12 | text: str 13 | lookupValues: List[str] 14 | yourLabel: str 15 | 16 | class Config: 17 | schema_extra = {"example": INPUT_EXAMPLE} 18 | 19 | 20 | def lookup_list(request: LookupListModel): 21 | """Checks if a given text contains any of the given lookup values.""" 22 | 23 | text = request.text 24 | lookupValues = request.lookupValues 25 | yourLabel = request.yourLabel 26 | 27 | for lookupValue in lookupValues: 28 | if lookupValue.lower() in text.lower(): 29 | return {yourLabel: True} 30 | -------------------------------------------------------------------------------- /classifiers/lookup_lists/lookup_list/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | # replace this list with a list containing your data 3 | texts = ["Please contact john@kern.ai to get more info.", "This is a negative text."] 4 | lookup_values = ["john@kern.ai", "jane@kern.ai"] 5 | 6 | # add the texts to a dict called records. 
Add further information as key-value pairs if needed 7 | record = { 8 | "text": texts, 9 | "lookup_list": lookup_values, 10 | "label": "mail", 11 | } 12 | 13 | def lookup_list(record): 14 | labels = [] 15 | for item in record["lookup_list"]: 16 | for text_id, entry in enumerate(record["text"]): 17 | if item.lower() in entry.lower(): 18 | labels.append([record["label"], text_id]) 19 | 20 | return {"lookup_labels": labels} 21 | ``` -------------------------------------------------------------------------------- /classifiers/lookup_lists/lookup_list/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import knowledge 3 | from typing import List 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | 7 | LOOKUP_LISTS: List[str] = [knowledge.my_lookup_list] # either lookup lists or lookup values or both 8 | LOOKUP_VALUES: List[str] = ["john@kern.ai", "jane@kern.ai"] 9 | LABEL: str = "in lookup" 10 | 11 | 12 | final_list = [] 13 | if LOOKUP_LISTS: 14 | for lookup_list in LOOKUP_LISTS: 15 | final_list += lookup_list 16 | if LOOKUP_VALUES: 17 | final_list += LOOKUP_VALUES 18 | 19 | def lkp_known_sender(record): 20 | for known_sender in final_list: 21 | if known_sender.lower() in record[ATTRIBUTE].text.lower(): # SpaCy document, hence we need to call .text to get the string 22 | return LABEL 23 | ``` -------------------------------------------------------------------------------- /classifiers/question_type/question_type_classifier/README.md: -------------------------------------------------------------------------------- 1 | Uses an `intfloat/multilingual-e5-small` model, which was fine-tuned on English and German examples of different question types. The model is hosted on Kern AI's own infrastructure and is meant to be used to classify text sequences by the labels `Keyword-question`, `Statement-question` or `Interrogative-question`. 
-------------------------------------------------------------------------------- /classifiers/question_type/question_type_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | import requests 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "Sushi restaurants Barcelona", 6 | "model_name": "KernAI/multilingual-e5-question-type", 7 | } 8 | 9 | 10 | class QuestionTypeClassifierModel(BaseModel): 11 | text: str 12 | model_name: str 13 | 14 | class Config: 15 | schema_extra = {"example": INPUT_EXAMPLE} 16 | 17 | 18 | def question_type_classifier(req: QuestionTypeClassifierModel): 19 | """Uses custom E5 model to classify the question type of a text""" 20 | payload = { 21 | "model_name": req.model_name, 22 | "text": req.text 23 | } 24 | response = requests.post("https://free.api.kern.ai/inference", json=payload) 25 | if response.ok: 26 | return {"question_type": response.json()["label"]} 27 | return response.raise_for_status() -------------------------------------------------------------------------------- /classifiers/question_type/question_type_classifier/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | MODEL_NAME: str = "KernAI/multilingual-e5-question-type" 6 | REQUEST_URL: str = "https://free.api.kern.ai/inference" 7 | 8 | def question_type_classifier(record): 9 | payload = { 10 | "model_name": MODEL_NAME, 11 | "text": record[ATTRIBUTE].text 12 | } 13 | response = requests.post(REQUEST_URL, json=payload) 14 | if response.ok: 15 | return response.json()["label"] 16 | ``` -------------------------------------------------------------------------------- /classifiers/reference_complexity/chunked_sentence_complexity/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import textstat 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | TARGET_LANGUAGE: str = "en" # iso codes 6 | 7 | def chunked_sentence_complexity(record): 8 | complexities = [textstat.flesch_reading_ease(sent.text) for sent in record[ATTRIBUTE].sents] 9 | 10 | avg = int(round(sum(complexities) / len(complexities))) 11 | return get_mapping_complexity(avg) 12 | 13 | def get_mapping_complexity(score): 14 | if score < 30: 15 | return "very difficult" 16 | if score < 50: 17 | return "difficult" 18 | if score < 60: 19 | return "fairly difficult" 20 | if score < 70: 21 | return "standard" 22 | if score < 80: 23 | return "fairly easy" 24 | if score < 90: 25 | return "easy" 26 | return "very easy" 27 | 28 | if TARGET_LANGUAGE is not None: 29 | textstat.set_lang(TARGET_LANGUAGE) 30 | ``` -------------------------------------------------------------------------------- /classifiers/reference_complexity/maximum_sentence_complexity/README.md: -------------------------------------------------------------------------------- 1 | This is similar to the standard sentence complexity brick, with the difference that this brick returns the highest sentence complexity found in a text. The formula for calculating the complexity is as follows: Flesch Reading Ease = 206.835 – (1.015 x Average Sentence Length) – (84.6 x Average Syllables Per Word). The higher the score, the easier the content is to read and understand. 
Average sentence length can be calculated by dividing the number of words by the number of sentences 2 | The score is categorized, where 0 is the most difficult and 122 is the easiest. The coefficients used in the formula were chosen to match a scale where a very easy text has a score of 100 and a really difficult one has a score of 2. The coefficients were determined through trial and error to best fit the data available at the time. -------------------------------------------------------------------------------- /classifiers/reference_complexity/maximum_sentence_complexity/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import textstat 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | TARGET_LANGUAGE: str = "en" # iso codes 6 | 7 | if TARGET_LANGUAGE is not None: 8 | textstat.set_lang(TARGET_LANGUAGE) 9 | 10 | def maximum_sentence_complexity(record): 11 | complexities = [textstat.flesch_reading_ease(sent.text) for sent in record[ATTRIBUTE].sents] 12 | return get_mapping_complexity(min(complexities)) 13 | 14 | def get_mapping_complexity(score): 15 | if score < 30: 16 | return "very difficult" 17 | if score < 50: 18 | return "difficult" 19 | if score < 60: 20 | return "fairly difficult" 21 | if score < 70: 22 | return "standard" 23 | if score < 80: 24 | return "fairly easy" 25 | if score < 90: 26 | return "easy" 27 | return "very easy" 28 | ``` -------------------------------------------------------------------------------- /classifiers/reference_complexity/tiktoken_length_classifier/README.md: -------------------------------------------------------------------------------- 1 | Uses OpenAI's tiktoken tokenizer library to classify a text based on the amount of tokens in a text. The tokenizer is used for the GPT models and converts words into integers. The conversion is reversible and lossless, meaning that a tokenized sentence can be converted back. This brick counts the length of a tokenized text and classifies it as "short", "medium" or "long". 
-------------------------------------------------------------------------------- /classifiers/reference_complexity/tiktoken_length_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | import tiktoken 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "The sun is shining bright today.", 6 | "encoding_model": "cl100k_base" 7 | } 8 | 9 | 10 | class TiktokenLengthClassifierModel(BaseModel): 11 | text: str 12 | encoding_model: str = "cl100k_base" 13 | 14 | class Config: 15 | schema_extra = {"example": INPUT_EXAMPLE} 16 | 17 | 18 | def tiktoken_length_classifier(req: TiktokenLengthClassifierModel): 19 | """Uses the Tiktoken library to count tokens in a string""" 20 | encoding = tiktoken.get_encoding(req.encoding_model) 21 | tokens = encoding.encode(req.text) 22 | num_tokens = len(tokens) 23 | 24 | if num_tokens < 128: 25 | return {"token_length": "Short"} 26 | elif num_tokens < 1024: 27 | return {"token_length": "Medium"} 28 | else: 29 | return{"token_length": "Long"} -------------------------------------------------------------------------------- /classifiers/reference_complexity/tiktoken_length_classifier/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import tiktoken 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | ENCODING_MODEL: str = "cl100k_base" 6 | 7 | encoding = tiktoken.get_encoding(ENCODING_MODEL) 8 | 9 | def tiktoken_length_classifier(record): 10 | tokens = encoding.encode(record[ATTRIBUTE].text) 11 | num_tokens = len(tokens) 12 | 13 | if num_tokens < 128: 14 | return "Short" 15 | elif num_tokens < 1024: 16 | return "Medium" 17 | else: 18 | return "Long" 19 | ``` -------------------------------------------------------------------------------- /classifiers/reference_quality/special_character_classifier/README.md: -------------------------------------------------------------------------------- 1 | The purpose of this brick is to identify if there are any unusual characters in the given text. This function can be useful for text preprocessing tasks, especially for checking reference material in RAG (Retrieval Augmented Generation) use cases where you want to filter out text that contains unusual or unexpected characters. 
-------------------------------------------------------------------------------- /classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import unicodedata 3 | from typing import List 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | LABEL: str = "has_special_character" 7 | ALLOWED_RANGE: List[int] = None # list of integers that represent Unicode code points 8 | 9 | def special_character_classifier(record): 10 | text = record[ATTRIBUTE].text 11 | 12 | allowed = ALLOWED_RANGE 13 | if not allowed: 14 | allowed = default_allowed_values 15 | for char in text: 16 | if ord(char) not in allowed and unicodedata.category(char) != "Zs": 17 | return LABEL 18 | 19 | default_allowed_values = set(range(32, 127)).union( # Basic Latin 20 | set(range(160, 255)), # Latin-1 Supplement 21 | set(range(256, 384)), # Latin Extended-A 22 | set(range(384, 592)), # Latin Extended-B 23 | set(range(8192, 8303)), # General Punctuation 24 | set(range(8352, 8399)), # Currency Symbols 25 | set([ord("\t"), ord("\n"), ord("\r")]) # common stop chars 26 | ) 27 | ``` -------------------------------------------------------------------------------- /classifiers/reference_quality/word_count_classifier/README.md: -------------------------------------------------------------------------------- 1 | This simple brick counts the number of words in a string by splitting it. If there are fewer than five words, it returns "short". If there are fewer than 20, it returns "medium" 2 | and with 20 or more, it returns "long". This brick can be used to check the quality of references for RAG (Retrieval Augmented Generation) use cases. -------------------------------------------------------------------------------- /classifiers/reference_quality/word_count_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | INPUT_EXAMPLE = {"text": "This is too short!"} 4 | 5 | 6 | class WordCountClassifierModel(BaseModel): 7 | text: str 8 | 9 | class Config: 10 | schema_extra = {"example": INPUT_EXAMPLE} 11 | 12 | 13 | def word_count_classifier(req: WordCountClassifierModel): 14 | """Checks the length of a string by counting the number of words in it""" 15 | words = req.text.split() 16 | length = len(words) 17 | if length < 5: 18 | return {"text_length": "short"} 19 | elif length < 20: 20 | return {"text_length": "medium"} 21 | else: 22 | return {"text_length": "long"} 23 | -------------------------------------------------------------------------------- /classifiers/reference_quality/word_count_classifier/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | def word_count_classifier(text: str) -> str: 3 | """ 4 | @param text: text to check the length of. 5 | @return: either 'short', 'medium' or 'long' depending on the counted words. 6 | """ 7 | words = text.split() 8 | length = len(words) 9 | if length < 5: 10 | return "short" 11 | elif length < 20: 12 | return "medium" 13 | else: 14 | return "long" 15 | 16 | # ↑ necessary bricks function 17 | # ----------------------------------------------------------------------------------------- 18 | # ↓ example implementation 19 | 20 | def example_integration(): 21 | texts = ["This is short.", "This is a text with medium length.", "This is a longer text with many more words. 
There is even a second sentence with extra words. Splendid, what a joyful day!"] 22 | for text in texts: 23 | print(f"\"{text}\" is -> {word_count_classifier(text)}") 24 | 25 | example_integration() 26 | ``` -------------------------------------------------------------------------------- /classifiers/reference_quality/word_count_classifier/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | ATTRIBUTE: str = "text" # only text attributes 3 | 4 | def word_count_classifier(record): 5 | words = record[ATTRIBUTE].text.split() 6 | length = len(words) 7 | if length < 5: 8 | return "short" 9 | elif length < 20: 10 | return "medium" 11 | else: 12 | return "long" 13 | ``` -------------------------------------------------------------------------------- /classifiers/reference_relevance/gpt_cross_encoder/README.md: -------------------------------------------------------------------------------- 1 | Uses OpenAI's GPT models as a cross encoder for relevancy classification. In a nutshell, both a user question and a retrieved reference are passed to the GPT model with the instruction to rate the relevancy of the reference on a scale from 0 to 100. If the score is higher than 50, the reference is classified as relevant. This brick is also compatible with the Azure OpenAI service. Visit https://platform.openai.com/docs/models for full documentation of the GPT models. 2 | -------------------------------------------------------------------------------- /classifiers/sentiment/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/classifiers/sentiment/README.md -------------------------------------------------------------------------------- /classifiers/sentiment/textblob_sentiment/README.md: -------------------------------------------------------------------------------- 1 | Calculates the sentiment of a given text in English using TextBlob. -------------------------------------------------------------------------------- /classifiers/sentiment/textblob_sentiment/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from textblob import TextBlob 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | 6 | def textblob_sentiment(record): 7 | blob = TextBlob(record[ATTRIBUTE].text) # SpaCy document, hence we need to call .text to get the string 8 | return lookup_label(blob.sentiment.polarity) 9 | 10 | def lookup_label(score: float) -> str: 11 | if score < -.6: 12 | return "very negative" 13 | if score < -.2: 14 | return "negative" 15 | if score < .2: 16 | return "neutral" 17 | if score < .6: 18 | return "positive" 19 | return "very positive" 20 | ``` -------------------------------------------------------------------------------- /classifiers/sentiment/vader_sentiment_classifier/README.md: -------------------------------------------------------------------------------- 1 | VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool. It is specifically tuned to get the sentiment of social media posts, but works on other text types as well. This version returns a string with either "positive", "neutral" or "negative". See the generators section for "vader sentiment scores" to get a brick that only returns the scores. 
Learn more here: https://github.com/cjhutto/vaderSentiment -------------------------------------------------------------------------------- /classifiers/sentiment/vader_sentiment_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 3 | 4 | INPUT_EXAMPLE = {"text": "World peace announced by the United Nations."} 5 | 6 | 7 | class VaderSentimentClassifierModel(BaseModel): 8 | text: str 9 | 10 | class Config: 11 | schema_extra = {"example": INPUT_EXAMPLE} 12 | 13 | 14 | def vader_sentiment_classifier(req): 15 | """Get the sentiment of a text using the VADER algorithm.""" 16 | analyzer = SentimentIntensityAnalyzer() 17 | text = req.text 18 | 19 | vs = analyzer.polarity_scores(text) 20 | if vs["compound"] >= 0.05: 21 | return {"sentiment": "positive"} 22 | elif vs["compound"] > -0.05: 23 | return {"sentiment": "neutral"} 24 | elif vs["compound"] <= -0.05: 25 | return {"sentiment": "negative"} 26 | -------------------------------------------------------------------------------- /classifiers/sentiment/vader_sentiment_classifier/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 3 | 4 | # replace this list with a list containing your data 5 | text = ["I hate this.", "Meh it's ok.", "I love this!"] 6 | 7 | # add the texts to a dict called records. Add further information as key-value pairs if needed 8 | record = { 9 | "your_text": text, 10 | } 11 | 12 | def vader_sentiment_classifier(record: dict) -> dict: 13 | analyzer = SentimentIntensityAnalyzer() 14 | 15 | sentiment = [] 16 | for entry in record["your_text"]: 17 | vs = analyzer.polarity_scores(entry) 18 | if vs["compound"] >= 0.05: 19 | sentiment.append("positive") 20 | elif vs["compound"] > -0.05: 21 | sentiment.append("neutral") 22 | elif vs["compound"] <= -0.05: 23 | sentiment.append("negative") 24 | return {"sentiments": sentiment} 25 | ``` -------------------------------------------------------------------------------- /classifiers/sentiment/vader_sentiment_classifier/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | # expects labeling task to have labels ["positive" ,"neutral", "negative"] 3 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | 7 | def vader_sentiment_classifier(record): 8 | analyzer = SentimentIntensityAnalyzer() 9 | text = record[ATTRIBUTE].text 10 | 11 | vs = analyzer.polarity_scores(text) 12 | if vs["compound"] >= 0.05: 13 | return "positive" 14 | elif vs["compound"] > -0.05: 15 | return "neutral" 16 | elif vs["compound"] <= -0.05: 17 | return "negative" 18 | ``` -------------------------------------------------------------------------------- /classifiers/similarity/cosine_similarity/README.md: -------------------------------------------------------------------------------- 1 | Cosine similarity is a metric to measure the similarity between two entities (texts in this case). A word is represented into a vector form, while the text documents are represented in n-dimentional vector space. The cosine similarity always ranges between 0 and 1. 
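As an illustration of the metric described above, here is a minimal stand-alone sketch; it assumes scikit-learn is available and uses plain term counts as the vector representation (the function name and example texts are illustrative only, not part of the brick):

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_example(text_a: str, text_b: str) -> float:
    # represent both texts as term-count vectors in a shared n-dimensional space
    vectors = CountVectorizer().fit_transform([text_a, text_b])
    # with non-negative count vectors, the cosine similarity lies between 0 and 1
    return float(cosine_similarity(vectors[0], vectors[1])[0][0])

print(cosine_similarity_example("The weather is nice today.", "Today the weather is really nice."))
```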
-------------------------------------------------------------------------------- /classifiers/spelling/spelling_check/README.md: -------------------------------------------------------------------------------- 1 | This module checks for spelling errors in a given string. The function returns the number of words that contain spelling mistakes. For example, `"The wether is good toda. I might go for cyclng."` would return `3` as the output. -------------------------------------------------------------------------------- /classifiers/spelling/spelling_check/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from nltk.corpus import words, brown 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL_MISTAKES: str = "contains mistakes" 6 | LABEL_CORRECT: str = "no mistakes" 7 | 8 | words_corpus = words.words() 9 | brown_corpus = brown.words() 10 | word_list = set(words_corpus + brown_corpus) 11 | 12 | def spelling_check(record): 13 | text = record[ATTRIBUTE].text # SpaCy document, hence we need to call .text to get the string 14 | 15 | text_lower = text.replace(',', '').replace('.', '').lower().split() 16 | text_original = text.replace(',', '').replace('.', '').split() 17 | 18 | for i, _ in enumerate(text_lower): 19 | if text_lower[i] not in word_list and text_original[i] not in word_list: 20 | return LABEL_MISTAKES 21 | 22 | return LABEL_CORRECT 23 | ``` -------------------------------------------------------------------------------- /classifiers/text_analysis/emotionality_detection/README.md: -------------------------------------------------------------------------------- 1 | This module makes use of the `LeXmo` package to detect the emotionality of a text. It tokenizes the input text, sums up the number of words associated with each emotion, and returns a dictionary with emotionality scores. This module works best on longer texts. 2 | 3 | Possible emotions to be returned are: [anger, fear, anticipation, trust, surprise, sadness, joy, disgust] -------------------------------------------------------------------------------- /classifiers/text_analysis/emotionality_detection/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | # expects labeling task to have labels ["anger", "fear", "anticipation", "trust", "surprise", "sadness", "joy", "disgust"] 3 | from LeXmo import LeXmo 4 | 5 | # replace this list with a list containing your data 6 | text = ["I really don't want to do it I am scared.", "Oh my goodness it was the best evening ever, hype!"] 7 | 8 | # add the texts to a dict called records.
Add further information as key-value pairs if needed 9 | record = { 10 | "your_text": text, 11 | } 12 | 13 | def emotionality_detection(record): 14 | emotions = [] 15 | for entry in record["your_text"]: 16 | emo = LeXmo.LeXmo(entry) 17 | emo.pop("text", None) 18 | emo = max(emo, key=emo.get) 19 | emotions.append(emo) 20 | return {"emotions": emotions} 21 | ``` -------------------------------------------------------------------------------- /classifiers/text_analysis/emotionality_detection/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from LeXmo import LeXmo 3 | 4 | def emotionality_detection(text:str) -> str: 5 | """ 6 | @param text: text to check 7 | @return: either 'anger', 'fear', 'anticipation', 'trust', 'surprise', 'sadness', 'joy' or 'disgust' depending on the score 8 | """ 9 | emo = LeXmo.LeXmo(text) 10 | del emo["text"] 11 | del emo["positive"] 12 | del emo["negative"] 13 | emo = max(emo, key=emo.get) 14 | return emo 15 | 16 | # ↑ necessary bricks function 17 | # ----------------------------------------------------------------------------------------- 18 | # ↓ example implementation 19 | 20 | def example_integration(): 21 | texts = ["I am scared to continue.", "Oh my goodness it was the best evening ever, hype!"] 22 | for text in texts: 23 | print(f"\"{text}\" has emotion: {emotionality_detection(text)}") 24 | 25 | example_integration() 26 | ``` -------------------------------------------------------------------------------- /classifiers/text_analysis/emotionality_detection/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | #expects labeling task to have labels ["anger", "fear", "anticipation", "trust", "surprise", "sadness", "joy", "disgust"] 3 | from LeXmo import LeXmo 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | 7 | def emotionality_detection(record): 8 | text = record[ATTRIBUTE].text # SpaCy document, hence we need to call .text to get the string 9 | emo = LeXmo.LeXmo(text) 10 | del emo["text"] 11 | del emo["positive"] 12 | del emo["negative"] 13 | emo = max(emo, key=emo.get) 14 | 15 | return emo 16 | ``` -------------------------------------------------------------------------------- /classifiers/text_analysis/language_detection/README.md: -------------------------------------------------------------------------------- 1 | Detects the languages of 55 languages out of the box (ISO 639-1 codes): 2 | af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he, 3 | hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl, 4 | pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw -------------------------------------------------------------------------------- /classifiers/text_analysis/language_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from langdetect import detect, DetectorFactory, LangDetectException 3 | 4 | DetectorFactory.seed = 0 5 | 6 | INPUT_EXAMPLE = {"text": "This is an english sentence."} 7 | 8 | 9 | class LanguageDetectionModel(BaseModel): 10 | text: str 11 | 12 | class Config: 13 | schema_extra = {"example": INPUT_EXAMPLE} 14 | 15 | 16 | def language_detection(request: LanguageDetectionModel): 17 | """Detects the language of a given text.""" 18 | 19 | text = request.text 20 | if not text or not text.strip(): 21 | return {"language": "unknown"} 22 | return 
{"language": detect(text)} 23 | -------------------------------------------------------------------------------- /classifiers/text_analysis/language_detection/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from langdetect import detect 3 | 4 | # replace this list with a list containing your data 5 | text = ["This is an english sentence.", "Dies ist ein Text in Deutsch."] 6 | 7 | # add the texts to a dict called records. Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | } 11 | 12 | def language_detection(record): 13 | detected_languages = [] 14 | for entry in record["text"]: 15 | language = detect(entry) 16 | detected_languages.append(language) 17 | return {"detected_languages": detected_languages} 18 | ``` -------------------------------------------------------------------------------- /classifiers/text_analysis/language_detection/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from langdetect import detect 3 | 4 | def language_detection(text:str)->str: 5 | """ 6 | @param text: text to check 7 | @return: language iso code. Full list here https://github.com/Mimino666/langdetect#languages 8 | """ 9 | if not text or not text.strip(): 10 | return "unknown" 11 | return detect(text) 12 | 13 | # ↑ necessary bricks function 14 | # ----------------------------------------------------------------------------------------- 15 | # ↓ example implementation 16 | 17 | def example_integration(): 18 | texts = ["This is an english sentence.", "Dies ist ein Text in Deutsch."] 19 | for text in texts: 20 | print(f"\"{text}\" is written in {language_detection(text)}") 21 | 22 | example_integration() 23 | ``` -------------------------------------------------------------------------------- /classifiers/text_analysis/language_detection/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from langdetect import detect 3 | 4 | ATTRIBUTE: str = "text" #only text attributes 5 | 6 | def language_detection(record): 7 | text = record[ATTRIBUTE].text # SpaCy document, hence we need to call .text to get the string 8 | if not text or not text.strip(): 9 | return "unknown" 10 | return detect(text) 11 | ``` -------------------------------------------------------------------------------- /classifiers/text_analysis/profanity_detection/README.md: -------------------------------------------------------------------------------- 1 | This module checks if the text contains profanity words. It uses a list of profanity words to detect if the text contains any of them. The list of profanity words can be found in [here](https://github.com/snguyenthanh/better_profanity/blob/master/better_profanity/profanity_wordlist.txt). This module will not work in refinery for versions <= 1.6. 
-------------------------------------------------------------------------------- /classifiers/text_analysis/profanity_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from better_profanity import profanity 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "You suck man!", 6 | } 7 | 8 | 9 | class ProfanityDetectionModel(BaseModel): 10 | text: str 11 | 12 | class Config: 13 | schema_extra = {"example": INPUT_EXAMPLE} 14 | 15 | 16 | def profanity_detection(request: ProfanityDetectionModel): 17 | """Detects if a given text contains abusive language.""" 18 | 19 | text = request.text 20 | result = profanity.contains_profanity(text) 21 | 22 | return {"profanity": result} 23 | -------------------------------------------------------------------------------- /classifiers/text_analysis/profanity_detection/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from better_profanity import profanity 3 | 4 | # replace this list with a list containing your data 5 | text = ["You suck man!", "Thanks have a nice day."] 6 | 7 | # add the texts to a dict called records. Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | "label_profane": "profane", 11 | "label_not_profane": "not_profane", 12 | } 13 | 14 | def profanity_detection(record): 15 | detected_profanity = [] 16 | for entry in record["text"]: 17 | if profanity.contains_profanity(entry): 18 | detected_profanity.append(record["label_profane"]) 19 | else: 20 | detected_profanity.append(record["label_not_profane"]) 21 | return {"profanity": detected_profanity} 22 | ``` -------------------------------------------------------------------------------- /classifiers/text_analysis/profanity_detection/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from better_profanity import profanity 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL_PROFANE: str = "profane" 6 | LABEL_NOT_PROFANE: str = "not_profane" 7 | 8 | def profanity_detection(record): 9 | # SpaCy document, hence we need to call .text on our record to get the string 10 | 11 | if profanity.contains_profanity(record[ATTRIBUTE].text): 12 | return LABEL_PROFANE 13 | return LABEL_NOT_PROFANE 14 | ``` -------------------------------------------------------------------------------- /classifiers/text_analysis/sentence_complexity/README.md: -------------------------------------------------------------------------------- 1 | Computes the complexity of a sentence using the Flesch reading ease formula. The formula is as follows: Flesch Reading Ease = 206.835 - (1.015 x Average Sentence Length) - (84.6 x Average Syllables Per Word). The higher the score, the easier the content is to read and understand. Average sentence length can be calculated by dividing the number of words by the number of sentences. 2 | The score is categorized, where 0 is the most difficult and 122 is the easiest. The coefficients used in the formula were chosen to match a scale where a very easy text has a score of 100 and a really difficult one has a score of 0. The coefficients were determined through trial and error to best fit the data available at the time. It can be used to compare the complexity of different texts.
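To make the formula above concrete, here is a minimal worked sketch (the averages are made-up illustration values, not taken from the module):

```python
def flesch_reading_ease(avg_sentence_length: float, avg_syllables_per_word: float) -> float:
    # Flesch Reading Ease = 206.835 - (1.015 x ASL) - (84.6 x ASW)
    return 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)

# assumed sample values: 15 words per sentence, 1.5 syllables per word
score = flesch_reading_ease(15, 1.5)
print(score)  # 206.835 - 15.225 - 126.9 = 64.71, which the categorization below maps to "standard"
```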
-------------------------------------------------------------------------------- /classifiers/text_analysis/sentence_complexity/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import textstat 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | TARGET_LANGUAGE: str = "en" # iso codes 6 | 7 | def sentence_complexity(record): 8 | text = record[ATTRIBUTE].text # SpaCy document, hence we need to call .text to get the string 9 | sentence_complexity_score = textstat.flesch_reading_ease(text) 10 | return lookup_label(sentence_complexity_score) 11 | 12 | def lookup_label(score:int) -> str: 13 | if score < 30: 14 | return "very difficult" 15 | if score < 50: 16 | return "difficult" 17 | if score < 60: 18 | return "fairly difficult" 19 | if score < 70: 20 | return "standard" 21 | if score < 80: 22 | return "fairly easy" 23 | if score < 90: 24 | return "easy" 25 | return "very easy" 26 | 27 | if TARGET_LANGUAGE is not None: 28 | textstat.set_lang(TARGET_LANGUAGE) 29 | 30 | ``` -------------------------------------------------------------------------------- /classifiers/text_analysis/textblob_subjectivity/README.md: -------------------------------------------------------------------------------- 1 | Calculates the subjectivity of a given text in English using Textblob. -------------------------------------------------------------------------------- /classifiers/text_analysis/textblob_subjectivity/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from textblob import TextBlob 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | 6 | def textblob_subjectivity(record): 7 | blob = TextBlob(record[ATTRIBUTE].text) # SpaCy document, hence we need to call .text to get the string 8 | return lookup_label(blob.sentiment.subjectivity) 9 | 10 | def lookup_label(score:float) -> str: 11 | if score < .2: 12 | return "objective" 13 | if score < .4: 14 | return "rather objective" 15 | if score < .6: 16 | return "neutral" 17 | if score < .8: 18 | return "rather subjective" 19 | return "subjective" 20 | 21 | ``` -------------------------------------------------------------------------------- /classifiers/zero_shot/README.md: -------------------------------------------------------------------------------- 1 | We don't support zero-shot configurations as code in refinery yet, but as soon as we do so, you can find zero-shot configurations in here. -------------------------------------------------------------------------------- /extractors/README.md: -------------------------------------------------------------------------------- 1 | # Extractors 2 | Extractors are modules that retrieve specific information from a given text. For example, a module that extracts the author of a text would go into this folder. 3 | -------------------------------------------------------------------------------- /extractors/_template/_template_func/README.md: -------------------------------------------------------------------------------- 1 | A brick module should contain a README which describes the use and functionality of a brick. This is also the place where you can provide additional information. You may also include your name here let others know who contributed this brick! 
-------------------------------------------------------------------------------- /extractors/_template/_template_func/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/extractors/_template/_template_func/__init__.py -------------------------------------------------------------------------------- /extractors/_template/_template_func/code_snippet_common.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/extractors/_template/_template_func/code_snippet_common.md -------------------------------------------------------------------------------- /extractors/active_learner/crf_tagger/README.md: -------------------------------------------------------------------------------- 1 | A conditional random fields tagger to detect spans during active transfer learning. Requires token-level embeddings, which can be generated in refinery. -------------------------------------------------------------------------------- /extractors/active_learner/crf_tagger/__init__.py: -------------------------------------------------------------------------------- 1 | def crf_tagger(): 2 | """Conditional Random Fields tagger.""" 3 | -------------------------------------------------------------------------------- /extractors/active_learner/crf_tagger/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | "Coming soon!" 3 | ``` -------------------------------------------------------------------------------- /extractors/codes/color_code_extraction/README.md: -------------------------------------------------------------------------------- 1 | This function extracts all level 3 and some level 4 CSS color codes from the given text, except named colors, e.g. 'blue' or 'currentColor'. -------------------------------------------------------------------------------- /extractors/codes/color_code_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL: str = "color" 6 | 7 | def color_code_extraction(record): 8 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string 9 | 10 | hexcolor_regex = re.compile(r"#([0-9a-fA-F]{8}|[0-9a-fA-F]{6}|[0-9a-fA-F]{4}|[0-9a-fA-F]{3})(?![0-9a-fA-F])") 11 | rgb_regex = re.compile(r"(rgba|rgb)\([^\)]*\)") 12 | hsl_regex = re.compile(r"(hsla|hsl)\([^\)]*\)") 13 | hwb_regex = re.compile(r"hwb\([^\)]*\)") 14 | 15 | for regex in [hexcolor_regex, rgb_regex, hsl_regex, hwb_regex]: 16 | color_code_positions = [] 17 | for match in regex.finditer(text): 18 | start, end = match.span() 19 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 20 | yield LABEL, span.start, span.end 21 | ``` -------------------------------------------------------------------------------- /extractors/codes/stock_ticker_extraction/README.md: -------------------------------------------------------------------------------- 1 | Extracts stock tickers from a text. Is limited to mainly stocks from north america. 
-------------------------------------------------------------------------------- /extractors/codes/stock_ticker_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from itertools import compress 3 | import requests 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | LABEL: str = "Ticker" 7 | 8 | def stock_ticker_extraction(record): 9 | # Import tickers from bricks github repo 10 | req = requests.get("https://raw.githubusercontent.com/code-kern-ai/bricks/main/extractors/codes/stock_ticker_extraction/tickers.txt") 11 | tickers = req.text.split("\n") 12 | 13 | text = record[ATTRIBUTE].text 14 | is_ticker = [True if word in tickers and word.isupper() else False for word in text.replace("(", " ").replace(")", " ").replace(":", " ").split(sep=" ")] 15 | 16 | found_tickers = list(compress(text.split(), is_ticker)) 17 | 18 | for ticker in found_tickers: 19 | start = text.find(ticker) 20 | end = start + len(ticker) 21 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 22 | yield LABEL, span.start, span.end 23 | ``` -------------------------------------------------------------------------------- /extractors/dates_and_times/date_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module extracts dates from a given text. The module has been tested on the following formats: 2 | 3 | * 2018-01-01 4 | * 2018/01/01 5 | * 2018.01.01 6 | * 2018 Jan 01 7 | * 2018 January 01 8 | * Jan 01, 2018 9 | * January 01, 2018 10 | * 04.11.2022 -------------------------------------------------------------------------------- /extractors/dates_and_times/date_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL: str = "date" 6 | 7 | def date_extraction(record): 8 | regex = re.compile( 9 | r"(?:[0-9]{1,2}|[0-9]{4}|Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[\/\. -]{1}(?:[0-9]{1,2}|Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,\/\. -]{1}(?:[0-9]{2,4})" 10 | ) 11 | 12 | for match in regex.finditer(record[ATTRIBUTE].text): 13 | start, end = match.span() 14 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 15 | yield LABEL, span.start, span.end 16 | ``` -------------------------------------------------------------------------------- /extractors/dates_and_times/time_extraction/README.md: -------------------------------------------------------------------------------- 1 | This function extracts the time from a given text. The correct date format is necessary for successful extraction. 2 | * Valid time formats: 3 | * "H am/pm/a.m./p.m./AM/PM" 4 | * "HH:MM am/pm/a.m./p.m./AM/PM" 5 | * "HH:MM:SS am/pm/a.m./p.m./AM/PM" 6 | * "HH:MM" 7 | * "HH:MM:SS". 
8 | * Invalid formats: 9 | * "Twelve pm" -------------------------------------------------------------------------------- /extractors/dates_and_times/time_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL: str = "time" 6 | 7 | def time_extraction(record): 8 | regex = re.compile( 9 | r"\b(1[0-2]|[1-9])\s*[apAP][. ]*[mM]\.?|(?:(?:[01]?[0-9]|2[0-3]):[0-5][0-9](?::[0-5][0-9])?(?:(?:\s?[ap](?:\.m\.)?)|(?:\s?[AP](?:\.M\.)?)))|(?:[01]?[0-9]|2[0-3]):[0-5][0-9](?::[0-5][0-9])?" 10 | ) 11 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string. 12 | 13 | for match in regex.finditer(text): 14 | start, end = match.span() 15 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 16 | yield LABEL, span.start, span.end 17 | ``` -------------------------------------------------------------------------------- /extractors/functions/aspect_extraction/README.md: -------------------------------------------------------------------------------- 1 | Detects aspects in a given text. For instance something like "It has a really great battery life, but I hate the window size...". This should detect "a really great battery life" as something positive, and "I hate the window size" as something negative. -------------------------------------------------------------------------------- /extractors/functions/aspect_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from textblob import TextBlob 3 | 4 | ATTRIBUTE: str = "details" # only text attributes 5 | WINDOW: int = 4 # choose any window size here 6 | SENSITIVITY: float = 0.5 # choose any value between 0 and 1 7 | NEGATIVE_LABEL: str = "negative" 8 | POSITIVE_LABEL: str = "positive" 9 | 10 | def aspect_extraction(record): 11 | for chunk in record[ATTRIBUTE].noun_chunks: 12 | left_bound = max(chunk.sent.start, chunk.start - (WINDOW // 2) +1) 13 | right_bound = min(chunk.sent.end, chunk.end + (WINDOW // 2) + 1) 14 | window_doc = record[ATTRIBUTE][left_bound: right_bound] 15 | sentiment = TextBlob(window_doc.text).polarity 16 | if sentiment < -(1 - SENSITIVITY): 17 | yield NEGATIVE_LABEL, chunk.start, chunk.end 18 | elif sentiment > (1 - SENSITIVITY): 19 | yield POSITIVE_LABEL, chunk.start, chunk.end 20 | ``` -------------------------------------------------------------------------------- /extractors/functions/gazetteer_extraction/README.md: -------------------------------------------------------------------------------- 1 | A gazetteer detects full entities given some hint, e.g. a person based on their first name. If I know that "Max" is the first name, the gazetteer will detect "Max Mustermann" as a person. If you get an "AttributeError: module 'knowledge' has no attribute XYZ", you need to ensure that a lookup name with that name exists in refinery! 
-------------------------------------------------------------------------------- /extractors/functions/gazetteer_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import knowledge 3 | from typing import List 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | LABEL: str = "PERSON" 7 | LOOKUP_LISTS: List[str] = [knowledge.my_lookup_list] #either lookup lists or lookup values or both 8 | LOOKUP_VALUES: List[str] = ["Max"] 9 | 10 | final_list = [] 11 | if LOOKUP_LISTS: 12 | for lookup_list in LOOKUP_LISTS: 13 | final_list += lookup_list 14 | if LOOKUP_VALUES: 15 | final_list += LOOKUP_VALUES 16 | 17 | def gazetteer_extraction(record): 18 | for chunk in record[ATTRIBUTE].noun_chunks: 19 | if any([chunk.text in trie or trie in chunk.text for trie in final_list]): 20 | yield LABEL, chunk.start, chunk.end 21 | ``` -------------------------------------------------------------------------------- /extractors/functions/regex_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module detects spaCy tokens that fit a regex pattern and extracts them. -------------------------------------------------------------------------------- /extractors/functions/regex_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | 4 | REGEX: str = r"\$[0-9]+" # Choose any regex here 5 | ATTRIBUTE: str = "text" # only text attributes 6 | LABEL: str = "money" # Choose any available label here 7 | 8 | def regex_extraction(record): 9 | 10 | def regex_search(pattern, string): 11 | prev_end = 0 12 | while True: 13 | match = re.search(pattern, string) 14 | if not match: 15 | break 16 | 17 | start_, end_ = match.span() 18 | yield start_ + prev_end, end_ + prev_end 19 | 20 | prev_end += end_ 21 | string = string[end_:] 22 | 23 | for start, end in regex_search(REGEX, record[ATTRIBUTE].text): 24 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 25 | yield LABEL, span.start, span.end 26 | ``` -------------------------------------------------------------------------------- /extractors/functions/window_search_extraction/README.md: -------------------------------------------------------------------------------- 1 | In a given text, a window search looks for cue words inside frame of tokens. E.g. in a 6-window search, 3 token left and right are searched. "Max" can be extracted from "Max joined Kern AI in September." if "join" is one of the cue words. If you get an "AttributeError: module 'knowledge' has no attribute XYZ", you need to ensure that a lookup name with that name exists in refinery! 
-------------------------------------------------------------------------------- /extractors/functions/window_search_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import knowledge 3 | from typing import List 4 | 5 | WINDOW: int = 4 # choose any window size here 6 | LABEL: str = "PERSON" 7 | ATTRIBUTE: str = "text" # only text attributes 8 | LOOKUP_LISTS: List[str] = [knowledge.my_lookup_list] #either lookup lists or lookup values or both 9 | LOOKUP_VALUES: List[str] = ["Max"] 10 | 11 | final_list = [] 12 | if LOOKUP_LISTS: 13 | for lookup_list in LOOKUP_LISTS: 14 | final_list += lookup_list 15 | if LOOKUP_VALUES: 16 | final_list += LOOKUP_VALUES 17 | 18 | def window_search_extraction(record): 19 | for chunk in record[ATTRIBUTE].noun_chunks: 20 | left_bound = max(chunk.sent.start, chunk.start - (WINDOW // 2) +1) 21 | right_bound = min(chunk.sent.end, chunk.end + (WINDOW // 2) + 1) 22 | window_doc = record[ATTRIBUTE][left_bound: right_bound] 23 | if any([term in window_doc.text for term in final_list]): 24 | yield LABEL, chunk.start, chunk.end 25 | ``` -------------------------------------------------------------------------------- /extractors/llm/bert_ner_extraction/README.md: -------------------------------------------------------------------------------- 1 | Uses a BERT transformer model that is accessed via the HuggingFace Inference API for Named Entity Recognition (NER). Recognizes for entities: location (LOC), organizations (ORG), person (PER) and Miscellaneous (MISC). The upside of using this over SpaCy is that the BERT model is much more accurate and less prone to do errors. Requires an API key from HuggingFace, which you can get by signing up on HuggingFace. More information here: https://huggingface.co/dslim/bert-base-NER -------------------------------------------------------------------------------- /extractors/llm/bert_ner_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | 4 | ATTRIBUTE: str = "text" 5 | API_KEY: str = "" 6 | 7 | def bert_ner_extraction(record): 8 | headers = {"Authorization": f"Bearer {API_KEY}"} 9 | data = {"inputs": record[ATTRIBUTE].text} 10 | try: 11 | response = requests.post("https://api-inference.huggingface.co/models/dslim/bert-base-NER", headers=headers, json=data) 12 | response_json = response.json() 13 | ner_positions = [] 14 | for item in response_json: 15 | start = item["start"] 16 | end = item["end"] 17 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 18 | yield item["entity_group"], span.start, span.end 19 | except Exception as e: 20 | return f"That didn't work. Did you provide a valid API key? Go error: {e} and message: {response_json}" 21 | ``` -------------------------------------------------------------------------------- /extractors/llm/deberta_ner_extraction/README.md: -------------------------------------------------------------------------------- 1 | Uses a DeBERTa transformer model that is accessed via the HuggingFace Inference API for Named Entity Recognition (NER). Recognizes for entities: person, organization, location, building, event, product, art & other. The upside of using this over SpaCy is that the DeBERTa model is much more accurate and less prone to do errors. Requires an API key from HuggingFace, which you can get by signing up on HuggingFace. 
More information here: https://huggingface.co/RashidNLP/NER-Deberta -------------------------------------------------------------------------------- /extractors/llm/deberta_ner_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | 4 | ATTRIBUTE: str = "text" 5 | API_KEY: str = "" 6 | 7 | def deberta_ner_extraction(record): 8 | headers = {"Authorization": f"Bearer {API_KEY}"} 9 | data = {"inputs": record[ATTRIBUTE].text} 10 | try: 11 | response = requests.post("https://api-inference.huggingface.co/models/RashidNLP/NER-Deberta", headers=headers, json=data) 12 | response_json = response.json() 13 | ner_positions = [] 14 | for item in response_json: 15 | start = item["start"] 16 | end = item["end"] 17 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 18 | yield item["entity_group"], span.start, span.end 19 | except Exception as e: 20 | return f"That didn't work. Did you provide a valid API key? Go error: {e} and message: {response_json}" 21 | ``` -------------------------------------------------------------------------------- /extractors/llm/gpt_information_extraction/README.md: -------------------------------------------------------------------------------- 1 | Uses OpenAI's `GPT-3.5-turbo` model to extract certain information from a text. At a low temperature, the model extracts specified keywords. At a higher temperature, the model generates relevant keywords. An API key can be provided by us or be obtained directly from OpenAI. Contact us at info@kern.ai if you require an API key or need any support from us. -------------------------------------------------------------------------------- /extractors/media/work_of_art_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module detects the name of works of art from a given text. It uses the `WORK_OF_ART` label for identify entities. -------------------------------------------------------------------------------- /extractors/media/work_of_art_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from extractors.util.spacy import SpacySingleton 3 | 4 | INPUT_EXAMPLE = { 5 | "text": 'The bestseller of last month is "Mystery of the Floridian Porter" by John Doe.', 6 | "spacyTokenizer": "en_core_web_sm", 7 | } 8 | 9 | 10 | class WorkOfArtExtractionModel(BaseModel): 11 | text: str 12 | spacyTokenizer: str = "en_core_web_sm" 13 | 14 | class Config: 15 | schema_extra = {"example": INPUT_EXAMPLE} 16 | 17 | 18 | def work_of_art_extraction(request: WorkOfArtExtractionModel): 19 | """Extracts the name of the book from a text.""" 20 | 21 | text = request.text 22 | nlp = SpacySingleton.get_nlp(request.spacyTokenizer) 23 | doc = nlp(text) 24 | found = [] 25 | 26 | for entity in doc.ents: 27 | if entity.label_ == "WORK_OF_ART": 28 | found.append(["work of art", entity.start, entity.end]) 29 | 30 | return {"works of art": found} 31 | -------------------------------------------------------------------------------- /extractors/media/work_of_art_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import spacy 3 | 4 | # replace this list with a list containing your data 5 | text = ["Search of Lost Time is a great book by Marcel Proust."] 6 | 7 | # add the texts to a dict called records. 
Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | "label": "work of art", 11 | } 12 | 13 | def work_of_art_extraction(record): 14 | nlp = spacy.load("en_core_web_sm") 15 | 16 | artwork_positions = [] 17 | text_id = 0 18 | for entry in record["text"]: 19 | doc = nlp(entry) 20 | for entity in doc.ents: 21 | if entity.label_ == 'WORK_OF_ART': 22 | artwork_positions.append({f"text_{text_id}" :[record["label"], entity.start, entity.end]}) 23 | text_id += 1 24 | return {"extractions": artwork_positions} 25 | ``` -------------------------------------------------------------------------------- /extractors/media/work_of_art_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | ATTRIBUTE: str = "text" # only text attributes 3 | LABEL: str = "work of art" 4 | 5 | def work_of_art_extraction(record): 6 | for entity in record[ATTRIBUTE].ents: 7 | if entity.label_ == 'WORK_OF_ART': 8 | yield LABEL, entity.start, entity.end 9 | ``` -------------------------------------------------------------------------------- /extractors/metrics/metric_extraction/README.md: -------------------------------------------------------------------------------- 1 | Uses the quantulum3 library to detect metrics like distances or units of measurement. Currently working on english text only. -------------------------------------------------------------------------------- /extractors/metrics/metric_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from quantulum3 import parser 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "My weight is 82 kilos. The eifel tower is 187 meters high.", 6 | } 7 | 8 | class MetricExtractionModel(BaseModel): 9 | text: str 10 | 11 | class Config: 12 | schema_extra = {"example": INPUT_EXAMPLE} 13 | 14 | def metric_extraction(request: MetricExtractionModel): 15 | """Extracts units of measurement from a string.""" 16 | text = request.text 17 | 18 | quants = parser.parse(text) 19 | units = [] 20 | for quant in quants: 21 | span = quant.span 22 | name = quant.unit.name 23 | 24 | units.append([name, span[0], span[1]]) 25 | 26 | return {"metric": units} 27 | -------------------------------------------------------------------------------- /extractors/metrics/metric_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from quantulum3 import parser 3 | 4 | # replace this list with a list containing your data 5 | text = ["My weight is 82 kilos. The eifel tower is 187 meters high."] 6 | 7 | # add the texts to a dict called records. 
Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | "label": "metric", 11 | } 12 | 13 | def metric_extraction(record): 14 | metric_positions = [] 15 | text_id = 0 16 | for entry in record["text"]: 17 | quants = parser.parse(entry) 18 | for quant in quants: 19 | span = quant.span 20 | metric_positions.append({f"text_{text_id}": [record["label"], span[0], span[1]]}) 21 | text_id += 1 22 | return {"extractions": metric_positions} 23 | ``` -------------------------------------------------------------------------------- /extractors/metrics/metric_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from quantulum3 import parser 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL: str = "metric" 6 | 7 | def metric_extraction(record): 8 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string. 9 | 10 | quants = parser.parse(text) 11 | for quant in quants: 12 | span = quant.span 13 | 14 | yield LABEL, span[0], span[1] 15 | ``` -------------------------------------------------------------------------------- /extractors/numbers/bic_extraction/README.md: -------------------------------------------------------------------------------- 1 | The module extracts the BIC from a given text. BIC (Bank Identifier Code) is the international standard for the bank sort code constracted as follows: The BIC is in total 8 or 11 characters long. The first 4 characters are letters and contain the ID of the bank, the following 2 are alphabetical and mark the country. The first of these characters is not allowed to be 0 or 1. Next, there are 2 alphanumerical characters to code the location. Finally there can be 3 alphanumerical characters for characterizing the branch (optional). There are alphanumerical, but if the first character is an X, the next two characters have to be X's, too. 
2 | -------------------------------------------------------------------------------- /extractors/numbers/bic_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from extractors.util.spacy import SpacySingleton 3 | import re 4 | 5 | INPUT_EXAMPLE = { 6 | "text": "My BIC number is COBADEBBXXX", 7 | "spacyTokenizer": "en_core_web_sm", 8 | } 9 | 10 | 11 | class BicExtractionModel(BaseModel): 12 | text: str 13 | spacyTokenizer: str = "en_core_web_sm" 14 | 15 | class Config: 16 | schema_extra = {"example": INPUT_EXAMPLE} 17 | 18 | 19 | def bic_extraction(request: BicExtractionModel): 20 | """Extracts BIC from text""" 21 | 22 | text = request.text 23 | nlp = SpacySingleton.get_nlp(request.spacyTokenizer) 24 | doc = nlp(text) 25 | regex = re.compile(r'\b[A-Z0-9]{4,4}[A-Z]{2,2}[A-Z2-9][A-NP-Z0-9]([X]{3,3}|[A-WY-Z0-9]{1,1}[A-Z0-9]{2,2}|\s|\W|$)') 26 | bic = [] 27 | for match in regex.finditer(text): 28 | start, end = match.span() 29 | span = doc.char_span(start, end, alignment_mode="expand") 30 | bic.append(["BIC", span.start, span.end]) 31 | return {"bic": bic} 32 | -------------------------------------------------------------------------------- /extractors/numbers/bic_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL: str = "bic" 6 | 7 | def bic_extraction(record): 8 | regex = re.compile(r'\b[A-Z]{4,4}[A-Z]{2,2}[A-Z2-9][A-NP-Z0-9]([X]{3,3}|[A-WY-Z0-9]{1,1}[A-Z0-9]{2,2}|\s|\W|$)') 9 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string 10 | 11 | for match in regex.finditer(text): 12 | start, end = match.span() 13 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 14 | yield LABEL, span.start, span.end 15 | ``` -------------------------------------------------------------------------------- /extractors/numbers/credit_card_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module extracts the credit card numbers from a given text. For example, if the text is "My credit card number is 1234-5678-9012-3456", then the output will be "1234-5678-9012-3456". The module uses regular expressions to extract the credit card numbers. 
The module has been tested on the following formats: 2 | 3 | 4 | * 1234-5678-9012-3456 5 | * 1234 5678 9012 3456 6 | * 1234567890123456 -------------------------------------------------------------------------------- /extractors/numbers/credit_card_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from extractors.util.spacy import SpacySingleton 3 | import re 4 | 5 | INPUT_EXAMPLE = { 6 | "text": "This is my card details please use it carefully 4569-4039-6101-4710.", 7 | "spacyTokenizer": "en_core_web_sm", 8 | } 9 | 10 | 11 | class CreditCardExtractionModel(BaseModel): 12 | text: str 13 | spacyTokenizer: str = "en_core_web_sm" 14 | 15 | class Config: 16 | schema_extra = {"example": INPUT_EXAMPLE} 17 | 18 | 19 | def credit_card_extraction(request: CreditCardExtractionModel): 20 | """Extracts the credit/debit card number from a text.""" 21 | 22 | text = request.text 23 | nlp = SpacySingleton.get_nlp(request.spacyTokenizer) 24 | doc = nlp(text) 25 | regex = re.compile(r"(\d{4}[-\s]?){3}\d{3,4}") 26 | 27 | credit = [] 28 | for match in regex.finditer(text): 29 | start, end = match.span() 30 | span = doc.char_span(start, end, alignment_mode="expand") 31 | credit.append([span.start, span.end, span.text]) 32 | 33 | return {"creditCard": credit} 34 | -------------------------------------------------------------------------------- /extractors/numbers/credit_card_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | import spacy 4 | 5 | # replace this list with a list containing your data 6 | text = ["This is my card details please use it carefully 4569-4039-6101-4710.", "The card number is 1231 4551 3431 1009."] 7 | 8 | # add the texts to a dict called records. Add further information as key-value pairs if needed 9 | record = { 10 | "text": text, 11 | "label": "credit card", 12 | } 13 | 14 | def credit_card_extraction(record): 15 | nlp = spacy.load("en_core_web_sm") 16 | 17 | credit_card_positions = [] 18 | text_id = 0 19 | for entry in record["text"]: 20 | regex = re.compile( 21 | r"(\d{4}[-\s]?){3}\d{3,4}" 22 | ) 23 | doc = nlp(entry) 24 | for match in regex.finditer(entry): 25 | start, end = match.span() 26 | span = doc.char_span(start, end, alignment_mode="expand") 27 | credit_card_positions.append({f"text_{text_id}": [record["label"], span.start, span.end]}) 28 | text_id += 1 29 | return credit_card_positions 30 | ``` -------------------------------------------------------------------------------- /extractors/numbers/credit_card_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL: str = "cardNumber" 6 | 7 | def credit_card_extraction(record): 8 | text = record[ATTRIBUTE].text # SpaCy document, hence we need to use .text to the string. 9 | 10 | regex = re.compile( 11 | r"(\d{4}[-\s]?){3}\d{3,4}" 12 | ) 13 | 14 | for match in regex.finditer(text): 15 | start, end = match.span() 16 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 17 | yield LABEL, span.start, span.end 18 | ``` -------------------------------------------------------------------------------- /extractors/numbers/digit_extraction/README.md: -------------------------------------------------------------------------------- 1 | Extracts digits from string of length n. 
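Since the brick targets digit runs of an exact length, a minimal stand-alone sketch of the idea could look like this (the helper name, the lookaround-based pattern, and the example values are illustrative assumptions, not taken from the module itself):

```python
import re

def extract_digits_of_length(text: str, digit_length: int):
    # match exactly `digit_length` consecutive digits, not embedded in a longer number
    pattern = re.compile(rf"(?<!\d)\d{{{digit_length}}}(?!\d)")
    return [(match.group(), match.start(), match.end()) for match in pattern.finditer(text)]

print(extract_digits_of_length("My PIN is 1337, not 123456.", 4))  # [('1337', 10, 14)]
```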
-------------------------------------------------------------------------------- /extractors/numbers/digit_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pydantic import BaseModel 3 | from typing import Optional 4 | from extractors.util.spacy import SpacySingleton 5 | 6 | INPUT_EXAMPLE = { 7 | "text": "My PIN is 1337.", 8 | "digitLength": 4, 9 | "spacyTokenizer": "en_core_web_sm" 10 | } 11 | 12 | class DigitExtractionModel(BaseModel): 13 | text: str 14 | digitLength: int 15 | spacyTokenizer: Optional[str] 16 | 17 | class Config: 18 | schema_extra = {"example": INPUT_EXAMPLE} 19 | 20 | def digit_extraction(req: DigitExtractionModel): 21 | """Extracts digits of variable length.""" 22 | text = req.text 23 | number = req.digitLength 24 | 25 | nlp = SpacySingleton.get_nlp() 26 | doc = nlp(text) 27 | 28 | num_string = "{"+f"{number}"+"}" 29 | regex = re.compile(rf"(? 1] 13 | 14 | # We need to add an \ before separators to use them in regex 15 | regex_paths = [i.replace(SEPARATOR, "\\"+SEPARATOR) for i in paths] 16 | 17 | for path in regex_paths: 18 | pattern = rf"({path})" 19 | match = re.search(pattern, text) 20 | 21 | start, end = match.span() 22 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 23 | 24 | yield LABEL, span.start, span.end 25 | ``` -------------------------------------------------------------------------------- /extractors/paths/url_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module extracts URL links from a given text using regular expressions. -------------------------------------------------------------------------------- /extractors/paths/url_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from extractors.util.spacy import SpacySingleton 3 | import re 4 | 5 | INPUT_EXAMPLE = { 6 | "text": "Check out https://kern.ai!", 7 | "spacyTokenizer": "en_core_web_sm", 8 | } 9 | 10 | 11 | class UrlExtractionModel(BaseModel): 12 | text: str 13 | spacyTokenizer: str = "en_core_web_sm" 14 | 15 | class Config: 16 | schema_extra = {"example": INPUT_EXAMPLE} 17 | 18 | 19 | def url_extraction(request: UrlExtractionModel): 20 | """Extracts urls from a given text.""" 21 | text = request.text 22 | nlp = SpacySingleton.get_nlp(request.spacyTokenizer) 23 | doc = nlp(text) 24 | 25 | regex_pattern = re.compile(r"(?:(?:(?:https?|ftp):\/\/){1})?[\w\-\/?=%.]{3,}\.[\/\w\-&?=%.]{2,}") 26 | regex_pattern.findall(text) 27 | 28 | urls = [] 29 | for match in regex_pattern.finditer(text): 30 | start, end = match.span() 31 | span = doc.char_span(start, end, alignment_mode="expand") 32 | urls.append(["url", span.start, span.end]) 33 | 34 | return {"urls": urls} 35 | -------------------------------------------------------------------------------- /extractors/paths/url_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | import spacy 4 | 5 | # replace this list with a list containing your data 6 | text = ["Check out https://kern.ai!"] 7 | 8 | # add the texts to a dict called records. 
Add further information as key-value pairs if needed 9 | record = { 10 | "text": text, 11 | "label": "url", 12 | } 13 | 14 | def url_extraction(record): 15 | npl = spacy.load("en_core_web_sm") 16 | 17 | url_positions = [] 18 | text_id = 0 19 | for entry in record["text"]: 20 | regex_pattern = re.compile(r"(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+") 21 | doc = npl(entry) 22 | for match in regex_pattern.finditer(entry): 23 | start, end = match.span() 24 | span = doc.char_span(start, end, alignment_mode="expand") 25 | url_positions.append({f"text_{text_id}": [record["label"], span.start, span.end]}) 26 | text_id += 1 27 | return {"extractions" : url_positions} 28 | ``` -------------------------------------------------------------------------------- /extractors/paths/url_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL: str = "url" 6 | 7 | def url_extraction(record): 8 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string. 9 | regex_pattern = re.compile(r"(?:(?:(?:https?|ftp):\/\/){1})?[\w\-\/?=%.]{3,}\.[\/\w\-&?=%.]{2,}") 10 | 11 | for match in regex_pattern.finditer(text): 12 | start, end = match.span() 13 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 14 | yield LABEL, span.start, span.end 15 | ``` -------------------------------------------------------------------------------- /extractors/personal_identifiers/address_extraction/README.md: -------------------------------------------------------------------------------- 1 | Detects locations/addresses in a given text. For example, it can detect the following addresses: 2 | 3 | * 1600 Pennsylvania Avenue, Washington, DC 4 | * 10 Downing Street, London 5 | * Alexanderstrasse 7, Berlin -------------------------------------------------------------------------------- /extractors/personal_identifiers/email_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module extracts emails from a given text using regular expressions. 
-------------------------------------------------------------------------------- /extractors/personal_identifiers/email_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from extractors.util.spacy import SpacySingleton 3 | import re 4 | 5 | INPUT_EXAMPLE = { 6 | "text": "If you have any questions, please contact johannes.hoetter@kern.ai.", 7 | "spacyTokenizer": "en_core_web_sm", 8 | } 9 | 10 | 11 | class EmailExtractionModel(BaseModel): 12 | text: str 13 | spacyTokenizer: str = "en_core_web_sm" 14 | 15 | class Config: 16 | schema_extra = {"example": INPUT_EXAMPLE} 17 | 18 | 19 | def email_extraction(request: EmailExtractionModel): 20 | """Detects emails in a text and returns them in a list.""" 21 | text = request.text 22 | nlp = SpacySingleton.get_nlp(request.spacyTokenizer) 23 | doc = nlp(text) 24 | regex = re.compile(r"([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)") 25 | 26 | emails = [] 27 | for match in regex.finditer(text): 28 | start, end = match.span() 29 | span = doc.char_span(start, end, alignment_mode="expand") 30 | emails.append(["email", span.start, span.end]) 31 | 32 | return {"emails": emails} 33 | -------------------------------------------------------------------------------- /extractors/personal_identifiers/email_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | import spacy 4 | 5 | # replace this list with a list containing your data 6 | text = ["My E-Mail address is jane.doe@gmail.com", "Our support mail is support@awesome-co.com"] 7 | 8 | # add the texts to a dict called records. Add further information as key-value pairs if needed 9 | record = { 10 | "text": text, 11 | "label": "email", 12 | } 13 | 14 | def email_extraction(record: dict) -> dict: 15 | regex = re.compile(r"([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)") 16 | nlp = spacy.load("en_core_web_sm") 17 | 18 | email_positions = [] 19 | text_id = 0 20 | for entry in record["your_text"]: 21 | doc = nlp(entry) 22 | for match in regex.finditer(entry): 23 | start, end = match.span() 24 | span = doc.char_span(start, end, alignment_mode="expand") 25 | email_positions.append({f"text_{text_id}" :[record["label"], span.start, span.end]}) 26 | text_id += 1 27 | return {"extraction": email_positions} 28 | ``` -------------------------------------------------------------------------------- /extractors/personal_identifiers/email_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL: str = "email" 6 | 7 | def email_extraction(record): 8 | regex = re.compile(r"([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)") 9 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string 10 | 11 | for match in regex.finditer(text): 12 | start, end = match.span() 13 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 14 | yield LABEL, span.start, span.end 15 | ``` -------------------------------------------------------------------------------- /extractors/personal_identifiers/location_extraction/README.md: -------------------------------------------------------------------------------- 1 | Uses SpaCy to extract locations such as cities and countries (GPE) or names of other famous places like mountains and rivers (LOC). 
-------------------------------------------------------------------------------- /extractors/personal_identifiers/location_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from extractors.util.spacy import SpacySingleton 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "Tokyo is a beautiful city, which is not located in Kansas, USA.", 6 | "spacyTokenizer": "en_core_web_sm", 7 | } 8 | 9 | 10 | class LocationExtractionModel(BaseModel): 11 | text: str 12 | spacyTokenizer: str = "en_core_web_sm" 13 | 14 | class Config: 15 | schema_extra = {"example": INPUT_EXAMPLE} 16 | 17 | 18 | def location_extraction(req: LocationExtractionModel): 19 | """ Uses SpaCy to extract locations from a text.""" 20 | text = req.text 21 | nlp = SpacySingleton.get_nlp(req.spacyTokenizer) 22 | doc = nlp(text) 23 | 24 | names = [] 25 | for ent in doc.ents: 26 | if ent.label_ == "GPE" or ent.label_ == "LOC": 27 | names.append(["location", ent.start, ent.end]) 28 | return {"locations": names} 29 | -------------------------------------------------------------------------------- /extractors/personal_identifiers/location_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | ATTRIBUTE: str = "text" # only text attributes 3 | LABEL: str = "location" 4 | 5 | def location_extraction(record): 6 | for ent in record[ATTRIBUTE].ents: 7 | if ent.label_ == "GPE" or ent.label_ == "LOC": 8 | yield LABEL, ent.start, ent.end 9 | ``` -------------------------------------------------------------------------------- /extractors/personal_identifiers/person_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module extracts names from a given text using spaCy. It uses entity labels to match entities labelled as `PERSON`. Recommended: use `en_core_web_lg` tokenizer to generate accurate results. -------------------------------------------------------------------------------- /extractors/personal_identifiers/person_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import spacy 3 | 4 | # replace this list with a list containing your data 5 | text = ["My name is James Bond.", "Harry met Jane on a sunny afternoon."] 6 | 7 | # add the texts to a dict called records. 
Add further information as key-value pairs if needed 8 | record = { 9 | "your_text": text, 10 | "label": "name", 11 | } 12 | 13 | def person_extraction(record: dict) -> dict: 14 | nlp = spacy.load("en_core_web_sm") 15 | 16 | name_positions = [] 17 | text_id = 0 18 | for entry in record["your_text"]: 19 | doc = nlp(entry) 20 | for entity in doc.ents: 21 | if entity.label_ == "PERSON": 22 | name_positions.append({f"text_{text_id}": [record["label"], entity.start, entity.end]}) 23 | text_id += 1 24 | return {"extraction": name_positions} 25 | ``` -------------------------------------------------------------------------------- /extractors/personal_identifiers/person_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | ATTRIBUTE: str = "text" # only text attributes 3 | LABEL: str = "name" 4 | 5 | def person_extraction(record): 6 | for entity in record[ATTRIBUTE].ents: 7 | if entity.label_ == "PERSON": 8 | yield LABEL, entity.start, entity.end 9 | ``` -------------------------------------------------------------------------------- /extractors/personal_identifiers/zipcode_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module extracts zip codes from a text using regular expressions. All valid zip code formats are located in a dictionary in the source code. -------------------------------------------------------------------------------- /extractors/symbols/hashtag_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module detects hashtags from a given text. For example, `"The world needs to act now on the global increase in temperature #climatechange #globalwarming #fridaysforfuture"` will be extracted as `["#climatechange", "#globalwarming", "#fridaysforfuture"]`. -------------------------------------------------------------------------------- /extractors/symbols/hashtag_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from extractors.util.spacy import SpacySingleton 3 | import re 4 | 5 | INPUT_EXAMPLE = { 6 | "text": "In the tech industry, #devrel is a very hot topic.", 7 | "spacyTokenizer": "en_core_web_sm", 8 | } 9 | 10 | class HashtagExtractionModel(BaseModel): 11 | text: str 12 | spacyTokenizer: str = "en_core_web_sm" 13 | 14 | class Config: 15 | schema_extra = {"example": INPUT_EXAMPLE} 16 | 17 | 18 | def hashtag_extraction(request: HashtagExtractionModel): 19 | """Detects hashtags in a text and returns them in a list.""" 20 | text = request.text 21 | nlp = SpacySingleton.get_nlp(request.spacyTokenizer) 22 | doc = nlp(text) 23 | regex = re.compile(r"#(\w*)") 24 | 25 | hashtags = [] 26 | for match in regex.finditer(text): 27 | start, end = match.span() 28 | span = doc.char_span(start, end, alignment_mode="expand") 29 | hashtags.append(["hashtag", span.start, span.end]) 30 | 31 | return {"hashtags": hashtags} 32 | -------------------------------------------------------------------------------- /extractors/symbols/hashtag_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | import spacy 4 | 5 | # replace this list with a list containing your data 6 | text = ["In the tech industry, #devrel is a very hot topic.", "Follow us on #mastodon!"] 7 | 8 | # add the texts to a dict called records.
Add further information as key-value pairs if needed 9 | record = { 10 | "text": text, 11 | "label": "hashtag", 12 | } 13 | 14 | def hashtag_extraction(record): 15 | nlp = spacy.load("en_core_web_sm") 16 | regex = re.compile(r"#(\w*)") 17 | hashtag_positions = [] 18 | text_id = 0 19 | for entry in record["text"]: 20 | doc = nlp(entry) 21 | for match in regex.finditer(entry): 22 | start, end = match.span() 23 | span = doc.char_span(start, end, alignment_mode="expand") 24 | hashtag_positions.append({f"text_{text_id}": [record["label"], span.start, span.end]}) 25 | text_id += 1 26 | return {"extractions": hashtag_positions} 27 | ``` -------------------------------------------------------------------------------- /extractors/symbols/hashtag_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL: str = "hashtag" 6 | 7 | def hashtag_extraction(record): 8 | regex = re.compile(r"#(\w*)") 9 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string. 10 | 11 | for match in regex.finditer(text): 12 | start, end = match.span() 13 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 14 | yield LABEL, span.start, span.end 15 | ``` -------------------------------------------------------------------------------- /extractors/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/extractors/util/__init__.py -------------------------------------------------------------------------------- /extractors/util/spacy.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | def download_all_models(): 4 | print("Downloading spacy models...") 5 | models = [ 6 | "en_core_web_sm", 7 | "de_core_news_sm" 8 | ] 9 | for model in models: 10 | print(f"Downloading {model}...") 11 | download_model(model) 12 | 13 | def download_model(model): 14 | """Download a spacy model if it doesn't exist.""" 15 | try: 16 | spacy.load(model) 17 | except OSError: 18 | spacy.cli.download(model) 19 | spacy.load(model) 20 | 21 | class SpacySingleton: 22 | nlp = None 23 | 24 | @classmethod 25 | def get_nlp(cls, model="en_core_web_sm"): 26 | if cls.nlp is None: 27 | cls.nlp = spacy.load(model) 28 | return cls.nlp -------------------------------------------------------------------------------- /extractors/words/difficult_words_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module uses `textstat` package to extract the difficult words from a given text. 
2 | Community contribution by @rpavani1998 -------------------------------------------------------------------------------- /extractors/words/difficult_words_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import textstat 3 | import re 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | SYLLABLE_THRESHOLD: int = 3 7 | LABEL: str = "difficult_word" 8 | 9 | def difficult_words_extraction(record): 10 | text = record[ATTRIBUTE].text 11 | syllable_threshold = SYLLABLE_THRESHOLD 12 | difficult_words = textstat.difficult_words_list(text, syllable_threshold) 13 | 14 | for word in difficult_words: 15 | if word in text: 16 | start, end = re.search(rf"({word})", text).span() # get the position of the word in the text 17 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 18 | yield LABEL, span.start, span.end 19 | ``` -------------------------------------------------------------------------------- /extractors/words/goodbye_extraction/README.md: -------------------------------------------------------------------------------- 1 | Extracts all the farewell greetings from a text. This module will extract all occurrences that satisfy the following format: 2 | - Goodbye/bye 3 | - Ciao 4 | - See you later/soon/tomorrow -------------------------------------------------------------------------------- /extractors/words/goodbye_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | import spacy 4 | 5 | # replace this list with a list containing your data 6 | text = ["I will leave for now since I have to cook dinner. Goodbye, and ciao to you as well!"] 7 | 8 | # add the texts to a dict called records. Add further information as key-value pairs if needed 9 | record = { 10 | "text": text, 11 | "label": "goodbye", 12 | } 13 | 14 | def goodbye_extraction(record: dict) -> dict: 15 | regex = re.compile(r"((?:((?i)good)(?:[ ])?)?((?i)bye)|(?i)Ciao|(?:((?i)see you)(?:[ ]?)((?i)tomorrow|later|soon)?))") 16 | nlp = spacy.load("en_core_web_sm") 17 | 18 | goodbye_positions = [] 19 | text_id = 0 20 | for entry in record["text"]: 21 | doc = nlp(entry) 22 | for match in regex.finditer(entry): 23 | start, end = match.span() 24 | span = doc.char_span(start, end, alignment_mode="expand") 25 | goodbye_positions.append({f"text_{text_id}" :[record["label"], span.start, span.end]}) 26 | text_id += 1 27 | return {"extraction": goodbye_positions} 28 | ``` -------------------------------------------------------------------------------- /extractors/words/goodbye_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL: str = "goodbye" 6 | 7 | def goodbye_extraction(record): 8 | regex = re.compile(r"((?:(good)(?:[ ])?)?(bye)|Ciao|(?:(see you)(?:[ ]?)(tomorrow|later|soon)?))", re.IGNORECASE) 9 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string. 
10 | 11 | for match in regex.finditer(text): 12 | start, end = match.span() 13 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 14 | yield LABEL, span.start, span.end 15 | ``` -------------------------------------------------------------------------------- /extractors/words/keyword_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module extracts keywords from a text or passage, using unsupervised methods of keyword extraction. For this, it uses the `flashtext` package which takes input keywords that are needed to be matched and extracts the span of those keywords in the text. Due to this requirement, this module only works in refinery version > 1.6. -------------------------------------------------------------------------------- /extractors/words/keyword_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from flashtext import KeywordProcessor 3 | from typing import List 4 | 5 | INPUT_EXAMPLE = { 6 | "text": "I had such an amazing time in the movies. The popcorn was delicious as well.", 7 | "keywords": ["movies", "popcorn"], 8 | } 9 | 10 | 11 | class KeywordExtractionModel(BaseModel): 12 | text: str 13 | keywords: List[str] 14 | 15 | class Config: 16 | schema_example = {"example": INPUT_EXAMPLE} 17 | 18 | 19 | def keyword_extraction(request: KeywordExtractionModel): 20 | """Extracts key phrases in a body of text""" 21 | 22 | text = request.text 23 | keywords = request.keywords 24 | keyword_processor = KeywordProcessor() 25 | keyword_processor.add_keywords_from_list(keywords) 26 | extracted_keywords = keyword_processor.extract_keywords(text, span_info=True) 27 | 28 | return {"keywords": extracted_keywords} 29 | -------------------------------------------------------------------------------- /extractors/words/keyword_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from flashtext import KeywordProcessor 3 | 4 | # replace this list with a list containing your data 5 | text = ["I had such an amazing time in the movies. The popcorn was delicious as well."] 6 | 7 | # add the texts to a dict called records. 
Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | "keywords": ["movies", "popcorn"], 11 | "label": "keyword", 12 | } 13 | 14 | def keyword_extraction(record: dict) -> dict: 15 | keyword_processor = KeywordProcessor() 16 | keyword_processor.add_keywords_from_list(record["keywords"]) 17 | 18 | keyword_positions = [] 19 | text_id = 0 20 | for entry in record["text"]: 21 | keyword_found = keyword_processor.extract_keywords(entry, span_info=True) 22 | for keyword in keyword_found: 23 | keyword_positions.append({f"text_{text_id}": [keyword[0], keyword[1], keyword[2]]}) 24 | text_id += 1 25 | return {"extraction": keyword_positions} 26 | ``` -------------------------------------------------------------------------------- /extractors/words/keyword_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from typing import List 3 | from flashtext import KeywordProcessor 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | KEYWORDS: List[str] = ["keyword1", "keyword2", "keyword3"] 7 | LABEL: str = "keyword" 8 | 9 | def keyword_extraction(record): 10 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string. 11 | keyword_processor = KeywordProcessor() 12 | keyword_processor.add_keywords_from_list(KEYWORDS) 13 | keyword_found = keyword_processor.extract_keywords(text, span_info=True) 14 | 15 | for keyword, start, end in keyword_found: # span_info=True already yields character offsets 16 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 17 | yield LABEL, span.start, span.end 18 | ``` -------------------------------------------------------------------------------- /extractors/words/noun_match_extraction/README.md: -------------------------------------------------------------------------------- 1 | The function uses the spaCy library to load an English language model and process the input text. It then extracts noun chunks from the processed text and iterates over them. For each noun chunk, if it has more than one word, the first word is taken as the target word. If the target word has not been used before, it is added to a repository to avoid duplicate use, and a regular expression pattern is created from the target word. The function then uses this pattern to find all matches in the input text and extracts their spans. Finally, the function returns a list of tuples containing the noun match positions. -------------------------------------------------------------------------------- /extractors/words/org_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module extracts company/organization names from a given text using spaCy. It uses entity labelling to match entities labelled as `ORG`.
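A minimal standalone sketch of the same `ORG` filter, mirroring the snippets below (the model and example sentence are placeholders):

```python
import spacy
from typing import List, Tuple

def org_extraction(text: str) -> List[Tuple[str, int, int]]:
    """Returns ("org", token_start, token_end) for every ORG entity spaCy finds."""
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    return [("org", ent.start, ent.end) for ent in doc.ents if ent.label_ == "ORG"]

# example usage
print(org_extraction("We are developers from Kern.ai."))
```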
-------------------------------------------------------------------------------- /extractors/words/org_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from extractors.util.spacy import SpacySingleton 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "We are developers from Kern.ai", 6 | "spacyTokenizer": "en_core_web_sm", 7 | } 8 | 9 | 10 | class OrgExtractionModel(BaseModel): 11 | text: str 12 | spacyTokenizer: str = "en_core_web_sm" 13 | 14 | class Config: 15 | schema_extra = {"example": INPUT_EXAMPLE} 16 | 17 | 18 | def org_extraction(request: OrgExtractionModel): 19 | """Detects organizations in a given text.""" 20 | text = request.text 21 | nlp = SpacySingleton.get_nlp(request.spacyTokenizer) 22 | doc = nlp(text) 23 | 24 | organisations = [] 25 | 26 | for entity in doc.ents: 27 | if entity.label_ == "ORG": 28 | organisations.append(["org", entity.start, entity.end]) 29 | 30 | return {"organisations": organisations} 31 | -------------------------------------------------------------------------------- /extractors/words/org_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import spacy 3 | 4 | # replace this list with a list containing your data 5 | text = ["We are developers from Kern.ai."] 6 | 7 | # add the texts to a dict called records. Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | "label": "org", 11 | } 12 | 13 | def org_extraction(record): 14 | nlp = spacy.load("en_core_web_sm") 15 | 16 | org_positions = [] 17 | text_id = 0 18 | for entry in record["text"]: 19 | doc = nlp(entry) 20 | for entity in doc.ents: 21 | if entity.label_ == "ORG": 22 | org_positions.append({f"text_{text_id}" :[record["label"], entity.start, entity.end]}) 23 | text_id += 1 24 | return {"extractions": org_positions} 25 | ``` -------------------------------------------------------------------------------- /extractors/words/org_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | ATTRIBUTE: str = "text" # only text attributes 3 | LABEL: str = "ORG" 4 | 5 | def org_extraction(record): 6 | for entity in record[ATTRIBUTE].ents: 7 | if entity.label_ == "ORG": 8 | yield LABEL, entity.start, entity.end 9 | ``` -------------------------------------------------------------------------------- /extractors/words/part_of_speech_extraction/README.md: -------------------------------------------------------------------------------- 1 | Provides the part-of-speech tags using spaCy. The entities can have labels `PNOUN`, `ADJ`, `NOUN`, etc. 
2 | 3 | Common part-of-speech tags are: 4 | 5 | ADJ = adjective | ADP = adposition | ADV = adverb | AUX = auxiliary | CONJ = conjunction | CCONJ = coordinating conjunction | DET = determiner | INTJ = interjection| NOUN = noun | NUM = numeral | PART = particle | PRON = pronoun | PROPN = proper noun | PUNCT = punctuation | SCONJ = subordinating conjunction | SYM = symbol | VERB = verb | X = other | SPACE =space 6 | 7 | Full list of all POS-tags can be found here: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py -------------------------------------------------------------------------------- /extractors/words/part_of_speech_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from pydantic import BaseModel 3 | from extractors.util.spacy import SpacySingleton 4 | import re 5 | 6 | INPUT_EXAMPLE = { 7 | "text": "My favourite british tea is Yorkshire tea", 8 | "spacyTokenizer": "en_core_web_sm" 9 | } 10 | 11 | class PartOfSpeechExtractionModel(BaseModel): 12 | text: str 13 | spacyTokenizer: Optional[str] = "en_core_web_sm" 14 | 15 | class Config: 16 | schema_extra = {"example": INPUT_EXAMPLE} 17 | 18 | def part_of_speech_extraction(req: PartOfSpeechExtractionModel): 19 | """Yields POS tags using spaCy.""" 20 | text = req.text 21 | 22 | nlp = SpacySingleton.get_nlp(req.spacyTokenizer) 23 | doc = nlp(text) 24 | 25 | pos_tags = [] 26 | for token in doc: 27 | pos = token.pos_ 28 | 29 | start, end = token.i, token.i +1 30 | span = doc.char_span(start, end, alignment_mode="expand") 31 | 32 | pos_tags.append([pos, span.start, span.end]) 33 | 34 | return {"POS tags": pos_tags} -------------------------------------------------------------------------------- /extractors/words/part_of_speech_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | # expects labelling task to have labels ["ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROP", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE"] 3 | # replace this list with a list containing your data 4 | import spacy 5 | text = ["My favourite british tea is Yorkshire tea", "Coffee is made from beans."] 6 | 7 | # add the texts to a dict called records. 
Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | } 11 | 12 | def part_of_speech_extraction(record): 13 | nlp = spacy.load("en_core_web_sm") 14 | 15 | pos_positions = [] 16 | text_id = 0 17 | for entry in record["text"]: 18 | doc = nlp(entry) 19 | for token in doc: 20 | pos = token.pos_ 21 | if pos: 22 | pos_positions.append({f"text_{text_id}": [pos, token.i, token.i+1]}) 23 | text_id += 1 24 | return {"extractions": pos_positions} 25 | ``` -------------------------------------------------------------------------------- /extractors/words/part_of_speech_extraction/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import spacy 3 | from typing import List, Tuple 4 | 5 | def part_of_speech_extraction(text:str) -> List[Tuple[str,int]]: 6 | """ 7 | @param text: the input text 8 | @return: POS tag positions 9 | """ 10 | nlp = spacy.load("en_core_web_sm") 11 | doc = nlp(text) 12 | 13 | pos_positions = [] 14 | for token in doc: 15 | pos = token.pos_ 16 | if pos: 17 | pos_positions.append((pos, token.i, token.i+1)) 18 | return pos_positions 19 | 20 | # ↑ necessary bricks function 21 | # ----------------------------------------------------------------------------------------- 22 | # ↓ example implementation 23 | 24 | def example_integration(): 25 | texts = ["My favourite british tea is Yorkshire tea", "Coffee is made from beans."] 26 | for text in texts: 27 | found = part_of_speech_extraction(text) 28 | print(f"text: \"{text}\" has -> \"{found}\"") 29 | 30 | example_integration() 31 | ``` -------------------------------------------------------------------------------- /extractors/words/part_of_speech_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | # expects labelling task to have labels ["ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROP", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE"] 3 | ATTRIBUTE: str = "text" # only text attributes 4 | 5 | def part_of_speech_extraction(record): 6 | doc = record[ATTRIBUTE] 7 | for token in doc: 8 | pos = token.pos_ 9 | if pos: 10 | yield pos, token.i, token.i+1 11 | ``` -------------------------------------------------------------------------------- /extractors/words/quote_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module extracts the quotes and dialogues from a given text string using regular expressions. 
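For readers who want to try the quote regex outside refinery, here is a minimal standalone sketch that mirrors the refinery snippet below (the example sentence and the `en_core_web_sm` model are placeholders):

```python
import re
import spacy
from typing import List, Tuple

def quote_extraction(text: str) -> List[Tuple[str, int, int]]:
    """Returns ("quote", token_start, token_end) for every single- or double-quoted passage."""
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    regex = re.compile(r"\".*?\"|\'.*?\'")

    quotes = []
    for match in regex.finditer(text):
        start, end = match.span()
        # map character offsets to token offsets via spaCy
        span = doc.char_span(start, end, alignment_mode="expand")
        quotes.append(("quote", span.start, span.end))
    return quotes

# example usage
print(quote_extraction('He said "I will be there at noon" and hung up.'))
```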
-------------------------------------------------------------------------------- /extractors/words/quote_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | LABEL: str = "quote" 6 | 7 | def quote_extraction(record): 8 | regex = re.compile(r"\".*?\"|\'.*?\'") 9 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string 10 | 11 | for match in regex.finditer(text): 12 | start, end = match.span() 13 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 14 | yield LABEL, span.start, span.end 15 | ``` -------------------------------------------------------------------------------- /extractors/words/smalltalk_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module detects the smalltalk language from a passage of text or chats. -------------------------------------------------------------------------------- /extractors/words/smalltalk_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | from nltk.corpus import stopwords 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | LABEL: str = "smalltalk" 7 | 8 | def smalltalk_extraction(record): 9 | sw = stopwords.words("english") 10 | regex = re.compile(r"\".*?\"") 11 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string. 12 | 13 | for match in regex.finditer(text): 14 | start, end = match.span() 15 | span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand") 16 | text_list_original = span.text.replace('"', '').replace(',', '').split() 17 | new_text = [] 18 | stop_words = [] 19 | for token in text_list_original: 20 | if token not in sw: 21 | new_text.append(token) 22 | else: 23 | stop_words.append(token) 24 | if len(new_text) < 0.5*len(text_list_original) or len(stop_words) < 8: 25 | yield LABEL, span.start, span.end 26 | else: 27 | pass 28 | ``` -------------------------------------------------------------------------------- /extractors/words/substring_extraction/README.md: -------------------------------------------------------------------------------- 1 | Extracts substrings using the build-in `difflib` library. It takes two strings as inputs and looks for common substring occurrences in both the strings depending on the size of minimum pre-defined length of the substring. -------------------------------------------------------------------------------- /extractors/words/substring_extraction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import spacy 3 | 4 | # replace this list with a list containing your data 5 | text = ["Italians eat a lot of pasta, often with tomatoes."] 6 | 7 | # add the texts to a dict called records. 
Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | "label": "substring", 11 | "substring": "Italians eat a lot of pasta" 12 | } 13 | 14 | def substring_extraction(record): 15 | nlp = spacy.load("en_core_web_sm") 16 | substring = record["substring"] 17 | 18 | substring_position = [] 19 | text_id = 0 20 | for entry in record["text"]: 21 | doc = nlp(entry) 22 | 23 | start_index = entry.find(substring) 24 | end_index = start_index + len(substring) 25 | 26 | if start_index != -1: 27 | span = doc.char_span(start_index, end_index, alignment_mode="expand") 28 | substring_position.append({f"text_{text_id}": [record["label"], span.start, span.end]}) 29 | text_id += 1 30 | return {"extractions": substring_position} 31 | ``` -------------------------------------------------------------------------------- /extractors/words/substring_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | SUBSTRING: str = "example" 3 | ATTRIBUTE: str = "text" # only text attributes 4 | LABEL: str = "substring" 5 | 6 | def substring_extraction(record): 7 | """Extracts a common substring between two strings.""" 8 | 9 | string1 = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string. 10 | string2 = SUBSTRING 11 | 12 | start_index = string1.find(string2) 13 | end_index = start_index + len(string2) 14 | 15 | if start_index != -1: 16 | span = record[ATTRIBUTE].char_span(start_index, end_index, alignment_mode="expand") 17 | yield LABEL, span.start, span.end 18 | ``` -------------------------------------------------------------------------------- /extractors/words/synonym_extraction/README.md: -------------------------------------------------------------------------------- 1 | This module extracts synonyms of a specified word using `Wordnet`. It takes a target word as the input and the input text. Then, it matches the target word with each of the words in the text string and based on wordnet, extracts the synonym of the target word from the input text. -------------------------------------------------------------------------------- /extractors/words/verb_phrase_extraction/README.md: -------------------------------------------------------------------------------- 1 | A verb phrase is a syntactic unit composed of at least one verb. this verb can be joined by other chunks, such as noun phrases. Verb phrases are useful for understanding the actions that nouns are involved in. 
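For illustration, a minimal standalone sketch of the AUX/VERB token pattern this module relies on, reusing the same textacy calls as the files below (the example sentence and the `en_core_web_sm` model are placeholders):

```python
import textacy
from typing import List, Tuple

def verb_phrase_extraction(text: str) -> List[Tuple[str, int, int]]:
    """Returns ("verb-action", token_start, token_end) for each auxiliary followed by a verb."""
    patterns = [{"POS": "AUX"}, {"POS": "VERB"}]
    doc = textacy.make_spacy_doc(text, lang="en_core_web_sm")
    matches = textacy.extract.token_matches(doc, patterns=patterns)
    return [("verb-action", chunk.start, chunk.end) for chunk in matches]

# example usage
print(verb_phrase_extraction("We will build a new model which is more accurate than the previous one."))
```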
-------------------------------------------------------------------------------- /extractors/words/verb_phrase_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | import textacy 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "In the next section, we will build a new model which is more accurate than the previous one.", 6 | "spacyTokenizer": "en_core_web_sm", 7 | } 8 | 9 | 10 | class VerbPhraseExtractionModel(BaseModel): 11 | text: str 12 | spacyTokenizer: str = "en_core_web_sm" 13 | 14 | class Config: 15 | schema_extra = {"example": INPUT_EXAMPLE} 16 | 17 | 18 | def verb_phrase_extraction(request: VerbPhraseExtractionModel): 19 | """Extracts the verb phrases from a record""" 20 | 21 | text = request.text 22 | patterns = [{"POS": "AUX"}, {"POS": "VERB"}] 23 | doc = textacy.make_spacy_doc(text, lang=request.spacyTokenizer) 24 | verb_phrase = textacy.extract.token_matches(doc, patterns=patterns) 25 | verb_chunk = [] 26 | for chunk in verb_phrase: 27 | verb_chunk.append(["match", chunk.start, chunk.end]) 28 | 29 | return {"action": verb_chunk} 30 | -------------------------------------------------------------------------------- /extractors/words/verb_phrase_extraction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import textacy 3 | import os 4 | 5 | ATTRIBUTE: str = "text" # only texts allowed 6 | TOKENIZER: str = "en_core_web_sm" 7 | LABEL: str = "verb-action" 8 | 9 | stream = os.popen(f'python -m spacy download {TOKENIZER}') 10 | output = stream.read() 11 | 12 | def verb_phrase_extraction(record): 13 | text = record[ATTRIBUTE].text 14 | patterns = [{"POS": "AUX"}, {"POS": "VERB"}] 15 | about_talk_doc = textacy.make_spacy_doc( 16 | text, lang=TOKENIZER 17 | ) 18 | verb_phrase = textacy.extract.token_matches( 19 | about_talk_doc, patterns=patterns 20 | ) 21 | 22 | for chunk in verb_phrase: 23 | yield LABEL, chunk.start, chunk.end 24 | ``` -------------------------------------------------------------------------------- /extractors/zero_shot/README.md: -------------------------------------------------------------------------------- 1 | We don't support zero-shot configurations as code in refinery yet, but as soon as we do so, you can find zero-shot configurations in here. -------------------------------------------------------------------------------- /generators/README.md: -------------------------------------------------------------------------------- 1 | # Generators 2 | Think of them as functions returning values, i.e. one input = one output, but that output is sequential. -------------------------------------------------------------------------------- /generators/_template/_template_func/README.md: -------------------------------------------------------------------------------- 1 | A brick module should contain a README which describes the use and functionality of a brick. This is also the place where you can provide additional information. You may also include your name here let others know who contributed this brick! 
-------------------------------------------------------------------------------- /generators/_template/_template_func/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/generators/_template/_template_func/__init__.py -------------------------------------------------------------------------------- /generators/_template/_template_func/code_snippet_common.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/generators/_template/_template_func/code_snippet_common.md -------------------------------------------------------------------------------- /generators/distance/euclidean_distance/README.md: -------------------------------------------------------------------------------- 1 | Calculates the euclidean distance between two vectors of text embeddings, which are created by a tf-idf vectorizer. The longer the texts are, the more accurate the results should be. The euclidean distance can be used to determine the similarity between two vectors. Please keep in mind that the optimal distance metric always depends on your use-case. For an alternative use the cosine-similarity. 2 | 3 | The difference between the code in `code_snippet_refinery.md` and `code_snippet_common.md` is, that the common code returns a matrix of all the similarities, while the refinery code returns only individual values based on a base sentence. This is because the refinery code cannot (yet) access all the data at the same time. Please also note that refinery also calculates the cosine similarity between all texts based on the transformer embeddings in the vector database by default. -------------------------------------------------------------------------------- /generators/distance/euclidean_distance/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import numpy as np 3 | from numpy.linalg import norm 4 | from sklearn.feature_extraction.text import TfidfVectorizer 5 | 6 | ATTRIBUTE: str = "text" # only text attributes 7 | SUBJECT_TEXT: str = "Ten amazing facts about the sun" 8 | 9 | def euclidean_distance(record): 10 | text = record[ATTRIBUTE].text # SpaCy document, hence we need to call .text to get the string 11 | 12 | # Transform sentences to a vector 13 | tfidf = TfidfVectorizer() 14 | vects = tfidf.fit_transform([SUBJECT_TEXT.lower(), text.lower()]) 15 | vects = vects.todense() 16 | vect_one, vect_two = np.squeeze(np.asarray(vects[0])), np.squeeze(np.asarray(vects[1])) 17 | 18 | # Return the calculated euclidean distance 19 | return np.linalg.norm(vect_one - vect_two) 20 | ``` -------------------------------------------------------------------------------- /generators/distance/hamming_distance/README.md: -------------------------------------------------------------------------------- 1 | Calculates the hamming distance between vector representations of two strings. The hamming distance is the proportion of disagreeing components in two vectors, 1.0 being very dissimilar and 0.0 meaning completely the same. This modules returns a `float` value for hamming distance. 
2 | 3 | The difference between the code in `code_snippet_refinery.md` and `code_snippet_common.md` is, that the common code returns a matrix of all the similarities, while the refinery code returns only individual values based on a base sentence. This is because the refinery code cannot (yet) access all the data at the same time. Please also note that refinery also calculates the cosine similarity between all texts based on the transformer embeddings in the vector database by default. -------------------------------------------------------------------------------- /generators/distance/hamming_distance/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import numpy as np 3 | from scipy.spatial.distance import hamming 4 | from sklearn.feature_extraction.text import TfidfVectorizer 5 | 6 | ATTRIBUTE: str = "text" # only text attributes 7 | BASE_SENTENCE: str = "This is the base sentence you want to find the distances to." 8 | 9 | def hamming_distance(record): 10 | 11 | text_two = record[ATTRIBUTE].text # SpaCy document, hence we need to call .text to get the string 12 | 13 | tfidf = TfidfVectorizer().fit_transform([BASE_SENTENCE.lower(), text_two.lower()]) 14 | 15 | dense = tfidf.toarray() 16 | vect_one, vect_two = np.squeeze(dense[0]), np.squeeze(dense[1]) 17 | 18 | if vect_one.shape == () or vect_two.shape == (): 19 | pass 20 | 21 | else: 22 | return hamming(vect_one, vect_two) 23 | ``` -------------------------------------------------------------------------------- /generators/distance/levenshtein_distance/README.md: -------------------------------------------------------------------------------- 1 | The Levenshtein distance is a string metric for measuring the difference between two sequences. It is calculated as the minimum number of single-character edits necessary to 2 | transform one string into another. 3 | 4 | The optional weights are for the three operations in the form of a tuple (insertion, deletion, substitution). 5 | 6 | The difference between the code in `code_snippet_refinery.md` and `code_snippet_common.md` is, that the common code returns a matrix of all the similarities, while the refinery code returns only individual values based on a base sentence. This is because the refinery code cannot (yet) access all the data at the same time. Please also note that refinery also calculates the cosine similarity between all texts based on the transformer embeddings in the vector database by default. -------------------------------------------------------------------------------- /generators/distance/levenshtein_distance/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from Levenshtein import distance as levenshtein_distance 3 | 4 | BASE_SENTENCE: str = "This is a base sentence to compare to." 
5 | ATTRIBUTE: str = "headline" # only text attributes 6 | WEIGHT_INSERTION: int = 1 # Optional 7 | WEIGHT_DELETION: int = 1 # Optional 8 | WEIGHT_SUBSTITUTION: int = 1 # Optional 9 | 10 | def levenshtein_distance(record): 11 | from Levenshtein import distance # local import, so the call below is not shadowed by this function's own name 12 | str_01 = BASE_SENTENCE 13 | str_02 = record[ATTRIBUTE].text # SpaCy document, hence we need to call .text to get the string 14 | 15 | weights_tuple = [1, 1, 1] 16 | if WEIGHT_INSERTION is not None: 17 | weights_tuple[0] = WEIGHT_INSERTION 18 | if WEIGHT_DELETION is not None: 19 | weights_tuple[1] = WEIGHT_DELETION 20 | if WEIGHT_SUBSTITUTION is not None: 21 | weights_tuple[2] = WEIGHT_SUBSTITUTION 22 | return distance(str_01, str_02, weights=tuple(weights_tuple)) 23 | ``` -------------------------------------------------------------------------------- /generators/distance/manhattan_distance/README.md: -------------------------------------------------------------------------------- 1 | Calculates the Manhattan distance, also known as the cityblock distance, between two vectors. Can be used as a similarity metric between two texts. Uses TF-IDF to vectorize the text inputs. -------------------------------------------------------------------------------- /generators/distance/manhattan_distance/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import numpy as np 3 | from numpy.linalg import norm 4 | from sklearn.feature_extraction.text import TfidfVectorizer 5 | 6 | ATTRIBUTE: str = "text" # only text attributes 7 | SUBJECT_TEXT: str = "Insert the sentence you want to compare your records to here!" 8 | 9 | def manhattan_distance(record): 10 | text = record[ATTRIBUTE].text # SpaCy document, hence we need to call .text to get the string 11 | 12 | # Transform sentences to a vector 13 | tfidf = TfidfVectorizer() 14 | vects = tfidf.fit_transform([SUBJECT_TEXT.lower(), text.lower()]) 15 | vects = vects.todense() 16 | vect_one, vect_two = np.squeeze(np.asarray(vects[0])), np.squeeze(np.asarray(vects[1])) 17 | 18 | # Return the calculated manhattan distance 19 | return sum(abs(val1-val2) for val1, val2 in zip(vect_one, vect_two)) 20 | ``` -------------------------------------------------------------------------------- /generators/lemmatizer/spacy_lemmatizer/README.md: -------------------------------------------------------------------------------- 1 | Reduces all tokens in a text to their base form with the use of a vocabulary and morphological analysis of the tokens.
Uses a spaCy model, see official documentation here: https://spacy.io/api/lemmatizer -------------------------------------------------------------------------------- /generators/lemmatizer/spacy_lemmatizer/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from extractors.util.spacy import SpacySingleton 3 | 4 | INPUT_EXAMPLE = {"text": "Hello, I am talking about coding at Kern AI!"} 5 | 6 | 7 | class SpacyLemmatizerModel(BaseModel): 8 | text: str 9 | 10 | class Config: 11 | schema_extra = {"example": INPUT_EXAMPLE} 12 | 13 | 14 | def spacy_lemmatizer(req: SpacyLemmatizerModel): 15 | """Converts words in a sentence to there base form.""" 16 | text = req.text 17 | 18 | nlp = SpacySingleton.get_nlp("en_core_web_sm") 19 | doc = nlp(text) 20 | final_text = "" 21 | for i, token in enumerate(doc): 22 | if i > 0: 23 | diff = token.idx - (doc[i - 1].idx + len(doc[i - 1])) 24 | if diff > 0: 25 | final_text += " " * diff 26 | final_text += token.lemma_ 27 | return {"lemmatized_text": final_text} 28 | -------------------------------------------------------------------------------- /generators/lemmatizer/spacy_lemmatizer/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import spacy 3 | 4 | # replace this list with a list containing your data 5 | text = ["Pizza is very delicious.", "Titanic is a movie made by James Cameron", "Apple pie is also very delicious."] 6 | 7 | # add the texts to a dict called records. Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | } 11 | 12 | def spacy_lemmatizer(record): 13 | nlp = spacy.load("en_core_web_sm") 14 | lemmatized_text = [] 15 | for entry in record["text"]: 16 | doc = nlp(entry) 17 | lemmatized_text.append(" ".join([token.lemma_ for token in doc])) 18 | return {"lemmatizedTexts": lemmatized_text} 19 | ``` 20 | -------------------------------------------------------------------------------- /generators/lemmatizer/spacy_lemmatizer/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import spacy 3 | 4 | def spacy_lemmatizer(text: str) -> str: 5 | """ 6 | @param text: base text 7 | @return: All tokens in a text in their base form 8 | """ 9 | nlp = spacy.load("en_core_web_sm") 10 | doc = nlp(text) 11 | final_text = "" 12 | for i, token in enumerate(doc): 13 | if i > 0: 14 | diff = token.idx - (doc[i-1].idx + len(doc[i-1])) 15 | if diff > 0: 16 | final_text+=" "*diff 17 | final_text+=token.lemma_ 18 | return final_text 19 | 20 | # ↑ necessary bricks function 21 | # ----------------------------------------------------------------------------------------- 22 | # ↓ example implementation 23 | 24 | def example_integration(): 25 | texts = ["Pizza is very delicious.", "Titanic is a movie made by James Cameron", "Apple pie is also very delicious."] 26 | for text in texts: 27 | print(f"lemmatized text: \"{text}\" is \"{spacy_lemmatizer(text)}\"") 28 | 29 | example_integration() 30 | ``` 31 | -------------------------------------------------------------------------------- /generators/lemmatizer/spacy_lemmatizer/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | ATTRIBUTE: str = "text" 3 | 4 | def spacy_lemmatizer(record): 5 | final_text = "" 6 | for i, token in enumerate(record[ATTRIBUTE]): 7 | if i > 0: 8 | diff = token.idx - 
(record[ATTRIBUTE][i-1].idx + len(record[ATTRIBUTE][i-1])) 9 | if diff > 0: 10 | final_text+=" "*diff 11 | final_text+=token.lemma_ 12 | return final_text 13 | ``` 14 | -------------------------------------------------------------------------------- /generators/llm/bert_toxicity_detector/README.md: -------------------------------------------------------------------------------- 1 | Uses the `unitary/toxic-bert` model from Hugging Face Hub to classify toxicity in text. Outputs various labels and their respective scores. An API key can be obtained directly from Hugging Face Inference API. Contact us at info@kern.ai if you require an API key or need any support from us. Community contribution by @rasdani. 2 | 3 | Check out Hugging Face Hub for example: https://huggingface.co/unitary/toxic-bert 4 | -------------------------------------------------------------------------------- /generators/llm/bert_toxicity_detector/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | 4 | def bert_toxicity_detector(text: str, api_key: str) -> dict: 5 | headers = {"Authorization": f"Bearer {api_key}"} 6 | response = requests.post("https://api-inference.huggingface.co/models/unitary/toxic-bert", headers=headers, json={"inputs": text}) 7 | json_response = response.json() 8 | result = [ 9 | {item["label"]: item["score"] for item in entry} 10 | for entry in json_response 11 | ] 12 | return result[0] 13 | 14 | # ↑ necessary bricks function 15 | # ----------------------------------------------------------------------------------------- 16 | # ↓ example implementation 17 | 18 | def example_integration(): 19 | api = "" 20 | texts = ["Damn you are a stupid moron!", "The flowers look beautiful today.", "I hate all german people!", "I love you!"] 21 | for text in texts: 22 | print(f"\"{text}\" is {bert_toxicity_detector(text, api)}") 23 | 24 | example_integration() 25 | ``` -------------------------------------------------------------------------------- /generators/llm/bert_toxicity_detector/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | import json 4 | 5 | API_KEY: str = "" 6 | ATTRIBUTE: str = "headline" # only text attributes 7 | 8 | def bert_toxicity_detector(record): 9 | """ 10 | Uses toxic-bert via Hugging Face Inference API to classify toxicity in text. 11 | """ 12 | api_token = API_KEY 13 | inputs = record[ATTRIBUTE].text 14 | headers = {"Authorization": f"Bearer {api_token}"} 15 | response = requests.post("https://api-inference.huggingface.co/models/unitary/toxic-bert", headers=headers, json={"inputs": inputs}) 16 | json_response = response.json() 17 | result = [ 18 | {item["label"]: item["score"] for item in entry} 19 | for entry in json_response 20 | ] 21 | return json.dumps(result[0]) 22 | ``` -------------------------------------------------------------------------------- /generators/llm/gpt_grammar_correction/README.md: -------------------------------------------------------------------------------- 1 | Uses OpenAI's `GPT-3.5-turbo` GPT model to correct a sentence into standard English. An API key can be obtained directly from OpenAI. Contact us at info@kern.ai if you require an API key or need any support from us. 
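The GPT bricks' code is not included in this listing; as a rough sketch only (not the module's actual implementation), a call against OpenAI's chat completions endpoint could look like the following — the prompt wording and the `OPENAI_API_KEY` environment variable are assumptions:

```python
import os
import requests

def gpt_grammar_correction(text: str) -> str:
    """Asks gpt-3.5-turbo to rewrite a sentence in standard English (illustrative sketch)."""
    headers = {"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"}
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You correct sentences into standard English."},
            {"role": "user", "content": text},
        ],
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
    )
    return response.json()["choices"][0]["message"]["content"]

# example usage (requires a valid OPENAI_API_KEY environment variable)
# print(gpt_grammar_correction("she no went to the market yesterday"))
```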
-------------------------------------------------------------------------------- /generators/llm/gpt_tldr_summarization/README.md: -------------------------------------------------------------------------------- 1 | Uses OpenAI's `GPT-3.5-turbo` GPT model to summarise a given text in a few lines. An API key can be obtained directly from OpenAI. Contact us at info@kern.ai if you require an API key or need any support from us. -------------------------------------------------------------------------------- /generators/ngram/nltk_ngram_generator/README.md: -------------------------------------------------------------------------------- 1 | 2 | This nltk-based n-gram generator allows to generate word n-grams from a given input sentence. Utilizes the spaCy tokenizer for tokenization. An n-gram is a contiguous sequence of 'n' words from a text. For example, in the sentence "The quick brown fox", the 2-grams (bigrams) are "The quick" and "quick brown", and "brown fox". 3 | Users can specify the size of the n-grams, ranging from bigrams (2-grams) to n-grams of any size, as well as spaCy language model (default is en_core_web_sm). 4 | More about spaCy: https://spacy.io/usage/models 5 | More about nltk: https://www.nltk.org/index.html 6 | 7 | -------------------------------------------------------------------------------- /generators/ngram/nltk_ngram_generator/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import spacy 3 | from nltk.util import ngrams 4 | 5 | # replace this list with a list containing your data 6 | text = ["Despite the unpredictable weather, the enthusiastic crowd gathered at the park for the annual summer festival, eagerly anticipating an evening filled with music, food, and vibrant celebrations."] 7 | 8 | # add the texts to a dict called records. Add further information as key-value pairs if needed 9 | record = { 10 | "sentence": text, 11 | "ngram_size": 2 12 | } 13 | 14 | def nltk_ngram_generator(record): 15 | nlp = spacy.load("en_core_web_sm") 16 | doc = nlp(record.sentence) 17 | tokens =[token.text for token in doc] 18 | n_grams = list(ngrams(tokens, record["ngram_size"])) 19 | 20 | return {"n_grams": n_grams} 21 | ``` 22 | 23 | 24 | -------------------------------------------------------------------------------- /generators/ngram/nltk_ngram_generator/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from nltk.util import ngrams 3 | ATTRIBUTE: str = "text" # only text fields 4 | NGRAM_SIZE: int = 2 5 | 6 | def nltk_ngram_generator(record): 7 | 8 | tokens = [token.text for token in record[ATTRIBUTE]] 9 | n_grams = list(ngrams(tokens, NGRAM_SIZE)) 10 | n_grams_str = str(n_grams).strip('[]') 11 | 12 | return n_grams_str 13 | ``` 14 | -------------------------------------------------------------------------------- /generators/paths/domain_parser/README.md: -------------------------------------------------------------------------------- 1 | Extracts the root domain from a given URL. You can choose to keep or cut the subdomain in front of the root domain, too. 2 | The root domain is the highest hierarchical for the website. It includes the domain name and the top-level-domain. 
-------------------------------------------------------------------------------- /generators/paths/domain_parser/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from urllib.parse import urlsplit 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "https://huggingface.co/sentence-transformers", 6 | } 7 | 8 | class DomainParserModel(BaseModel): 9 | text: str 10 | 11 | class Config: 12 | schema_extra = {"example": INPUT_EXAMPLE} 13 | 14 | 15 | def domain_parser(request: DomainParserModel): 16 | """Parses the domain of a URL.""" 17 | link = request.text 18 | if "http" in link: 19 | parser = urlsplit(link) 20 | domain = parser.netloc 21 | else: 22 | part = link.strip('/').split('/') 23 | domain = part[0] 24 | if domain.startswith("www."): 25 | domain = domain[len("www."):] # str.lstrip would strip characters, not the "www." prefix 26 | return domain 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /generators/paths/domain_parser/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from urllib.parse import urlsplit 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | 6 | def domain_parser(record): 7 | link = record[ATTRIBUTE].text 8 | if "http" in link: 9 | parser = urlsplit(link) 10 | domain = parser.netloc 11 | else: 12 | part = link.strip('/').split('/') 13 | domain = part[0] 14 | if domain.startswith("www."): 15 | domain = domain[len("www."):] # str.lstrip would strip characters, not the "www." prefix 16 | return domain 17 | 18 | ``` 19 | -------------------------------------------------------------------------------- /generators/paths/url_keyword_parser/README.md: -------------------------------------------------------------------------------- 1 | Extracts keywords from URLs using regex and NLTK. There are multiple configurations you can set. INCLUDE_DOMAIN tries to extract keywords from the domain (e.g. "google.com") as well. If INCLUDE_PARAMETER is set to true, we do the same with all remaining parts of the URL (params, query & fragment of `urlparse`). CHECK_VALID_URL allows you to check whether a URL is valid. With REMOVE_NONE_ENGLISH and REMOVE_STOPWORDS you can opt to remove non-English words or stop words provided by NLTK. Setting REMOVE_HEX_LIKE to true removes any hex-like numbers in the URL. With TEXT_SEPERATOR you can set the separator for the final text, and SPLIT_REGEX allows you to add any regex for splitting the URL. If you want to whitelist words, you can do so by adding them to WORD_WHITE_LIST. -------------------------------------------------------------------------------- /generators/reference_chunking/newline_splitter/README.md: -------------------------------------------------------------------------------- 1 | This brick splits a text into smaller pieces by newline characters. It can be used for the 'embedding list' attribute in refinery to increase the accuracy of similarity search. -------------------------------------------------------------------------------- /generators/reference_chunking/newline_splitter/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | INPUT_EXAMPLE = { 4 | "text": """This is the first line. 5 | And this is the second line. 6 | Here's a third one, too.
7 | """, 8 | } 9 | 10 | class NewlineSplitterModel(BaseModel): 11 | text: str 12 | 13 | class Config: 14 | schema_extra = {"example": INPUT_EXAMPLE} 15 | 16 | 17 | def newline_splitter(req: NewlineSplitterModel): 18 | """Splits a text by newline characters""" 19 | splits = [t.strip() for t in req.text.split("\n")] 20 | return {"splitted_text" : [val for val in splits if len(val) > 0]} 21 | 22 | 23 | -------------------------------------------------------------------------------- /generators/reference_chunking/newline_splitter/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from typing import List 3 | 4 | def newline_splitter(text: str) -> List[str]: 5 | """ 6 | @param text: The input string that needs to be split. 7 | @return: A list of strings where each string is a non-empty line from the input. 8 | """ 9 | splits = [t.strip() for t in text.split("\n")] 10 | return [val for val in splits if len(val) > 0] 11 | 12 | # ↑ necessary bricks function 13 | # ----------------------------------------------------------------------------------------- 14 | # ↓ example implementation 15 | 16 | def example_integration(): 17 | texts = [""" 18 | This is a sentences. 19 | This too, but in another line 20 | """, "This is a sentence\nwith a newline literal!"] 21 | for text in texts: 22 | print(f"The text {repr(text)} was split into {newline_splitter(text)}") 23 | 24 | example_integration() 25 | ``` -------------------------------------------------------------------------------- /generators/reference_chunking/newline_splitter/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | ATTRIBUTE: str = "text" # only text attributes 3 | 4 | def newline_splitter(record): 5 | splits = [t.strip() for t in record[ATTRIBUTE].text.split("\n")] 6 | return [val for val in splits if len(val) > 0] 7 | ``` -------------------------------------------------------------------------------- /generators/reference_chunking/noun_splitter/README.md: -------------------------------------------------------------------------------- 1 | Uses spaCy to find nouns in a texts and returns them in a list. In RAG use-cases, it can be a successful strategy to conduct a similarity search only on the nouns. This can be done in refinery by using this brick to create an embedding list. 
-------------------------------------------------------------------------------- /generators/reference_chunking/noun_splitter/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from extractors.util.spacy import SpacySingleton 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "My favorite noun is 'friend'.", 6 | "spacy_model": "en_core_web_sm", 7 | } 8 | 9 | 10 | class NounSplitterModel(BaseModel): 11 | text: str 12 | spacy_model: str 13 | 14 | class Config: 15 | schema_extra = {"example": INPUT_EXAMPLE} 16 | 17 | 18 | def noun_splitter(req: NounSplitterModel): 19 | """Creates embedding chunks based on the nouns in a text""" 20 | nlp = SpacySingleton.get_nlp(req.spacy_model) 21 | doc = nlp(req.text) 22 | 23 | nouns_sents = set() 24 | for sent in doc.sents: 25 | for token in sent: 26 | if token.pos_ == "NOUN" and len(token.text) > 1: 27 | nouns_sents.add(token.text) 28 | 29 | return {"nouns": list(nouns_sents)} 30 | -------------------------------------------------------------------------------- /generators/reference_chunking/noun_splitter/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | ATTRIBUTE: str = "text" # only text attributes 3 | 4 | def noun_splitter(record): 5 | nouns_sents = set() 6 | for sent in record[ATTRIBUTE].sents: 7 | for token in sent: 8 | if token.pos_ == "NOUN" and len(token.text) > 1: 9 | nouns_sents.add(token.text) 10 | 11 | return list(nouns_sents) 12 | ``` 13 | -------------------------------------------------------------------------------- /generators/search/bing_news_search/README.md: -------------------------------------------------------------------------------- 1 | Uses Bing to retrieve search results for news articles. Requires an API key, which can be obtained from Azures Bing Resources. In refinery, set RESPONSE_SIZE to "full" to get the full response as a JSON (stored as string) or choose "compact" to only get the text of the first result. Please contact the Kern AI Team if you need further support at info@kern.ai or join our Discord channel: https://discord.gg/WAnAgQEv -------------------------------------------------------------------------------- /generators/search/bing_search/README.md: -------------------------------------------------------------------------------- 1 | Uses Bing to retrieve search results. Requires an API key, which can be obtained from Azures Bing Resources. In refinery, set RESPONSE_SIZE to "full" to get the full response as a JSON (stored as string) or choose "compact" to only get the text of the first result. Please contact the Kern AI Team if you need further support at info@kern.ai or join our Discord channel: https://discord.gg/WAnAgQEv -------------------------------------------------------------------------------- /generators/search/google_search/README.md: -------------------------------------------------------------------------------- 1 | **Only works with refinery version > 1.7.1** 2 | 3 | Uses Google engine to retrieve search results. This module requires an API key, which can be obtained at https://serpapi.com/. In refinery, set RESPONSE_SIZE to "full" to get the full response as a JSON (stored as string) or choose "compact" to only get the text of the first result. 
Please contact the Kern AI Team if you need further support at info@kern.ai or join our Discord channel: https://discord.gg/WAnAgQEv -------------------------------------------------------------------------------- /generators/search/google_search/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from serpapi import GoogleSearch 3 | import json 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | LOCATION: str = "Germany" 7 | LANGUAGE: str = "en" 8 | GEOLOCATION: str = "de" 9 | API_KEY: str = "" 10 | RESPONSE_SIZE: str = "full" # choose "compact" to only get text snippet of the first result 11 | 12 | def google_search(record): 13 | """Uses Google search to retrieve search results, given the parameters.""" 14 | params = { 15 | "q": record[ATTRIBUTE].text, 16 | "location": LOCATION, 17 | "hl": LANGUAGE, 18 | "gl": GEOLOCATION, 19 | "google_domain": f"google.{GEOLOCATION}", 20 | "api_key": API_KEY, 21 | } 22 | search = GoogleSearch(params) 23 | results = search.get_dict() 24 | 25 | if RESPONSE_SIZE == "full": 26 | return json.dumps(results) # returns full response 27 | elif RESPONSE_SIZE == "compact": 28 | return results["organic_results"][0]["snippet"] # only returns text of first response 29 | ``` 30 | -------------------------------------------------------------------------------- /generators/search/nyt_news_search/README.md: -------------------------------------------------------------------------------- 1 | Searches through news articles of the New York Times. The API key is available for free here: https://developer.nytimes.com/ -------------------------------------------------------------------------------- /generators/search/nyt_news_search/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | import requests 3 | 4 | INPUT_EXAMPLE = { 5 | "query": "US election 2020.", 6 | "apiKey": "", 7 | } 8 | 9 | 10 | class NytNewsSearchModel(BaseModel): 11 | query: str 12 | apiKey: str 13 | 14 | class Config: 15 | schema_extra = {"example": INPUT_EXAMPLE} 16 | 17 | 18 | def nyt_news_search(req: NytNewsSearchModel): 19 | """Search for New York Times news articles.""" 20 | query = req.query 21 | key = req.apiKey 22 | 23 | req = requests.get( 24 | f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={query}&api-key={key}" 25 | ) 26 | search_results = req.json() 27 | 28 | response_snippet = search_results["response"]["docs"][0]["snippet"] 29 | response_url = search_results["response"]["docs"][0]["web_url"] 30 | 31 | return {"response_text": response_snippet, "url": response_url} 32 | -------------------------------------------------------------------------------- /generators/search/nyt_news_search/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | import json 4 | 5 | # replace this list with a list containing your data 6 | text = ["US election 2020."] 7 | 8 | # add the texts to a dict called records. 
Add further information as key-value pairs if needed 9 | record = { 10 | "text": text, 11 | "nyt_api_key": "paste your NYT API key here", # go here for free API key https://developer.nytimes.com/ 12 | "output_size": "full", # choose "compact" to only get the text of the first result 13 | } 14 | 15 | def nyt_news_search(record): 16 | search_results = [] 17 | for entry in record["text"]: 18 | req = requests.get(f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={entry}&api-key={record["nyt_api_key"]}") 19 | search_results = req.json() 20 | 21 | if record["output_size"] == "full": 22 | search_results.append(json.dumps(search_results)) 23 | elif record["output_size"] == "compact": 24 | search_results.append(search_results["response"]["docs"][0]["snippet"]) 25 | return {"nytResults": search_results} 26 | ``` -------------------------------------------------------------------------------- /generators/search/nyt_news_search/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | import json 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | API_KEY: str = "" # go here for free API key https://developer.nytimes.com/ 7 | OUTPUT_SIZE: str = "full" # choose "compact" to only get the text of the first result 8 | 9 | def nyt_news_search(record): 10 | query = record[ATTRIBUTE] 11 | key = API_KEY 12 | 13 | req = requests.get(f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={query}&api-key={key}") 14 | search_results = req.json() 15 | 16 | if OUTPUT_SIZE == "full": 17 | return json.dumps(search_results) 18 | elif OUTPUT_SIZE == "compact": 19 | return search_results["response"]["docs"][0]["snippet"] 20 | ``` -------------------------------------------------------------------------------- /generators/sentiment/vader_sentiment_scores/README.md: -------------------------------------------------------------------------------- 1 | VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool. It's specifically tuned to get the sentiment of social-media post, but also works on other text-types as well. This version returns a score dictionary as a string. See the generators section for "vader sentiment classifier" to get a brick that only returns the sentiment classes. 
Learn more here: https://github.com/cjhutto/vaderSentiment -------------------------------------------------------------------------------- /generators/sentiment/vader_sentiment_scores/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "World peace announced by the United Nations.", 6 | } 7 | 8 | 9 | class VaderSentimentScoresModel(BaseModel): 10 | text: str 11 | 12 | class Config: 13 | schema_extra = {"example": INPUT_EXAMPLE} 14 | 15 | 16 | def vader_sentiment_scores(req): 17 | """Get the sentiment of a text using the VADER algorithm.""" 18 | analyzer = SentimentIntensityAnalyzer() 19 | text = req.text 20 | 21 | vs = analyzer.polarity_scores(text) 22 | return {"sentiment_scores": vs} 23 | -------------------------------------------------------------------------------- /generators/sentiment/vader_sentiment_scores/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 3 | 4 | # replace this list with a list containing your data 5 | text = ["I hate this.", "Meh it's ok.", "I love this!"] 6 | 7 | # add the texts to a dict called records. Add further information as key-value pairs if needed 8 | record = { 9 | "your_text": text, 10 | } 11 | 12 | def vader_sentiment_scores(record: dict) -> dict: 13 | analyzer = SentimentIntensityAnalyzer() 14 | 15 | sentiment = [] 16 | for entry in record["your_text"]: 17 | vs = analyzer.polarity_scores(entry) 18 | sentiment.append(vs) 19 | return sentiment 20 | ``` -------------------------------------------------------------------------------- /generators/sentiment/vader_sentiment_scores/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from typing import Dict 3 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 4 | 5 | def vader_sentiment_scores(text: str) -> Dict[str, float]: 6 | """ Vader sentiment scores for text 7 | @param text: The text to analyze.results. 
8 | @return: Dict of scores (neg, neu, pos, compound) 9 | """ 10 | analyzer = SentimentIntensityAnalyzer() 11 | return analyzer.polarity_scores(text) 12 | 13 | # ↑ necessary bricks function 14 | # ----------------------------------------------------------------------------------------- 15 | # ↓ example implementation 16 | 17 | def example_integration(): 18 | texts = ["I hate this.", "Meh it's ok.", "I love this!"] 19 | for text in texts: 20 | print(f"The sentiment sores of {text} are: {vader_sentiment_scores(text)}") 21 | example_integration() 22 | ``` -------------------------------------------------------------------------------- /generators/sentiment/vader_sentiment_scores/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 3 | import json 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | 7 | def vader_sentiment_scores(record): 8 | analyzer = SentimentIntensityAnalyzer() 9 | return json.dumps(analyzer.polarity_scores(record[ATTRIBUTE].text)) 10 | ``` -------------------------------------------------------------------------------- /generators/speech_to_text/azure_speech_to_text/README.md: -------------------------------------------------------------------------------- 1 | Important: Currently, it's not possible to store audiofiles directly in Kern AI refinery. How this brick works is by accessing .wav files that are stored remotely, for example is an S3 bucket, a Google Drive and so on. All you need to provide is a valid link to access the file. As this module uses an Azure service, you would also need an active Azure subscription as well as an API key to Azures Cognitive Services. For the region parameter, please provide the region name that your Azure resource is deployed in (for example "northeurope" or "westus"). See [here](https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes) for a full list of all available languages. Please contact the Kern AI Team if you need and API key or further support at info@kern.ai or join our Discord channel: https://discord.gg/WAnAgQEv -------------------------------------------------------------------------------- /generators/spelling/bing_spelling_correction/README.md: -------------------------------------------------------------------------------- 1 | Uses Bing to correct the spelling of sentences. Returns a corrected sentence. See here for a full list of all available languages. Requires an API key, which can be obtained from Azures Bing Resources. Please contact the Kern AI Team if you need further support at info@kern.ai or join our Discord channel: https://discord.gg/WAnAgQEv -------------------------------------------------------------------------------- /generators/spelling/textblob_spelling_correction/README.md: -------------------------------------------------------------------------------- 1 | This module checks for spelling errors in a text and returns the corrected text using the TextBlob library. The spellchecker itself is based on the article "How to Write a Spelling Corrector" by Peter Norvig http://norvig.com/spell-correct.html. 
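The brick's implementation and snippets follow below and simply run `TextBlob.correct()` over the whole text. As a small aside (not part of the repository), TextBlob can also report correction candidates per word via `Word.spellcheck()`, which is useful for inspecting what the Norvig-style corrector proposes:

```python
from textblob import TextBlob, Word

def inspect_spelling(text: str) -> None:
    """Prints the corrected text plus per-word correction candidates with confidences."""
    print("corrected:", str(TextBlob(text).correct()))
    for word in text.split():
        candidates = Word(word.strip(".,!?")).spellcheck()  # list of (suggestion, confidence) tuples
        print(f"{word}: {candidates[:3]}")

inspect_spelling("His text contaisn some speling errors.")
```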
-------------------------------------------------------------------------------- /generators/spelling/textblob_spelling_correction/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from textblob import TextBlob 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "His text contaisn some speling errors.", 6 | } 7 | 8 | class TextblobSpellingCorrectionModel(BaseModel): 9 | text: str 10 | 11 | class Config: 12 | schema_extra = { 13 | "example": INPUT_EXAMPLE, 14 | } 15 | 16 | def textblob_spelling_correction(request: TextblobSpellingCorrectionModel): 17 | """Correct spelling mistakes in a text using the TextBlob library.""" 18 | 19 | text = request.text 20 | textblob_text = TextBlob(text) 21 | 22 | return {"correctedText": str(textblob_text.correct())} 23 | -------------------------------------------------------------------------------- /generators/spelling/textblob_spelling_correction/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from textblob import TextBlob 3 | 4 | # replace this list with a list containing your data 5 | text = ["His text contaisn some speling errors."] 6 | 7 | # add the texts to a dict called records. Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | } 11 | 12 | def textblob_spelling_correction(record): 13 | corrected_texts = [] 14 | for entry in record["text"]: 15 | textblob_text = TextBlob(entry) 16 | corrected_texts.append(str(textblob_text.correct())) 17 | return {"correctedText": corrected_texts} 18 | ``` -------------------------------------------------------------------------------- /generators/spelling/textblob_spelling_correction/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from textblob import TextBlob 3 | 4 | def textblob_spelling_correction(text: str) -> str: 5 | """ 6 | @param text: text to correct 7 | @return: corrected text 8 | """ 9 | textblob_text = TextBlob(text) 10 | return str(textblob_text.correct()) 11 | 12 | # ↑ necessary bricks stuff 13 | # ----------------------------------------------------------------------------------------- 14 | # ↓ example implementation 15 | 16 | def example_integration(): 17 | texts = ["This text contaisn some speling errors."] 18 | 19 | for text in texts: 20 | print(f"the corrected version of \"{text}\" is: {textblob_spelling_correction(text)}") 21 | example_integration() 22 | 23 | ``` -------------------------------------------------------------------------------- /generators/spelling/textblob_spelling_correction/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from textblob import TextBlob 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | 6 | def textblob_spelling_correction(record): 7 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string. 8 | textblob_text = TextBlob(text) 9 | return str(textblob_text.correct()) 10 | ``` -------------------------------------------------------------------------------- /generators/summarization/smalltalk_truncation/README.md: -------------------------------------------------------------------------------- 1 | This module removes all the unnecessary and irrelevant information form chats or dialogues to the best of its accuracy. 
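The refinery snippet below shows the underlying heuristic: the attribute is expected to contain chat messages wrapped in double quotes, and a message is kept only if it has at least three tokens and is either mostly non-stopwords or unusually long. A standalone sketch of the same idea on a plain string (an illustration only, assuming the NLTK stopword corpus has been downloaded) could look like this:

```python
import re
from nltk.corpus import stopwords  # requires the corpus: nltk.download("stopwords")

def smalltalk_truncation(text: str) -> str:
    """Keeps only the quoted chat messages that look content-heavy."""
    sw = set(stopwords.words("english"))
    kept = []
    for message in re.findall(r"\".*?\"", text):
        tokens = message.replace('"', "").split()
        content = [t for t in tokens if t not in sw]
        filler = [t for t in tokens if t in sw]
        # keep messages that are mostly content words (or very long) and not tiny
        if (len(content) > 0.5 * len(tokens) or len(filler) > 8) and len(tokens) >= 3:
            kept.append(" ".join(tokens))
    return " ".join(kept)

def example_integration():
    chat = '"hi" "hello there" "Could you send me the updated invoice for order 4521 by tomorrow?"'
    print(smalltalk_truncation(chat))

example_integration()
```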
-------------------------------------------------------------------------------- /generators/summarization/smalltalk_truncation/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import re 3 | from nltk.corpus import stopwords 4 | 5 | ATTRIBUTE: str = "text" #only text attributes 6 | 7 | def smalltalk_truncation(record): 8 | sw = stopwords.words("english") 9 | regex = re.compile(r"\".*?\"") 10 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get string. 11 | 12 | removed_smalltalk = [] 13 | for message in regex.findall(text): 14 | chat = message.replace('"','') 15 | chat = chat.split() 16 | new_text = [] 17 | stop_words = [] 18 | for token in chat: 19 | if token not in sw: 20 | new_text.append(token) 21 | else: 22 | stop_words.append(token) 23 | if (len(new_text) > 0.5*len(chat) or len(stop_words) > 8) and not len(chat) < 3: 24 | removed_smalltalk.append(" ".join(chat)) 25 | 26 | return " ".join(removed_smalltalk) 27 | ``` -------------------------------------------------------------------------------- /generators/summarization/sumy_website_summarizer/README.md: -------------------------------------------------------------------------------- 1 | This brick module is using the sumy library from GitHub to summarize texts from websites like wikipedia or blog articles. The articles are scraped via sumy as well, so an internet connection is needed. The function excepts an url to a website and not a text itself. It uses Latent Semantic Analysis to extract structures of the words and texts used to find relevant sentences. You can read more about the sumy libraray here: https://github.com/miso-belica/sumy -------------------------------------------------------------------------------- /generators/summarization/sumy_website_summarizer/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from sumy.parsers.html import HtmlParser 3 | from sumy.nlp.tokenizers import Tokenizer 4 | from sumy.summarizers.lsa import LsaSummarizer as Summarizer 5 | from sumy.nlp.stemmers import Stemmer 6 | from sumy.utils import get_stop_words 7 | 8 | ATTRIBUTE: str = "url" 9 | LANGUAGE: str = "english" 10 | SENTENCE_COUNT: int = 5 11 | 12 | def sumy_website_summarizer(record): 13 | parser = HtmlParser.from_url(record[ATTRIBUTE], Tokenizer(LANGUAGE)) 14 | stemmer = Stemmer(LANGUAGE) 15 | summarizer = Summarizer(stemmer) 16 | summarizer.stop_words = get_stop_words(LANGUAGE) 17 | summary = summarizer(parser.document, SENTENCE_COUNT) 18 | return " ".join([str(sentence) for sentence in summary]) 19 | ``` -------------------------------------------------------------------------------- /generators/summarization/text_summarization/README.md: -------------------------------------------------------------------------------- 1 | This module is useful if the text is too huge to read. It provides a short summary for the input text. The module uses spacy in order to tokenize the sentences and calculate scores based on the frequency of the tokenized words. The basic idea is that the sentences containing words that have the highest frequency score are assumed to be important. The summary is constructed out of the most important sentences. 
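This excerpt contains no code snippet for the brick, so here is a rough sketch of the frequency-scoring idea the README describes (an illustration, not the shipped implementation), assuming `en_core_web_sm` is installed: each sentence is scored by the normalized frequency of its non-stopword tokens, and the highest-scoring sentences form the summary.

```python
from collections import Counter
import spacy

# Assumption: en_core_web_sm is installed (python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")

def text_summarization(text: str, sentence_count: int = 2) -> str:
    """Scores sentences by the frequency of their non-stopword tokens and keeps the top ones."""
    doc = nlp(text)
    words = [t.text.lower() for t in doc if not (t.is_stop or t.is_punct or t.is_space)]
    freq = Counter(words)
    if not freq:
        return text
    max_freq = max(freq.values())
    scores = {
        sent: sum(freq.get(t.text.lower(), 0) / max_freq for t in sent)
        for sent in doc.sents
    }
    top = sorted(scores, key=scores.get, reverse=True)[:sentence_count]
    top = sorted(top, key=lambda s: s.start)  # restore document order
    return " ".join(s.text.strip() for s in top)

def example_integration():
    text = (
        "Apples are grown all over the world. The apple harvest starts in late summer. "
        "Many regions celebrate the harvest with local festivals. Apples also store well over winter."
    )
    print(text_summarization(text))

example_integration()
```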
-------------------------------------------------------------------------------- /generators/text_analytics/most_frequent_words/README.md: -------------------------------------------------------------------------------- 1 | This module tokenizes the text and generates the most frequent words. This can be useful where the text document is too large and you want to know if it interests you by looking at the most frequent words. -------------------------------------------------------------------------------- /generators/text_analytics/most_frequent_words/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from collections import Counter 3 | import spacy 4 | 5 | # replace this list with a list containing your data 6 | text = ["APPL went down by 5% in the past two weeks. Shareholders are concerned over the continued recession since APPL and NASDAQ have been hit hard by this recession. Risks pertaining to short-selling are pouring in as APPL continues to depreciate. If the competitors come together and start short-selling, the stock can face calamity."] 7 | 8 | # add the texts to a dict called records. Add further information as key-value pairs if needed 9 | record = { 10 | "text": text, 11 | "n_words": 5, 12 | } 13 | 14 | def most_frequent_words(record): 15 | nlp = spacy.load("en_core_web_sm") 16 | frequent_words = [] 17 | for entry in record["text"]: 18 | doc = nlp(entry) 19 | words = [token.text for token in doc if not token.is_stop and not token.is_punct] 20 | frequent_words.append(str(Counter(words).most_common(record["n_words"])).strip("[]")) 21 | return {"words": frequent_words} 22 | ``` 23 | -------------------------------------------------------------------------------- /generators/text_analytics/most_frequent_words/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from collections import Counter 3 | 4 | ATTRIBUTE: str = "text" # only text fields 5 | N_WORDS: int = 5 6 | 7 | def most_frequent_words(record): 8 | 9 | words = [token.text for token in record[ATTRIBUTE] if not token.is_stop and not token.is_punct] 10 | return str(Counter(words).most_common(N_WORDS)).strip("[]") 11 | ``` 12 | -------------------------------------------------------------------------------- /generators/text_analytics/phonetic_soundex/README.md: -------------------------------------------------------------------------------- 1 | SoundEx algorithm that converts words, for example names, into phonetic representations. -------------------------------------------------------------------------------- /generators/text_analytics/reading_time/README.md: -------------------------------------------------------------------------------- 1 | Calculates the reading time of a text, based on following paper: https://homepages.inf.ed.ac.uk/keller/papers/cognition08a.pdf. 
-------------------------------------------------------------------------------- /generators/text_analytics/reading_time/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | import textstat 3 | 4 | INPUT_EXAMPLE = {"text": "This sentence should take less than 1 second to read."} 5 | 6 | 7 | class ReadingTimeModel(BaseModel): 8 | text: str 9 | 10 | class Config: 11 | schema_extra = {"example": INPUT_EXAMPLE} 12 | 13 | 14 | def reading_time(request: ReadingTimeModel): 15 | """Calculate the reading time of a text.""" 16 | text = request.text 17 | time_to_read = textstat.reading_time(text, ms_per_char=14.69) 18 | return {"readingTime": time_to_read} 19 | -------------------------------------------------------------------------------- /generators/text_analytics/reading_time/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import textstat 3 | 4 | # replace this list with a list containing your data 5 | text = ["Pizza is very delicious.", "Titanic is a movie made by James Cameron", "Apple pie is also very delicious."] 6 | 7 | # add the texts to a dict called records. Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | } 11 | 12 | def reading_time(record): 13 | time_list = [] 14 | for entry in record["text"]: 15 | time_list.append(textstat.reading_time(entry, ms_per_char=14.69)) 16 | return {"readingTimes": time_list} 17 | ``` -------------------------------------------------------------------------------- /generators/text_analytics/reading_time/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import textstat 3 | 4 | def reading_time(text:str)->float: 5 | """ 6 | @param text: text we check the reading time for 7 | @return: reading time in seconds 8 | """ 9 | return textstat.reading_time(text, ms_per_char=14.69) 10 | 11 | # ↑ necessary bricks stuff 12 | # ----------------------------------------------------------------------------------------- 13 | # ↓ example implementation 14 | 15 | def example_integration(): 16 | texts = ["Pizza is very delicious.", "Titanic is a movie made by James Cameron", "Apple pie is also very delicious."] 17 | 18 | for text in texts: 19 | print(f"the text \"{text}\" will take around {reading_time(text)} sec") 20 | example_integration() 21 | ``` -------------------------------------------------------------------------------- /generators/text_analytics/reading_time/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import textstat 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | 6 | def reading_time(record): 7 | text = record[ATTRIBUTE].text # SpaCy document, hence we need to call .text to get the string 8 | return textstat.reading_time(text, ms_per_char=14.69) 9 | ``` -------------------------------------------------------------------------------- /generators/text_analytics/syllable_count/README.md: -------------------------------------------------------------------------------- 1 | This module uses `textstat` package to count the number of syllables from a given text. For example, `"There is no one in the hospital"` will output 9, since there are 9 syllables in the whole sentence. 
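The brick's code follows below. To make the README's worked example concrete, `textstat.syllable_count` can also be applied word by word to see where the total of 9 should come from (a small illustration, not part of the brick):

```python
import textstat

text = "There is no one in the hospital"
breakdown = {word: textstat.syllable_count(word) for word in text.split()}
print(breakdown)                                # per-word syllable counts
print("total:", textstat.syllable_count(text))  # expected to add up to 9
```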
-------------------------------------------------------------------------------- /generators/text_analytics/syllable_count/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | import textstat 3 | 4 | INPUT_EXAMPLE = {"text": "This sentence has 7 syllables."} 5 | 6 | 7 | class SyllableCountModel(BaseModel): 8 | text: str 9 | 10 | class Config: 11 | schema_extra = {"example": INPUT_EXAMPLE} 12 | 13 | 14 | def syllable_count(request: SyllableCountModel): 15 | """Counts the number of sylabbles in a text.""" 16 | text = request.text 17 | syllables = textstat.syllable_count(text) 18 | return {"syllableCount": syllables} 19 | -------------------------------------------------------------------------------- /generators/text_analytics/syllable_count/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import textstat 3 | 4 | # replace this list with a list containing your data 5 | text = ["Pizza is very delicious.", "Titanic is a movie made by James Cameron", "Apple pie is also very delicious."] 6 | 7 | # add the texts to a dict called records. Add further information as key-value pairs if needed 8 | record = { 9 | "text": text, 10 | } 11 | 12 | def syllable_count(record): 13 | syllable_list = [] 14 | for entry in record["text"]: 15 | syllable_list.append(textstat.syllable_count(entry)) 16 | return {"syllables": syllable_list} 17 | ``` -------------------------------------------------------------------------------- /generators/text_analytics/syllable_count/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import textstat 3 | 4 | def syllable_count(text: str) -> int: 5 | return textstat.syllable_count(text) 6 | 7 | # ↑ necessary bricks stuff 8 | # ----------------------------------------------------------------------------------------- 9 | # ↓ example implementation 10 | 11 | def example_integration(): 12 | texts = ["Pizza is very delicious.", "Titanic is a movie made by James Cameron", "Apple pie is also very delicious."] 13 | for text in texts: 14 | print(f"the text \"{text}\" has {syllable_count(text)} syllables") 15 | example_integration() 16 | ``` -------------------------------------------------------------------------------- /generators/text_analytics/syllable_count/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import textstat 3 | 4 | ATTRIBUTE: str = "text" 5 | 6 | def syllable_count(record): 7 | text = record[ATTRIBUTE].text # SpaCy document, hence we need to call .text to get the string 8 | num_syllables = textstat.syllable_count(text) 9 | return num_syllables 10 | ``` -------------------------------------------------------------------------------- /generators/text_analytics/tiktoken_token_counter/README.md: -------------------------------------------------------------------------------- 1 | Uses OpenAI's tiktoken tokenizer library to count the amount of tokens in a string. The tokenizer is used for the GPT models and converts words into integers. The conversion is reversible and lossless, meaning that a tokenized sentence can be converted back. This brick returns the amount of tokens in a given text. 
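The brick below only returns the token count. To illustrate the README's point that the encoding is reversible and lossless, a token sequence can be decoded back into the original string (a small illustration, not part of the brick):

```python
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")  # same encoding the brick uses

text = "What a beautiful day to count tokens."
tokens = encoding.encode(text)
print(len(tokens), tokens)               # token count plus the raw token ids
print(encoding.decode(tokens) == text)   # True: the round trip is lossless
```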
-------------------------------------------------------------------------------- /generators/text_analytics/tiktoken_token_counter/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | import tiktoken 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "What a beautiful day to count tokens." 6 | } 7 | 8 | 9 | class TiktokenTokenCounterModel(BaseModel): 10 | text: str 11 | 12 | class Config: 13 | schema_extra = {"example": INPUT_EXAMPLE} 14 | 15 | 16 | def tiktoken_token_counter(req: TiktokenTokenCounterModel): 17 | """Uses the Tiktoken library to count tokens in a string""" 18 | encoding = tiktoken.get_encoding("cl100k_base") 19 | tokens = encoding.encode(req.text) 20 | return {"token_length": len(tokens)} -------------------------------------------------------------------------------- /generators/text_analytics/tiktoken_token_counter/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import tiktoken 3 | 4 | def tiktoken_token_counter(text: str, encoding_name: str = "cl100k_base") -> int: 5 | """ 6 | @param text: Text you want to count the number of tokens in 7 | @return: Integer with the token count 8 | """ 9 | encoding = tiktoken.get_encoding(encoding_name) 10 | tokens = encoding.encode(text) 11 | return len(tokens) 12 | 13 | # ↑ necessary bricks function 14 | # ----------------------------------------------------------------------------------------- 15 | # ↓ example implementation 16 | 17 | def example_integration(): 18 | texts = ["This is a short text with few tokens.", "This is a second short text"] 19 | for text in texts: 20 | print(f"\"{text}\" -> {tiktoken_token_counter(text)}") 21 | 22 | example_integration() 23 | ``` -------------------------------------------------------------------------------- /generators/text_analytics/tiktoken_token_counter/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import tiktoken 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | ENCODING_NAME: str = "cl100k_base" 6 | 7 | def tiktoken_token_counter(record): 8 | encoding = tiktoken.get_encoding(ENCODING_NAME) 9 | tokens = encoding.encode(record[ATTRIBUTE].text) 10 | return len(tokens) 11 | ``` -------------------------------------------------------------------------------- /generators/text_cleaning/html_cleanser/README.md: -------------------------------------------------------------------------------- 1 | This module removes HTML tags from a text. For example, the text "This is a test" will be converted to "This is a test". The module is based on the HTMLParser from `BeautifulSoup`. -------------------------------------------------------------------------------- /generators/text_cleaning/html_cleanser/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from bs4 import BeautifulSoup 3 | 4 | INPUT_EXAMPLE = { 5 | "html": """ 6 | 7 | 8 | 9 |

Website header

10 |

11 | Hello world. 12 | My website is live! 13 |

14 | 15 | 16 | """ 17 | } 18 | 19 | class HtmlCleanserModel(BaseModel): 20 | html: str 21 | 22 | class Config: 23 | schema_extra = {"example": INPUT_EXAMPLE} 24 | 25 | def html_cleanser(req: HtmlCleanserModel): 26 | """Removes the HTML tags from a text.""" 27 | html = req.html 28 | 29 | soup = BeautifulSoup(html, "html.parser") 30 | 31 | # Remove any line breakers as well 32 | text = soup.text.splitlines() 33 | text = " ".join([w for w in text if len(w) >= 1]) 34 | 35 | return {"Cleaned text": text} 36 | 37 | 38 | -------------------------------------------------------------------------------- /generators/text_cleaning/html_cleanser/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from bs4 import BeautifulSoup 3 | 4 | # replace this list with a list containing your data 5 | text = [""" 6 | 7 | 8 | 9 |

Website header

10 |

11 | Hello world. 12 | My website is live! 13 |

14 | 15 | 16 | """] 17 | 18 | # add the texts to a dict called records. Add further information as key-value pairs if needed 19 | record = { 20 | "text": text, 21 | } 22 | 23 | def html_cleanser(record): 24 | cleaned_text = [] 25 | for entry in record["text"]: 26 | soup = BeautifulSoup(entry, "html.parser") 27 | # Remove any line breakers as well 28 | text = soup.text.splitlines() 29 | cleaned_text.append(" ".join([w for w in text if len(w) >= 1])) 30 | return {"cleaned_text": cleaned_text} 31 | ``` -------------------------------------------------------------------------------- /generators/text_cleaning/html_cleanser/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from bs4 import BeautifulSoup 3 | 4 | 5 | def html_cleanser(text:str)->str: 6 | """ 7 | @param text: text we check the reading time for 8 | @return: cleansed text 9 | """ 10 | soup = BeautifulSoup(text, "html.parser") 11 | lines = soup.text.splitlines() 12 | return "\n".join([w for w in lines if len(w) >= 1]) 13 | 14 | # ↑ necessary bricks stuff 15 | # ----------------------------------------------------------------------------------------- 16 | # ↓ example implementation 17 | 18 | def example_integration(): 19 | texts = [""" 20 | 21 | 22 | 23 |

Website header

24 |

25 | Hello world. 26 | My website is live! 27 |

28 | 29 | 30 | """] 31 | 32 | for text in texts: 33 | print(f"the html page:{text}\nwill looked cleansed like this:\n{html_cleanser(text)}") 34 | example_integration() 35 | ``` -------------------------------------------------------------------------------- /generators/text_cleaning/html_cleanser/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from bs4 import BeautifulSoup 3 | 4 | ATTRIBUTE: str = "text" #only text attributes 5 | 6 | def html_cleanser(record): 7 | html = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string. 8 | 9 | soup = BeautifulSoup(html, "html.parser") 10 | # Remove any line breakers as well 11 | text = soup.text.splitlines() 12 | text = " ".join([w for w in text if len(w) >= 1]) 13 | 14 | return text 15 | ``` -------------------------------------------------------------------------------- /generators/text_cleaning/html_unescape/README.md: -------------------------------------------------------------------------------- 1 | This module unescapes HTML characters in a text. For example, the text "Here\'s how \"Kern.ai Newsletter\" did today." will be converted to "Here's how "Kern.ai Newsletter" did today. Community contribution by @rasdani" -------------------------------------------------------------------------------- /generators/text_cleaning/html_unescape/__init__.py: -------------------------------------------------------------------------------- 1 | import html 2 | from pydantic import BaseModel 3 | 4 | INPUT_EXAMPLE = { 5 | "text": """Here's how "Kern.ai Newsletter" did today. 3. "World’s largest tech conference: "Innovate 2023™" begins tomorrow!""" 6 | } 7 | 8 | class HtmlUnescapeModel(BaseModel): 9 | text: str 10 | 11 | class Config: 12 | schema_extra = {"example": INPUT_EXAMPLE} 13 | 14 | def html_unescape(request: HtmlUnescapeModel): 15 | """Unescapes HTML entities in a text.""" 16 | 17 | text = request.text 18 | 19 | unescaped_text = html.unescape(text) 20 | 21 | return {"Unescaped text": unescaped_text} -------------------------------------------------------------------------------- /generators/text_cleaning/html_unescape/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import html 3 | 4 | def html_unescape(text:str): 5 | """ 6 | @param text: text we to unescape 7 | @return: unescaped text 8 | """ 9 | unescaped_text = html.unescape(text) 10 | return unescaped_text 11 | 12 | 13 | # ↑ necessary bricks stuff 14 | # ----------------------------------------------------------------------------------------- 15 | # ↓ example implementation 16 | 17 | def example_integration(): 18 | texts = ["""Here's how "Kern.ai Newsletter" did today. 3. "World’s largest tech conference: "Innovate 2023™" begins tomorrow!"""] 19 | 20 | for text in texts: 21 | print(f"the html page:{text}\nwill looked unescaped like this:\n{html_unescape(text)}") 22 | example_integration() 23 | ``` -------------------------------------------------------------------------------- /generators/text_cleaning/html_unescape/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import html 3 | 4 | ATTRIBUTE: str = "text" #only text attributes 5 | 6 | def html_unescape(record): 7 | text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string. 
8 | 9 | unescaped_text = html.unescape(text) 10 | 11 | return unescaped_text 12 | ``` -------------------------------------------------------------------------------- /generators/text_cleaning/html_unescape/config_backup.py: -------------------------------------------------------------------------------- 1 | from util.configs import build_generator_function_config 2 | from util.enums import State 3 | from . import html_unescape, INPUT_EXAMPLE 4 | 5 | 6 | def get_config(): 7 | return build_generator_function_config( 8 | function=html_unescape, 9 | input_example=INPUT_EXAMPLE, 10 | data_type="text", 11 | issue_id=233, 12 | tabler_icon="BrandHtml5", 13 | min_refinery_version="1.7.0", 14 | state=State.PUBLIC, 15 | ) 16 | -------------------------------------------------------------------------------- /generators/translation/deepl_translator/README.md: -------------------------------------------------------------------------------- 1 | Translates texts with DeepL. Requires a paid API key, which you can obtain at https://www.deepl.com/. The code in the `code_snippet_refinery.md` translates only a single sentence, while the `code_snippet_common.md` translates a whole list of sentences. Please contact the Kern AI Team if you need further support at info@kern.ai or join our Discord channel: https://discord.gg/WAnAgQEv -------------------------------------------------------------------------------- /generators/translation/deepl_translator/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List 3 | import requests, uuid 4 | 5 | INPUT_EXAMPLE = { 6 | "text": "Hallo, guten Tag.", 7 | "toLang": "en", 8 | "apiKey": "", 9 | } 10 | 11 | 12 | class DeeplTranslatorModel(BaseModel): 13 | text: str 14 | toLang: str 15 | apiKey: str 16 | 17 | class Config: 18 | schema_extra = {"example": INPUT_EXAMPLE} 19 | 20 | 21 | def deepl_translator(req: DeeplTranslatorModel): 22 | """Uses DeepL API to translate texts.""" 23 | 24 | deepl_url = "https://api.deepl.com/v2/translate" 25 | params = { 26 | "auth_key": req.apiKey, 27 | "target_lang": req.toLang, 28 | "text": req.text, 29 | } 30 | 31 | deepl_result = requests.get(deepl_url, params=params) 32 | try: 33 | deepl_result_json = deepl_result.json() 34 | except: 35 | return "That didn't work. Maybe the API key is wrong?" 
36 | 37 | return deepl_result_json 38 | -------------------------------------------------------------------------------- /generators/translation/deepl_translator/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | 4 | ATTRIBUTE: str = "text" # only text attributes 5 | API_KEY: str = "" # Deepl API Key 6 | TARGET_LANGUAGE: str = "de" # only iso format 7 | 8 | def deepl_translator(record): 9 | '''Uses DeepL API to translate texts.''' 10 | deepl_url = "https://api.deepl.com/v2/translate" 11 | params={ 12 | "auth_key": API_KEY, 13 | "target_lang": TARGET_LANGUAGE, # Change this to the language of your choice 14 | "text": record[ATTRIBUTE].text, 15 | } 16 | 17 | deepl_result = requests.get( 18 | deepl_url, 19 | params=params 20 | ) 21 | deepl_result_json= deepl_result.json() 22 | translation = deepl_result_json["translations"][0]["text"] 23 | 24 | return translation 25 | ``` -------------------------------------------------------------------------------- /generators/translation/ibm_translator/README.md: -------------------------------------------------------------------------------- 1 | Translates texts using the IBM watson language translator. Requires a paid API key from IBM cloud. IBM offers a plan with 1 million free characters per month. Please contact the Kern AI Team if you need further support at info@kern.ai or join our Discord channel: https://discord.gg/WAnAgQEv -------------------------------------------------------------------------------- /generators/translation/ibm_translator/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import requests 3 | from typing import List 4 | 5 | ATTRIBUTE: str = "text" # only text attributes 6 | API_KEY: str = "" 7 | IBM_INSTANCE_ID: str = "" 8 | ORIGIN_LANG: str = "en" 9 | TARGET_LANG: str = "de" 10 | 11 | 12 | def ibm_translator(record): 13 | headers = {'Content-Type': 'application/json'} 14 | data = '{"text":'+f'["{record[ATTRIBUTE].text}"], '+'"model_id":'+f'"{ORIGIN_LANG}-'+f'{TARGET_LANG}"'+'}' 15 | auth = ('apikey', API_KEY) 16 | url = f"https://api.eu-de.language-translator.watson.cloud.ibm.com/instances/{IBM_INSTANCE_ID}" 17 | response = requests.post( 18 | url, 19 | headers=headers, 20 | data=data, 21 | auth=auth 22 | ) 23 | try: 24 | translation = [i["translation"] for i in response.json()["translations"]] 25 | return " ".join(translation) 26 | except: 27 | return "Translation not possible." 28 | ``` -------------------------------------------------------------------------------- /generators/translation/language_translator/README.md: -------------------------------------------------------------------------------- 1 | This module uses the free `translate` library to translate texts to any specified language. The library is limited to to 5000 chars/day. Please note that on app.kern.ai, this quota might be shared with other users as well. If you need to translate large volumes of text, please take a look at the premium translation modules. 
Feel free to contact the Kern team if you require further support at info@kern.ai or join our Discord channel: https://discord.com/invite/qf4rGCEphW -------------------------------------------------------------------------------- /generators/translation/language_translator/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from translate import Translator 3 | 4 | INPUT_EXAMPLE = { 5 | "text": "Hallo, guten Tag.", 6 | "fromLang": "de", 7 | "toLang": "en", 8 | } 9 | 10 | class LanguageTranslatorModel(BaseModel): 11 | text: str 12 | fromLang: str 13 | toLang: str 14 | 15 | class Config: 16 | schema_extra = {"example": INPUT_EXAMPLE} 17 | 18 | def language_translator(req: LanguageTranslatorModel): 19 | """Function to translate text.""" 20 | 21 | origin_lang = req.fromLang 22 | target_lang = req.toLang 23 | string_to_translate = req.text 24 | 25 | translator = Translator(from_lang=origin_lang, to_lang=target_lang) 26 | translation = translator.translate(string_to_translate) 27 | return {"translation": translation} 28 | 29 | -------------------------------------------------------------------------------- /generators/translation/language_translator/code_snippet_backup.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from translate import Translator 3 | 4 | # replace this list with a list containing your data 5 | text = ["Pizza is very delicious.", "Titanic is a movie made by James Cameron", "Apple pie is also very delicious."] 6 | 7 | # add the texts to a dict called records. Add further information as key-value pairs if needed 8 | record = { 9 | "your_text": text, 10 | "origin_language": "en", # change this to the language of your texts 11 | "target_language": "de" # change this to the language you want to translate to 12 | } 13 | 14 | def language_translator(record: dict) -> dict: 15 | translations = [] 16 | for entry in record["your_text"]: 17 | translator = Translator(from_lang=record["origin_language"], to_lang=record["target_language"]) 18 | translation = translator.translate(entry) 19 | translations.append(translation) 20 | return {"translations": translations} 21 | ``` -------------------------------------------------------------------------------- /generators/translation/language_translator/code_snippet_common.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from translate import Translator 3 | 4 | def language_translator(text: str,original_language:str,target_language:str) -> str: 5 | """ 6 | @param text: text we want to translate 7 | @param original_language: only iso format 8 | @param target_language: only iso format 9 | @return: translated text 10 | """ 11 | translator = Translator(from_lang=original_language, to_lang=target_language) 12 | return translator.translate(text) 13 | 14 | # ↑ necessary bricks stuff 15 | # ----------------------------------------------------------------------------------------- 16 | # ↓ example implementation 17 | 18 | def example_integration(): 19 | texts = ["Pizza is very delicious.", "Titanic is a movie made by James Cameron", "Apple pie is also very delicious."] 20 | original_language = "en" 21 | target_language = "de" 22 | for text in texts: 23 | print(f"the text \"{text}\" in {target_language} is {language_translator(text,original_language,target_language)}") 24 | example_integration() 25 | ``` 
-------------------------------------------------------------------------------- /generators/translation/language_translator/code_snippet_refinery.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from translate import Translator 3 | 4 | ATTRIBUTE: str = "text" #only text attributes 5 | ORIGINAL_LANGUAGE: str = "en" #only iso format 6 | TARGET_LANGUAGE: str = "de" #only iso format 7 | 8 | def language_translator(record): 9 | 10 | string_to_translate = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get string. 11 | 12 | translator = Translator(from_lang=ORIGINAL_LANGUAGE, to_lang=TARGET_LANGUAGE) 13 | translation = translator.translate(string_to_translate) 14 | return translation 15 | ``` 16 | -------------------------------------------------------------------------------- /generators/translation/microsoft_translator/README.md: -------------------------------------------------------------------------------- 1 | Translates texts using Microsoft's Azure cognitive services. An API key can be obtained via the Azures cognitive services. Currently Microsoft offers a free plan with which allows the translation of two million characters per month. Please contact the Kern AI Team if you need further support at info@kern.ai or join our Discord channel: https://discord.gg/WAnAgQEv -------------------------------------------------------------------------------- /generators/util/spacy.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | def download_all_models(): 4 | print("Downloading spacy models...") 5 | models = [ 6 | "en_core_web_sm", 7 | "de_core_news_sm" 8 | ] 9 | for model in models: 10 | print(f"Downloading {model}...") 11 | download_model(model) 12 | 13 | def download_model(model): 14 | """Download a spacy model if it doesn't exist.""" 15 | try: 16 | spacy.load(model) 17 | except OSError: 18 | spacy.cli.download(model) 19 | spacy.load(model) 20 | 21 | class SpacySingleton: 22 | nlp = None 23 | 24 | @classmethod 25 | def get_nlp(cls, model="en_core_web_sm"): 26 | if cls.nlp is None: 27 | cls.nlp = spacy.load(model) 28 | return cls.nlp -------------------------------------------------------------------------------- /generators/zero_shot/README.md: -------------------------------------------------------------------------------- 1 | We don't support zero-shot configurations as code in refinery yet, but as soon as we do so, you can find zero-shot configurations in here. -------------------------------------------------------------------------------- /gunicorn.config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # gunicorn.config.py 3 | """ 4 | Gunicorn Uvicorn config to lauch in Digital Ocean's App Platform. 
5 | Using their Flask template: https://github.com/digitalocean/sample-flask 6 | """ 7 | 8 | bind = "0.0.0.0:8080" 9 | workers = 4 10 | # Using Uvicorn's Gunicorn worker class 11 | worker_class = "uvicorn.workers.UvicornWorker" 12 | -------------------------------------------------------------------------------- /images/fastapi_testing_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/images/fastapi_testing_01.png -------------------------------------------------------------------------------- /images/fastapi_testing_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/images/fastapi_testing_02.png -------------------------------------------------------------------------------- /images/thumbnail-bricks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/images/thumbnail-bricks.png -------------------------------------------------------------------------------- /nltk.txt: -------------------------------------------------------------------------------- 1 | words 2 | brown 3 | stopwords 4 | punkt 5 | wordnet 6 | omw-1.4 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.11.1 2 | better_profanity==0.7.0 3 | fastapi==0.85.1 4 | fire==0.4.0 5 | gunicorn==20.1.0 6 | langdetect==1.0.9 7 | nltk==3.7 8 | pip-chill==1.0.1 9 | phonenumbers==8.13.0 10 | python-dotenv==0.21.0 11 | python-levenshtein==0.20.8 12 | textblob==0.17.1 13 | textstat==0.7.3 14 | translate==3.6.1 15 | uvicorn==0.19.0 16 | scikit-learn==1.1.3 17 | spacy==3.4.2 18 | stemming==1.0.1 19 | quantulum3==0.7.11 20 | language_tool_python==2.7.1 21 | LeXmo==0.1.4 22 | better_profanity==0.7.0 23 | flashtext==2.7 24 | openai==0.27.7 25 | vaderSentiment==3.3.2 26 | google-search-results==2.4.1 27 | textacy==0.12.0 28 | scikit-optimize==0.9.0 29 | holidays==0.21.13 30 | sumy==0.11.0 31 | tiktoken==0.4.0 -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.10.6 -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code-kern-ai/bricks/3a43c1e2b1ad8f41cee43028fce12f94fe889ea2/util/__init__.py -------------------------------------------------------------------------------- /util/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class State(Enum): 5 | DRAFT = "draft" 6 | PUBLIC = "public" 7 | 8 | 9 | class SelectionType(Enum): 10 | STRING = "string" 11 | CHOICE = "choice" 12 | RANGE = "range" 13 | INTEGER = "integer" 14 | FLOAT = "float" 15 | BOOLEAN = "boolean" 16 | LIST = "list" 17 | 18 | 19 | class BricksVariableType(Enum): 20 | ATTRIBUTE = "attribute" 21 | LANGUAGE = "language" 22 | LABELING_TASK = "labeling_task" 23 | LABEL = "label" 24 | EMBEDDING = "embedding" 25 | LOOKUP_LIST = "lookup_list" 26 | REGEX = "regex" 27 | GENERIC_STRING = 
"generic_string" 28 | GENERIC_INT = "generic_int" 29 | GENERIC_FLOAT = "generic_float" 30 | GENERIC_BOOLEAN = "generic_boolean" 31 | UNKNOWN = "unknown" 32 | 33 | 34 | class RefineryDataType(Enum): 35 | CATEGORY = "category" 36 | TEXT = "text" 37 | INTEGER = "integer" 38 | FLOAT = "float" 39 | BOOLEAN = "boolean" 40 | EMBEDDING_LIST = "embedding_list" 41 | -------------------------------------------------------------------------------- /util/exceptions.py: -------------------------------------------------------------------------------- 1 | class ErrorneousConfiguration(Exception): 2 | """Exception raised when the configuration is erroneous""" 3 | 4 | pass 5 | -------------------------------------------------------------------------------- /util/paths.py: -------------------------------------------------------------------------------- 1 | from re import finditer 2 | import os 3 | 4 | def camel_case_to_snake_case(text): 5 | matches = finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", text) 6 | return "_".join([match.group(0) for match in matches]).lower() 7 | 8 | 9 | def snake_case_to_camel_case(text): 10 | text_cased = "".join([word.capitalize() for word in text.split("_")]) 11 | return text_cased[0].lower() + text_cased[1:] 12 | 13 | def get_module_folders(base_folder): 14 | folders = os.listdir(base_folder) 15 | ignore_these = ["__pycache__", "README.md", "util", "zero_shot", "__init__.py", ".DS_Store", "_template"] 16 | for item in ignore_these: 17 | try: 18 | folders.remove(item) 19 | except: 20 | pass 21 | return folders 22 | 23 | --------------------------------------------------------------------------------