├── .gitignore ├── README.md ├── email ├── ham │ ├── 1.txt │ ├── 10.txt │ ├── 11.txt │ ├── 12.txt │ ├── 13.txt │ ├── 14.txt │ ├── 15.txt │ ├── 16.txt │ ├── 17.txt │ ├── 18.txt │ ├── 19.txt │ ├── 2.txt │ ├── 20.txt │ ├── 21.txt │ ├── 22.txt │ ├── 23.txt │ ├── 24.txt │ ├── 25.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ ├── 6.txt │ ├── 7.txt │ ├── 8.txt │ └── 9.txt └── spam │ ├── 1.txt │ ├── 10.txt │ ├── 11.txt │ ├── 12.txt │ ├── 13.txt │ ├── 14.txt │ ├── 15.txt │ ├── 16.txt │ ├── 17.txt │ ├── 18.txt │ ├── 19.txt │ ├── 2.txt │ ├── 20.txt │ ├── 21.txt │ ├── 22.txt │ ├── 23.txt │ ├── 24.txt │ ├── 25.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ ├── 6.txt │ ├── 7.txt │ ├── 8.txt │ └── 9.txt └── spam_filter.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/August1s/spam_filter/7981182ed54c87bfe76abaad9f8f4620ae43ea99/README.md -------------------------------------------------------------------------------- /email/ham/1.txt: -------------------------------------------------------------------------------- 1 | Hi Peter, 2 | 3 | With Jose out of town, do you want to 4 | meet once in a while to keep things 5 | going and do some interesting stuff? 6 | 7 | Let me know 8 | Eugene -------------------------------------------------------------------------------- /email/ham/10.txt: -------------------------------------------------------------------------------- 1 | Ryan Whybrew commented on your status. 2 | 3 | Ryan wrote: 4 | "turd ferguson or butt horn." 
5 | -------------------------------------------------------------------------------- /email/ham/11.txt: -------------------------------------------------------------------------------- 1 | Arvind Thirumalai commented on your status. 2 | 3 | Arvind wrote: 4 | ""you know"" 5 | 6 | 7 | Reply to this email to comment on this status. 8 | 9 | -------------------------------------------------------------------------------- /email/ham/12.txt: -------------------------------------------------------------------------------- 1 | Thanks Peter. 2 | 3 | I'll definitely check in on this. How is your book 4 | going? I heard chapter 1 came in and it was in 5 | good shape. ;-) 6 | 7 | I hope you are doing well. 8 | 9 | Cheers, 10 | 11 | Troy -------------------------------------------------------------------------------- /email/ham/13.txt: -------------------------------------------------------------------------------- 1 | Jay Stepp commented on your status. 2 | 3 | Jay wrote: 4 | ""to the" ???" 5 | 6 | 7 | Reply to this email to comment on this status. 8 | 9 | To see the comment thread, follow the link below: 10 | 11 | -------------------------------------------------------------------------------- /email/ham/14.txt: -------------------------------------------------------------------------------- 1 | LinkedIn 2 | 3 | Kerry Haloney requested to add you as a connection on LinkedIn: 4 | 5 | Peter, 6 | 7 | I'd like to add you to my professional network on LinkedIn. 8 | 9 | - Kerry Haloney 10 | 11 | -------------------------------------------------------------------------------- /email/ham/15.txt: -------------------------------------------------------------------------------- 1 | Hi Peter, 2 | 3 | The hotels are the ones that rent out the tent. They are all lined up on the hotel grounds : )) So much for being one with nature, more like being one with a couple dozen tour groups and nature. 4 | I have about 100M of pictures from that trip. 
I can go through them and get you jpgs of my favorite scenic pictures. 5 | 6 | Where are you and Jocelyn now? New York? Will you come to Tokyo for Chinese New Year? Perhaps to see the two of you then. I will go to Thailand for winter holiday to see my mom : ) 7 | 8 | Take care, 9 | D 10 | -------------------------------------------------------------------------------- /email/ham/16.txt: -------------------------------------------------------------------------------- 1 | yeah I am ready. I may not be here because Jar Jar has plane tickets to Germany for me. -------------------------------------------------------------------------------- /email/ham/17.txt: -------------------------------------------------------------------------------- 1 | Benoit Mandelbrot 1924-2010 2 | 3 | Benoit Mandelbrot 1924-2010 4 | 5 | Wilmott Team 6 | 7 | Benoit Mandelbrot, the mathematician, the father of fractal mathematics, and advocate of more sophisticated modelling in quantitative finance, died on 14th October 2010 aged 85. 8 | 9 | Wilmott magazine has often featured Mandelbrot, his ideas, and the work of others inspired by his fundamental insights. 10 | 11 | You must be logged on to view these articles from past issues of Wilmott Magazine. -------------------------------------------------------------------------------- /email/ham/18.txt: -------------------------------------------------------------------------------- 1 | Hi Peter, 2 | 3 | Sure thing. Sounds good. Let me know what time would be good for you. 4 | I will come prepared with some ideas and we can go from there. 5 | 6 | Regards, 7 | 8 | -Vivek. -------------------------------------------------------------------------------- /email/ham/19.txt: -------------------------------------------------------------------------------- 1 | LinkedIn 2 | 3 | Julius O requested to add you as a connection on LinkedIn: 4 | 5 | Hi Peter. 6 | 7 | Looking forward to the book! 
8 | 9 | 10 | Accept View invitation from Julius O 11 | -------------------------------------------------------------------------------- /email/ham/2.txt: -------------------------------------------------------------------------------- 1 | Yay to you both doing fine! 2 | 3 | I'm working on an MBA in Design Strategy at CCA (top art school.) It's a new program focusing on more of a right-brained creative and strategic approach to management. I'm an 1/8 of the way done today! -------------------------------------------------------------------------------- /email/ham/20.txt: -------------------------------------------------------------------------------- 1 | I've thought about this and think it's possible. We should get another 2 | lunch. I have a car now and could come pick you up this time. Does 3 | this wednesday work? 11:50? 4 | 5 | Can I have a signed copy of you book? -------------------------------------------------------------------------------- /email/ham/21.txt: -------------------------------------------------------------------------------- 1 | we saw this on the way to the coast...thought u might like it 2 | 3 | hangzhou is huge, one day wasn't enough, but we got a glimpse... 4 | 5 | we went inside the china pavilion at expo, it is pretty interesting, 6 | each province has an exhibit... -------------------------------------------------------------------------------- /email/ham/22.txt: -------------------------------------------------------------------------------- 1 | Hi Hommies, 2 | 3 | Just got a phone call from the roofer, they will come and spaying the foaming today. it will be dusty. pls close all the doors and windows. 4 | Could you help me to close my bathroom window, cat window and the sliding door behind the TV? 5 | I don't know how can those 2 cats survive...... 6 | 7 | Sorry for any inconvenience! 
-------------------------------------------------------------------------------- /email/ham/23.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/August1s/spam_filter/7981182ed54c87bfe76abaad9f8f4620ae43ea99/email/ham/23.txt -------------------------------------------------------------------------------- /email/ham/24.txt: -------------------------------------------------------------------------------- 1 | Ok I will be there by 10:00 at the latest. -------------------------------------------------------------------------------- /email/ham/25.txt: -------------------------------------------------------------------------------- 1 | That is cold. Is there going to be a retirement party? 2 | Are the leaves changing color? -------------------------------------------------------------------------------- /email/ham/3.txt: -------------------------------------------------------------------------------- 1 | WHat is going on there? 2 | I talked to John on email. We talked about some computer stuff that's it. 3 | 4 | I went bike riding in the rain, it was not that cold. 5 | 6 | We went to the museum in SF yesterday it was $3 to get in and they had 7 | free food. At the same time was a SF Giants game, when we got done we 8 | had to take the train with all the Giants fans, they are 1/2 drunk. -------------------------------------------------------------------------------- /email/ham/4.txt: -------------------------------------------------------------------------------- 1 | Yo. I've been working on my running website. I'm using jquery and the jqplot plugin. I'm not too far away from having a prototype to launch. 2 | 3 | You used jqplot right? If not, I think you would like it. 
-------------------------------------------------------------------------------- /email/ham/5.txt: -------------------------------------------------------------------------------- 1 | There was a guy at the gas station who told me that if I knew Mandarin 2 | and Python I could get a job with the FBI. -------------------------------------------------------------------------------- /email/ham/6.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/August1s/spam_filter/7981182ed54c87bfe76abaad9f8f4620ae43ea99/email/ham/6.txt -------------------------------------------------------------------------------- /email/ham/7.txt: -------------------------------------------------------------------------------- 1 | Zach Hamm commented on your status. 2 | 3 | Zach wrote: 4 | "doggy style - enough said, thank you & good night" 5 | 6 | 7 | -------------------------------------------------------------------------------- /email/ham/8.txt: -------------------------------------------------------------------------------- 1 | This e-mail was sent from a notification-only address that cannot accept incoming e-mail. Please do not reply to this message. 2 | 3 | Thank you for your online reservation. The store you selected has located the item you requested and has placed it on hold in your name. Please note that all items are held for 1 day. Please note store prices may differ from those online. 4 | 5 | If you have questions or need assistance with your reservation, please contact the store at the phone number listed below. You can also access store information, such as store hours and location, on the web at http://www.borders.com/online/store/StoreDetailView_98. 
-------------------------------------------------------------------------------- /email/ham/9.txt: -------------------------------------------------------------------------------- 1 | Hi Peter, 2 | 3 | These are the only good scenic ones and it's too bad there was a girl's back in one of them. Just try to enjoy the blue sky : )) 4 | 5 | D -------------------------------------------------------------------------------- /email/spam/1.txt: -------------------------------------------------------------------------------- 1 | --- Codeine 15mg -- 30 for $203.70 -- VISA Only!!! -- 2 | 3 | -- Codeine (Methylmorphine) is a narcotic (opioid) pain reliever 4 | -- We have 15mg & 30mg pills -- 30/15mg for $203.70 - 60/15mg for $385.80 - 90/15mg for $562.50 -- VISA Only!!! --- -------------------------------------------------------------------------------- /email/spam/10.txt: -------------------------------------------------------------------------------- 1 | OrderCializViagra Online & Save 75-90% 2 | 3 | 0nline Pharmacy NoPrescription required 4 | Buy Canadian Drugs at Wholesale Prices and Save 75-90% 5 | FDA-Approved drugs + Superb Quality Drugs only! 6 | Accept all major credit cards -------------------------------------------------------------------------------- /email/spam/11.txt: -------------------------------------------------------------------------------- 1 | You Have Everything To Gain! 2 | 3 | Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY 4 | 5 | Amazing increase in thickness of yourPenis, up to 30% 6 | BetterEjacu1ation control 7 | Experience Rock-HardErecetions 8 | Explosive, intenseOrgasns 9 | Increase volume ofEjacu1ate 10 | Doctor designed and endorsed 11 | 100% herbal, 100% Natural, 100% Safe 12 | The proven NaturalPenisEnhancement that works! 
13 | 100% MoneyBack Guaranteeed -------------------------------------------------------------------------------- /email/spam/12.txt: -------------------------------------------------------------------------------- 1 | Buy Ambiem (Zolpidem) 5mg/10mg @ $2.39/- pill 2 | 3 | 30 pills x 5 mg - $129.00 4 | 60 pills x 5 mg - $199.20 5 | 180 pills x 5 mg - $430.20 6 | 30 pills x 10 mg - $ 138.00 7 | 120 pills x 10 mg - $ 322.80 -------------------------------------------------------------------------------- /email/spam/13.txt: -------------------------------------------------------------------------------- 1 | OrderCializViagra Online & Save 75-90% 2 | 3 | 0nline Pharmacy NoPrescription required 4 | Buy Canadian Drugs at Wholesale Prices and Save 75-90% 5 | FDA-Approved drugs + Superb Quality Drugs only! 6 | Accept all major credit cards 7 | Order Today! From $1.38 8 | -------------------------------------------------------------------------------- /email/spam/14.txt: -------------------------------------------------------------------------------- 1 | BuyVIAGRA 25mg, 50mg, 100mg, 2 | BrandViagra, FemaleViagra from $1.15 per pill 3 | 4 | 5 | ViagraNoPrescription needed - from Certified Canadian Pharmacy 6 | 7 | Buy Here... We accept VISA, AMEX, E-Check... Worldwide Delivery -------------------------------------------------------------------------------- /email/spam/15.txt: -------------------------------------------------------------------------------- 1 | You Have Everything To Gain! 
2 | 3 | Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY 4 | 5 | Amazing increase in thickness of yourPenis, up to 30% 6 | BetterEjacu1ation control 7 | Experience Rock-HardErecetions 8 | Explosive, intenseOrgasns 9 | Increase volume ofEjacu1ate 10 | Doctor designed and endorsed 11 | 100% herbal, 100% Natural, 100% Safe -------------------------------------------------------------------------------- /email/spam/16.txt: -------------------------------------------------------------------------------- 1 | You Have Everything To Gain! 2 | 3 | Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY 4 | 5 | Amazing increase in thickness of yourPenis, up to 30% 6 | BetterEjacu1ation control 7 | Experience Rock-HardErecetions 8 | Explosive, intenseOrgasns 9 | Increase volume ofEjacu1ate 10 | Doctor designed and endorsed 11 | 100% herbal, 100% Natural, 100% Safe -------------------------------------------------------------------------------- /email/spam/17.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/August1s/spam_filter/7981182ed54c87bfe76abaad9f8f4620ae43ea99/email/spam/17.txt -------------------------------------------------------------------------------- /email/spam/18.txt: -------------------------------------------------------------------------------- 1 | Codeine (the most competitive price on NET!) 
2 | 3 | Codeine (WILSON) 30mg x 30 $156.00 4 | Codeine (WILSON) 30mg x 60 $291.00 (+4 FreeViagra pills) 5 | Codeine (WILSON) 30mg x 90 $396.00 (+4 FreeViagra pills) 6 | Codeine (WILSON) 30mg x 120 $492.00 (+10 FreeViagra pills) -------------------------------------------------------------------------------- /email/spam/19.txt: -------------------------------------------------------------------------------- 1 | Get Up to 75% OFF at Online WatchesStore 2 | 3 | Discount Watches for All Famous Brands 4 | 5 | * Watches: aRolexBvlgari, Dior, Hermes, Oris, Cartier, AP and more brands 6 | * Louis Vuitton Bags & Wallets 7 | * Gucci Bags 8 | * Tiffany & Co Jewerly 9 | 10 | Enjoy a full 1 year WARRANTY 11 | Shipment via reputable courier: FEDEX, UPS, DHL and EMS Speedpost 12 | You will 100% recieve your order 13 | Save Up to 75% OFF Quality Watches -------------------------------------------------------------------------------- /email/spam/2.txt: -------------------------------------------------------------------------------- 1 | Hydrocodone/Vicodin ES/Brand Watson 2 | 3 | Vicodin ES - 7.5/750 mg: 30 - $195 / 120 $570 4 | Brand Watson - 7.5/750 mg: 30 - $195 / 120 $570 5 | Brand Watson - 10/325 mg: 30 - $199 / 120 - $588 6 | NoPrescription Required 7 | FREE Express FedEx (3-5 days Delivery) for over $200 order 8 | Major Credit Cards + E-CHECK -------------------------------------------------------------------------------- /email/spam/20.txt: -------------------------------------------------------------------------------- 1 | Get Up to 75% OFF at Online WatchesStore 2 | 3 | Discount Watches for All Famous Brands 4 | 5 | * Watches: aRolexBvlgari, Dior, Hermes, Oris, Cartier, AP and more brands 6 | * Louis Vuitton Bags & Wallets 7 | * Gucci Bags 8 | * Tiffany & Co Jewerly 9 | 10 | Enjoy a full 1 year WARRANTY 11 | Shipment via reputable courier: FEDEX, UPS, DHL and EMS Speedpost 12 | You will 100% recieve your order 
-------------------------------------------------------------------------------- /email/spam/21.txt: -------------------------------------------------------------------------------- 1 | Percocet 10/625 mg withoutPrescription 30 tabs - $225! 2 | Percocet, a narcotic analgesic, is used to treat moderate to moderately SeverePain 3 | Top Quality, EXPRESS Shipping, 100% Safe & Discreet & Private. 4 | Buy Cheap Percocet Online -------------------------------------------------------------------------------- /email/spam/22.txt: -------------------------------------------------------------------------------- 1 | Get Up to 75% OFF at Online WatchesStore 2 | 3 | Discount Watches for All Famous Brands 4 | 5 | * Watches: aRolexBvlgari, Dior, Hermes, Oris, Cartier, AP and more brands 6 | * Louis Vuitton Bags & Wallets 7 | * Gucci Bags 8 | * Tiffany & Co Jewerly 9 | 10 | Enjoy a full 1 year WARRANTY 11 | Shipment via reputable courier: FEDEX, UPS, DHL and EMS Speedpost 12 | You will 100% recieve your order -------------------------------------------------------------------------------- /email/spam/23.txt: -------------------------------------------------------------------------------- 1 | You Have Everything To Gain! 2 | 3 | Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY 4 | 5 | Amazing increase in thickness of yourPenis, up to 30% 6 | BetterEjacu1ation control 7 | Experience Rock-HardErecetions 8 | Explosive, intenseOrgasns 9 | Increase volume ofEjacu1ate 10 | Doctor designed and endorsed 11 | 100% herbal, 100% Natural, 100% Safe -------------------------------------------------------------------------------- /email/spam/24.txt: -------------------------------------------------------------------------------- 1 | You Have Everything To Gain! 
2 | 3 | Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY 4 | 5 | Amazing increase in thickness of yourPenis, up to 30% 6 | BetterEjacu1ation control 7 | Experience Rock-HardErecetions 8 | Explosive, intenseOrgasns 9 | Increase volume ofEjacu1ate 10 | Doctor designed and endorsed 11 | 100% herbal, 100% Natural, 100% Safe -------------------------------------------------------------------------------- /email/spam/25.txt: -------------------------------------------------------------------------------- 1 | Experience with BiggerPenis Today! Grow 3-inches more 2 | 3 | The Safest & Most Effective Methods Of_PenisEn1argement. 4 | Save your time and money! 5 | BetterErections with effective Ma1eEnhancement products. 6 | 7 | #1 Ma1eEnhancement Supplement. Trusted by Millions. Buy Today! -------------------------------------------------------------------------------- /email/spam/3.txt: -------------------------------------------------------------------------------- 1 | You Have Everything To Gain! 2 | 3 | Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY 4 | 5 | Amazing increase in thickness of yourPenis, up to 30% 6 | BetterEjacu1ation control 7 | Experience Rock-HardErecetions 8 | Explosive, intenseOrgasns 9 | Increase volume ofEjacu1ate 10 | Doctor designed and endorsed 11 | 100% herbal, 100% Natural, 100% Safe 12 | The proven NaturalPenisEnhancement that works! 13 | 100% MoneyBack Guaranteeed -------------------------------------------------------------------------------- /email/spam/4.txt: -------------------------------------------------------------------------------- 1 | Percocet 10/625 mg withoutPrescription 30 tabs - $225! 2 | Percocet, a narcotic analgesic, is used to treat moderate to moderately SeverePain 3 | Top Quality, EXPRESS Shipping, 100% Safe & Discreet & Private. 
4 | Buy Cheap Percocet Online -------------------------------------------------------------------------------- /email/spam/5.txt: -------------------------------------------------------------------------------- 1 | --- Codeine 15mg -- 30 for $203.70 -- VISA Only!!! -- 2 | 3 | -- Codeine (Methylmorphine) is a narcotic (opioid) pain reliever 4 | -- We have 15mg & 30mg pills -- 30/15mg for $203.70 - 60/15mg for $385.80 - 90/15mg for $562.50 -- VISA Only!!! --- -------------------------------------------------------------------------------- /email/spam/6.txt: -------------------------------------------------------------------------------- 1 | OEM Adobe & Microsoft softwares 2 | Fast order and download 3 | 4 | Microsoft Office Professional Plus 2007/2010 $129 5 | Microsoft Windows 7 Ultimate $119 6 | Adobe Photoshop CS5 Extended 7 | Adobe Acrobat 9 Pro Extended 8 | Windows XP Professional & thousand more titles -------------------------------------------------------------------------------- /email/spam/7.txt: -------------------------------------------------------------------------------- 1 | Bargains Here! Buy Phentermin 37.5 mg (K-25) 2 | 3 | Buy Genuine Phentermin at Low Cost 4 | VISA Accepted 5 | 30 - $130.50 6 | 60 - $219.00 7 | 90 - $292.50 8 | 120 - $366.00 9 | 180 - $513.00 -------------------------------------------------------------------------------- /email/spam/8.txt: -------------------------------------------------------------------------------- 1 | You Have Everything To Gain! 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Naive-Bayes spam filter trained on the bundled ``email/`` corpus.

Bayes' theorem: P(A|B) = P(B|A) * P(A) / P(B), where

* A: the message is spam,
* B: the message contains a given word,
* P(A|B): probability that a message containing the word is spam,
* P(B|A): probability that the word occurs in spam,
* P(A): prior probability of spam (taken as 0.5 here),
* P(B): probability that the word occurs in any message.

The per-word probabilities P1..Pn obtained this way are combined with

    P = P1*P2*...*Pn / (P1*P2*...*Pn + (1-P1)*(1-P2)*...*(1-Pn))
"""

import re
from collections import Counter
from pathlib import Path

# The corpus lives next to this script: email/ham/N.txt and email/spam/N.txt.
EMAIL_DIR = Path(__file__).resolve().parent / "email"
TRAIN_INDICES = range(1, 21)   # messages 1..20 of each class build the model
TEST_INDICES = range(21, 26)   # messages 21..25 of each class are scored

TOP_WORDS = 15                 # number of most frequent words used per message
SPAM_PRIOR = 0.5               # P(A): prior probability that a message is spam
# Spam tends to reuse a fixed vocabulary, so a word that never appeared in the
# spam corpus is assumed to lean slightly towards ham.
UNSEEN_IN_SPAM_PROB = 0.4
# Assumed ham frequency of a word seen only in spam (0.3 %).  The original
# author noted this may need tuning; their reference material suggested 1 %.
SPAM_ONLY_HAM_FREQ = 0.003

_NON_LETTERS = re.compile(r"[^a-zA-Z]")
_WHITESPACE = re.compile(r"\s+")
# Same token definition as scikit-learn's CountVectorizer default
# (runs of two or more word characters), which this script used previously.
_TOKEN = re.compile(r"\b\w\w+\b")

# Trained model: word -> count for each class, plus the total counts.
# Filled in by _train().
health_dic = {}
spam_dic = {}
health_sum = 0
spam_sum = 0


def Filter_text(text):
    """Return *text* lower-cased with every non-letter run turned into one space.

    Punctuation, digits and any non-ASCII characters are replaced by spaces,
    then consecutive whitespace is collapsed.
    """
    letters_only = _NON_LETTERS.sub(" ", text)
    return _WHITESPACE.sub(" ", letters_only).lower()


def Count(text):
    """Return a ``{word: occurrences}`` dict for *text*.

    Words are lower-cased tokens of at least two word characters.  Keys are
    inserted in alphabetical order; ``Bayes`` relies on that order to break
    ties when ranking words by frequency.  Text without any token yields an
    empty dict.
    """
    occurrences = Counter(_TOKEN.findall(text.lower()))
    return {word: occurrences[word] for word in sorted(occurrences)}


def Sum(dic):
    """Return the total of all counts stored in the word-count dict *dic*."""
    return sum(dic.values())


def _read_email(path):
    """Return the text of the message at *path*, skipping undecodable bytes."""
    return path.read_text(encoding="utf-8", errors="ignore")


def _word_counts(folder, indices):
    """Return word counts (most frequent first) over ``folder/<i>.txt``."""
    corpus = " ".join(_read_email(folder / f"{i}.txt") for i in indices)
    counts = Count(Filter_text(corpus))
    return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))


def _train(email_dir=EMAIL_DIR, indices=TRAIN_INDICES):
    """Build the module-level ham/spam word statistics from the corpus."""
    global health_dic, spam_dic, health_sum, spam_sum
    health_dic = _word_counts(email_dir / "ham", indices)
    spam_dic = _word_counts(email_dir / "spam", indices)
    health_sum = Sum(health_dic)
    spam_sum = Sum(spam_dic)


def Bayes(test, spam_counts=None, health_counts=None):
    """Return the estimated probability (0..1) that message *test* is spam.

    The ``TOP_WORDS`` most frequent words of the message (fewer if the message
    is shorter) each get a spam probability from Bayes' theorem, and those are
    merged with the joint-probability formula from the module docstring.  A
    message without any usable word scores the neutral 0.5.

    Args:
        test: Raw message text.
        spam_counts: Optional ``{word: count}`` statistics for spam.  Defaults
            to the module-level model, which is trained from ``EMAIL_DIR`` on
            first use.
        health_counts: Optional ``{word: count}`` statistics for ham, with the
            same default behaviour.

    Raises:
        OSError: If the default model is needed and the corpus cannot be read.
    """
    if spam_counts is None or health_counts is None:
        if not spam_dic and not health_dic:
            _train()  # lazy training keeps a plain ``Bayes(text)`` call working
    if spam_counts is None:
        spam_counts, spam_total = spam_dic, spam_sum
    else:
        spam_total = Sum(spam_counts)
    if health_counts is None:
        health_counts, health_total = health_dic, health_sum
    else:
        health_total = Sum(health_counts)

    # sorted() is stable, so equally frequent words stay in alphabetical order.
    ranked = sorted(Count(Filter_text(test)).items(),
                    key=lambda item: item[1], reverse=True)

    probabilities = []
    for word, _ in ranked[:TOP_WORDS]:
        in_spam = spam_counts.get(word)
        if not in_spam:
            # P(spam | word) cannot be computed for a word never seen in spam.
            probabilities.append(UNSEEN_IN_SPAM_PROB)
            continue
        in_ham = health_counts.get(word)
        word_spam = in_spam / spam_total
        word_ham = in_ham / health_total if in_ham else SPAM_ONLY_HAM_FREQ
        spam_part = word_spam * SPAM_PRIOR
        ham_part = word_ham * (1 - SPAM_PRIOR)
        probabilities.append(spam_part / (ham_part + spam_part))

    # Joint probability of all selected words.
    p_spam = 1
    p_ham = 1
    for p in probabilities:
        p_spam = p_spam * p
        p_ham = p_ham * (1 - p)
    return p_spam / (p_spam + p_ham)


def main():
    """Train on the corpus and print the spam score of every held-out message."""
    _train()
    for label in ("spam", "ham"):
        for idx in TEST_INDICES:
            text = _read_email(EMAIL_DIR / label / f"{idx}.txt")
            print(f"{label}{idx}", Bayes(text))


if __name__ == "__main__":
    main()