├── README.md
├── api
    ├── index.py
    ├── model.pkl
    └── tfidf.pkl
├── client
    ├── .gitignore
    ├── README.md
    ├── package-lock.json
    ├── package.json
    ├── public
    │   ├── evil-hacker-thief-anonymous-bad-man-killer-virus-attack-danger-dark_268834-416.webp
    │   ├── index.html
    │   └── robots.txt
    ├── src
    │   ├── App.tsx
    │   ├── components
    │   │   ├── FakeNews.tsx
    │   │   └── Form.tsx
    │   ├── index.css
    │   ├── index.tsx
    │   ├── react-app-env.d.ts
    │   └── styles
    │   │   └── globals.css
    ├── tailwind.config.js
    └── tsconfig.json
├── demo
    ├── benign.png
    ├── defacement.png
    └── phishing.png
└── model
    └── index.ipynb


/README.md:
--------------------------------------------------------------------------------
1 | # Malicious URLs detector
2 | 
3 | This project classifies and detects malicious URLs using machine learning, served through a web application built with Flask and React.
4 | 
5 | ![Logo](./demo/benign.png)
6 | ![Logo](./demo/phishing.png)
7 | ![Logo](./demo/defacement.png)
8 | 


--------------------------------------------------------------------------------
/api/index.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from flask import Flask, request, jsonify, render_template
 3 | from flask_cors import CORS
 4 | import numpy as np
 5 | import joblib
 6 | 
 7 | 
 8 | # Create the application.
 9 | app = Flask(__name__, template_folder='../client/build', static_folder='../client/build/static')
10 | CORS(app)
11 | 
12 | @app.route('/')
13 | def main():
14 |     """ Displays the main page accessible at '/'
15 |     """
16 |     return render_template("index.html", token="Hello React+Flask")
17 | 
18 | @app.route('/predict', methods=['POST'])
19 | def predict():
20 |     """ Predict whether news is fake or real """
21 |     
22 |     news = request.json["news"]
23 |     news_transformed = tfidf_transformer.transform(np.array([news]))
24 |     prediction = model.predict(news_transformed)
25 | 
26 |     return jsonify({"prediction" : list(prediction)})
27 | 
28 | 
29 | 
30 | if __name__ == '__main__':
31 |     
32 |     model = joblib.load('./model.pkl')
33 |     tfidf_transformer = joblib.load('./tfidf.pkl')
34 |     
35 |     app.run(debug=True)
36 |     
37 | 
38 | 


--------------------------------------------------------------------------------
/api/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ramzy1453/malicious-url-detection/1b32e638cb8a70f43a6a77677cea5976e12a93dc/api/model.pkl


--------------------------------------------------------------------------------
/api/tfidf.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ramzy1453/malicious-url-detection/1b32e638cb8a70f43a6a77677cea5976e12a93dc/api/tfidf.pkl


--------------------------------------------------------------------------------
/client/.gitignore:
--------------------------------------------------------------------------------
 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
 2 | 
 3 | # dependencies
 4 | /node_modules
 5 | /.pnp
 6 | .pnp.js
 7 | 
 8 | # testing
 9 | /coverage
10 | 
11 | # production
12 | /build
13 | 
14 | # misc
15 | .DS_Store
16 | .env.local
17 | .env.development.local
18 | .env.test.local
19 | .env.production.local
20 | 
21 | npm-debug.log*
22 | yarn-debug.log*
23 | yarn-error.log*
24 | 


--------------------------------------------------------------------------------
/client/README.md:
--------------------------------------------------------------------------------
 1 | # Getting Started with Create React App
 2 | 
 3 | This project was bootstrapped with [Create React App](https://github.com/facebook/create-react-app).
 4 | 
 5 | ## Available Scripts
 6 | 
 7 | In the project directory, you can run:
 8 | 
 9 | ### `npm start`
10 | 
11 | Runs the app in the development mode.\
12 | Open [http://localhost:3000](http://localhost:3000) to view it in the browser.
13 | 
14 | The page will reload if you make edits.\
15 | You will also see any lint errors in the console.
16 | 
17 | ### `npm test`
18 | 
19 | Launches the test runner in the interactive watch mode.\
20 | See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests) for more information.
21 | 
22 | ### `npm run build`
23 | 
24 | Builds the app for production to the `build` folder.\
25 | It correctly bundles React in production mode and optimizes the build for the best performance.
26 | 
27 | The build is minified and the filenames include the hashes.\
28 | Your app is ready to be deployed!
29 | 
30 | See the section about [deployment](https://facebook.github.io/create-react-app/docs/deployment) for more information.
31 | 
32 | ### `npm run eject`
33 | 
34 | **Note: this is a one-way operation. Once you `eject`, you can’t go back!**
35 | 
36 | If you aren’t satisfied with the build tool and configuration choices, you can `eject` at any time. This command will remove the single build dependency from your project.
37 | 
38 | Instead, it will copy all the configuration files and the transitive dependencies (webpack, Babel, ESLint, etc) right into your project so you have full control over them. All of the commands except `eject` will still work, but they will point to the copied scripts so you can tweak them. At this point you’re on your own.
39 | 
40 | You don’t have to ever use `eject`. The curated feature set is suitable for small and middle deployments, and you shouldn’t feel obligated to use this feature. However we understand that this tool wouldn’t be useful if you couldn’t customize it when you are ready for it.
41 | 
42 | ## Learn More
43 | 
44 | You can learn more in the [Create React App documentation](https://facebook.github.io/create-react-app/docs/getting-started).
45 | 
46 | To learn React, check out the [React documentation](https://reactjs.org/).
47 | 


--------------------------------------------------------------------------------
/client/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "client",
 3 |   "version": "0.1.0",
 4 |   "private": true,
 5 |   "dependencies": {
 6 |     "@testing-library/jest-dom": "^5.16.5",
 7 |     "@testing-library/react": "^13.4.0",
 8 |     "@testing-library/user-event": "^13.5.0",
 9 |     "@types/jest": "^27.5.2",
10 |     "@types/node": "^16.18.11",
11 |     "@types/react": "^18.0.26",
12 |     "@types/react-dom": "^18.0.10",
13 |     "react": "^18.2.0",
14 |     "react-dom": "^18.2.0",
15 |     "react-scripts": "5.0.1",
16 |     "typescript": "^4.9.4",
17 |     "web-vitals": "^2.1.4"
18 |   },
19 |   "scripts": {
20 |     "start": "react-scripts start",
21 |     "build": "react-scripts build",
22 |     "test": "react-scripts test",
23 |     "eject": "react-scripts eject"
24 |   },
25 |   "eslintConfig": {
26 |     "extends": [
27 |       "react-app",
28 |       "react-app/jest"
29 |     ]
30 |   },
31 |   "browserslist": {
32 |     "production": [
33 |       ">0.2%",
34 |       "not dead",
35 |       "not op_mini all"
36 |     ],
37 |     "development": [
38 |       "last 1 chrome version",
39 |       "last 1 firefox version",
40 |       "last 1 safari version"
41 |     ]
42 |   },
43 |   "devDependencies": {
44 |     "tailwindcss": "^3.2.4"
45 |   }
46 | }
47 | 


--------------------------------------------------------------------------------
/client/public/evil-hacker-thief-anonymous-bad-man-killer-virus-attack-danger-dark_268834-416.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ramzy1453/malicious-url-detection/1b32e638cb8a70f43a6a77677cea5976e12a93dc/client/public/evil-hacker-thief-anonymous-bad-man-killer-virus-attack-danger-dark_268834-416.webp


--------------------------------------------------------------------------------
/client/public/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 |   <head>
 4 |     <meta charset="utf-8" />
 5 |     <!-- Favicon: the only image shipped in public/. -->
 6 |     <link
 7 |       rel="icon"
 8 |       type="image/webp"
 9 |       href="%PUBLIC_URL%/evil-hacker-thief-anonymous-bad-man-killer-virus-attack-danger-dark_268834-416.webp"
10 |     />
11 |     <meta name="viewport" content="width=device-width, initial-scale=1" />
12 |     <meta name="theme-color" content="#000000" />
13 |     <meta
14 |       name="description"
15 |       content="Check whether a URL is benign or malicious with a machine-learning classifier"
16 |     />
17 |     <!--
18 |       The template's apple-touch-icon (logo192.png) and manifest.json links were
19 |       removed: neither file exists in public/, so they only produced failed requests.
20 |     -->
21 |     <title>Malicious URL detection</title>
22 |   </head>
23 |   <body>
24 |     <noscript>You need to enable JavaScript to run this app.</noscript>
25 |     <!-- React mounts the application into this element (see src/index.tsx). -->
26 |     <div id="root"></div>
27 |   </body>
28 | </html>
29 | 


--------------------------------------------------------------------------------
/client/public/robots.txt:
--------------------------------------------------------------------------------
1 | # https://www.robotstxt.org/robotstxt.html
2 | User-agent: *
3 | Disallow:
4 | 


--------------------------------------------------------------------------------
/client/src/App.tsx:
--------------------------------------------------------------------------------
 1 | import { ChangeEvent, useState } from "react";
 2 | import FakeNews from "./components/FakeNews";
 3 | import Form from "./components/Form";
 4 | 
 5 | /**
 6 |  * Root component: owns the text typed by the user and the latest verdict,
 7 |  * and passes them to the input form and the result banner.
 8 |  */
 9 | export default function App() {
10 |   const [message, setMessage] = useState("");
11 |   const [messageType, setMessageType] = useState<string | undefined>(undefined);
12 | 
13 |   // Mirror the input's current value into local state on every change.
14 |   function onChangeMessage(event: ChangeEvent<HTMLInputElement>) {
15 |     const { value } = event.target;
16 |     setMessage(value);
17 |   }
18 | 
19 |   return (
20 |     <div className="h-screen py-16 px-8 md:p-0 md:flex md:flex-col justify-center items-center">
21 |       <div className="text-[#3ea743] font-bold text-4xl text-center md:text-left [text-shadow:_0_1px_0_rgb(0_0_0_/_40%)] shadow-[#3ea743]">
22 |         MALICIOUS URL DETECTIONS
23 |       </div>
24 |       <div className="flex flex-col space-y-6 my-8 md:w-[60%]">
25 |         <Form
26 |           setMessageType={setMessageType}
27 |           onChangeMessage={onChangeMessage}
28 |           message={message}
29 |         />
30 |       </div>
31 |       <div className="flex items-center justify-center my-16 lg:m-0">
32 |         <FakeNews messageType={messageType} />
33 |       </div>
34 |     </div>
35 |   );
36 | }
37 | 


--------------------------------------------------------------------------------
/client/src/components/FakeNews.tsx:
--------------------------------------------------------------------------------
 1 | type Props = {
 2 |   // Label returned by the API, "error" when the request failed, or
 3 |   // undefined while nothing has been predicted yet.
 4 |   messageType: string | undefined;
 5 | };
 6 | 
 7 | /**
 8 |  * Banner showing the verdict for the last submitted URL.
 9 |  *
10 |  * - "error": a red failure message.
11 |  * - undefined: an invisible placeholder that reserves the banner's line.
12 |  * - "benign": the verdict in green; any other label: the verdict in red.
13 |  */
14 | export default function FakeNews({ messageType }: Props) {
15 |   if (messageType === "error") {
16 |     return (
17 |       <div role="alert" className="text-red-500 text-xl text-center">
18 |         An error occurred at the level of the API
19 |       </div>
20 |     );
21 |   }
22 | 
23 |   if (messageType === undefined) {
24 |     // Reserve the line without building the string "it's a undefined link".
25 |     return (
26 |       <div aria-hidden="true" className="invisible text-4xl text-center">
27 |         &nbsp;
28 |       </div>
29 |     );
30 |   }
31 | 
32 |   const tone = messageType === "benign" ? "text-green-500" : "text-red-500";
33 |   return (
34 |     // role="status" lets assistive technology announce each new verdict.
35 |     <div role="status" className={`${tone} text-4xl text-center`}>
36 |       {`it's a ${messageType} link`}
37 |     </div>
38 |   );
39 | }
40 | 


--------------------------------------------------------------------------------
/client/src/components/Form.tsx:
--------------------------------------------------------------------------------
 1 | import { ChangeEventHandler, Dispatch, SetStateAction } from "react";
 2 | 
 3 | type Props = {
 4 |   // Current content of the URL input (controlled by the parent).
 5 |   message: string;
 6 |   // Called on every change of the URL input.
 7 |   onChangeMessage: ChangeEventHandler<HTMLInputElement>;
 8 |   // Receives the predicted label, "error" on failure, or undefined to clear.
 9 |   setMessageType: Dispatch<SetStateAction<undefined | string>>;
10 | };
11 | 
12 | // Base URL of the prediction API. Create React App inlines REACT_APP_* variables
13 | // at build time; the fallback keeps the previous local-development default.
14 | const API_BASE_URL = process.env.REACT_APP_API_URL || "http://localhost:5000";
15 | 
16 | /**
17 |  * URL input plus a "Predict" button that posts the URL to the API and
18 |  * reports the outcome through `setMessageType`.
19 |  */
20 | export default function Form(props: Props) {
21 |   const onSubmit = async () => {
22 |     // Nothing to classify: clear any previous verdict and skip the request.
23 |     if (props.message.trim() === "") {
24 |       props.setMessageType(undefined);
25 |       return;
26 |     }
27 |     try {
28 |       const response = await fetch(`${API_BASE_URL}/predict`, {
29 |         method: "POST",
30 |         headers: {
31 |           "Content-Type": "application/json",
32 |         },
33 |         // The API reads the URL from the "news" key.
34 |         body: JSON.stringify({ news: props.message }),
35 |       });
36 |       if (!response.ok) {
37 |         throw new Error(`API responded with status ${response.status}`);
38 |       }
39 |       const data = await response.json();
40 |       const labels: unknown = data?.prediction;
41 |       if (!Array.isArray(labels) || labels.length === 0) {
42 |         throw new Error("API response has no prediction");
43 |       }
44 |       // Same element the previous `.pop()` selected: the last label.
45 |       props.setMessageType(String(labels[labels.length - 1]));
46 |     } catch (error) {
47 |       // Network failure, non-2xx status, invalid JSON or unexpected shape.
48 |       console.error(error);
49 |       props.setMessageType("error");
50 |     }
51 |   };
52 | 
53 |   return (
54 |     <>
55 |       <input
56 |         className="outline-none border-none py-2 px-3"
57 |         id="message"
58 |         type="text"
59 |         aria-label="URL to check"
60 |         value={props.message}
61 |         onChange={props.onChangeMessage}
62 |       />
63 |       <button
64 |         type="button"
65 |         onClick={onSubmit}
66 |         className="bg-blue-700 text-white hover:bg-blue-500 transition-all py-2"
67 |       >
68 |         Predict
69 |       </button>
70 |     </>
71 |   );
72 | }
73 | 


--------------------------------------------------------------------------------
/client/src/index.css:
--------------------------------------------------------------------------------
 1 | body { /* NOTE(review): index.tsx imports only ./styles/globals.css; confirm this stylesheet is still loaded anywhere */
 2 |   margin: 0;
 3 |   font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
 4 |     'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
 5 |     sans-serif;
 6 |   -webkit-font-smoothing: antialiased;
 7 |   -moz-osx-font-smoothing: grayscale;
 8 | }
 9 | 
10 | code { /* monospace font stack for <code> elements */
11 |   font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
12 |     monospace;
13 | }
14 | 


--------------------------------------------------------------------------------
/client/src/index.tsx:
--------------------------------------------------------------------------------
 1 | import "./styles/globals.css";
 2 | import React from "react";
 3 | import ReactDOM from "react-dom/client";
 4 | import App from "./App";
 5 | 
 6 | // Mount point: the element with id "root" (cast because getElementById may return null).
 7 | const container = document.getElementById("root") as HTMLElement;
 8 | 
 9 | ReactDOM.createRoot(container).render(
10 |   <React.StrictMode>
11 |     <App />
12 |   </React.StrictMode>
13 | );
14 | 


--------------------------------------------------------------------------------
/client/src/react-app-env.d.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="react-scripts" />
2 | 


--------------------------------------------------------------------------------
/client/src/styles/globals.css:
--------------------------------------------------------------------------------
 1 | @import url("https://fonts.googleapis.com/css2?family=Roboto+Mono:wght@500&display=swap"); /* Roboto Mono, weight 500, from Google Fonts */
 2 | 
 3 | @tailwind base; /* Tailwind's base layer */
 4 | @tailwind components; /* Tailwind's component classes */
 5 | @tailwind utilities; /* Tailwind's utility classes */
 6 | 
 7 | html,
 8 | body {
 9 |   margin: 0;
10 |   padding: 0;
11 |   font-family: "Roboto Mono", monospace;
12 |   background-color: #0f0f23;
13 |   font-size: max(1.25vw, 1.1rem); /* grows with viewport width, never below 1.1rem */
14 | }
15 | 
16 | *,
17 | *::before,
18 | *::after {
19 |   box-sizing: border-box; /* widths and heights include padding and border */
20 | }
21 | 


--------------------------------------------------------------------------------
/client/tailwind.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('tailwindcss').Config} */
2 | module.exports = {
3 |   content: ["./src/**/*.{js,jsx,ts,tsx}"], // source files Tailwind scans for class names
4 |   theme: {
5 |     extend: {}, // no additions to the default theme
6 |   },
7 |   plugins: [], // no Tailwind plugins registered
8 | };
9 | 


--------------------------------------------------------------------------------
/client/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "es5",
 4 |     "lib": [
 5 |       "dom",
 6 |       "dom.iterable",
 7 |       "esnext"
 8 |     ],
 9 |     "allowJs": true,
10 |     "skipLibCheck": true,
11 |     "esModuleInterop": true,
12 |     "allowSyntheticDefaultImports": true,
13 |     "strict": true,
14 |     "forceConsistentCasingInFileNames": true,
15 |     "noFallthroughCasesInSwitch": true,
16 |     "module": "esnext",
17 |     "moduleResolution": "node",
18 |     "resolveJsonModule": true,
19 |     "isolatedModules": true,
20 |     "noEmit": true,
21 |     "jsx": "react-jsx"
22 |   },
23 |   "include": [
24 |     "src"
25 |   ]
26 | }
27 | 


--------------------------------------------------------------------------------
/demo/benign.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ramzy1453/malicious-url-detection/1b32e638cb8a70f43a6a77677cea5976e12a93dc/demo/benign.png


--------------------------------------------------------------------------------
/demo/defacement.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ramzy1453/malicious-url-detection/1b32e638cb8a70f43a6a77677cea5976e12a93dc/demo/defacement.png


--------------------------------------------------------------------------------
/demo/phishing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ramzy1453/malicious-url-detection/1b32e638cb8a70f43a6a77677cea5976e12a93dc/demo/phishing.png


--------------------------------------------------------------------------------
/model/index.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "code",
  5 |       "execution_count": 16,
  6 |       "metadata": {
  7 |         "colab": {
  8 |           "base_uri": "https://localhost:8080/"
  9 |         },
 10 |         "id": "qD-AJDPl0xyk",
 11 |         "outputId": "e0fcaad1-52f0-48c9-e487-4d890dd64072"
 12 |       },
 13 |       "outputs": [
 14 |         {
 15 |           "name": "stdout",
 16 |           "output_type": "stream",
 17 |           "text": [
 18 |             "-rw-r--r-- 1 root root 68 Jan  9 13:55 kaggle.json\n",
 19 |             "/content\n",
 20 |             "mkdir: cannot create directory ‘datasets’: File exists\n",
 21 |             "Downloading malicious-urls-dataset.zip to /content\n",
 22 |             " 95% 16.0M/16.9M [00:01<00:00, 18.0MB/s]\n",
 23 |             "100% 16.9M/16.9M [00:01<00:00, 10.1MB/s]\n",
 24 |             "datasets     malicious_phish.csv\t sample_data\n",
 25 |             "kaggle.json  malicious-urls-dataset.zip\n",
 26 |             "Archive:  malicious-urls-dataset.zip\n",
 27 |             "replace malicious_phish.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: "
 28 |           ]
 29 |         }
 30 |       ],
 31 |       "source": [
 32 |         "# Run these commands to download the dataset from Kaggle (place kaggle.json in the working directory first)\n",
 33 |         "# !ls -lha kaggle.json \n",
 34 |         "# !pip install -q kaggle\n",
 35 |         "# !mkdir -p ~/.kaggle \n",
 36 |         "# !cp kaggle.json ~/.kaggle/\n",
 37 |         "# !pwd\n",
 38 |         "# !chmod 600 ~/.kaggle/kaggle.json\n",
 39 |         "# !mkdir datasets\n",
 40 |         "# !cd datasets\n",
 41 |         "# !kaggle datasets download -d sid321axn/malicious-urls-dataset\n",
 42 |         "# !ls\n",
 43 |         "# !unzip *.zip\n",
 44 |         "# !rm *.zip"
 45 |       ]
 46 |     },
 47 |     {
 48 |       "cell_type": "code",
 49 |       "execution_count": 17,
 50 |       "metadata": {
 51 |         "id": "rAf_JwDE6kg6"
 52 |       },
 53 |       "outputs": [],
 54 |       "source": [
 55 |         "import numpy as np\n",
 56 |         "import pandas as pd"
 57 |       ]
 58 |     },
 59 |     {
 60 |       "cell_type": "code",
 61 |       "execution_count": 18,
 62 |       "metadata": {
 63 |         "id": "6yKTbfnm2fZV"
 64 |       },
 65 |       "outputs": [],
 66 |       "source": [
 67 |         "df = pd.read_csv('malicious_phish.csv')"
 68 |       ]
 69 |     },
 70 |     {
 71 |       "cell_type": "code",
 72 |       "execution_count": 19,
 73 |       "metadata": {
 74 |         "colab": {
 75 |           "base_uri": "https://localhost:8080/",
 76 |           "height": 224
 77 |         },
 78 |         "id": "q2hmMPEt3Ai2",
 79 |         "outputId": "ac961db5-05a4-40b6-ee21-8f4a5ede9ae8"
 80 |       },
 81 |       "outputs": [
 82 |         {
 83 |           "name": "stdout",
 84 |           "output_type": "stream",
 85 |           "text": [
 86 |             "(651191, 2)\n"
 87 |           ]
 88 |         },
 89 |         {
 90 |           "data": {
 91 |             "text/html": [
 92 |               "\n",
 93 |               "  <div id=\"df-bf599edd-369b-4ef0-9a3f-3237751941ea\">\n",
 94 |               "    <div class=\"colab-df-container\">\n",
 95 |               "      <div>\n",
 96 |               "<style scoped>\n",
 97 |               "    .dataframe tbody tr th:only-of-type {\n",
 98 |               "        vertical-align: middle;\n",
 99 |               "    }\n",
100 |               "\n",
101 |               "    .dataframe tbody tr th {\n",
102 |               "        vertical-align: top;\n",
103 |               "    }\n",
104 |               "\n",
105 |               "    .dataframe thead th {\n",
106 |               "        text-align: right;\n",
107 |               "    }\n",
108 |               "</style>\n",
109 |               "<table border=\"1\" class=\"dataframe\">\n",
110 |               "  <thead>\n",
111 |               "    <tr style=\"text-align: right;\">\n",
112 |               "      <th></th>\n",
113 |               "      <th>url</th>\n",
114 |               "      <th>type</th>\n",
115 |               "    </tr>\n",
116 |               "  </thead>\n",
117 |               "  <tbody>\n",
118 |               "    <tr>\n",
119 |               "      <th>0</th>\n",
120 |               "      <td>br-icloud.com.br</td>\n",
121 |               "      <td>phishing</td>\n",
122 |               "    </tr>\n",
123 |               "    <tr>\n",
124 |               "      <th>1</th>\n",
125 |               "      <td>mp3raid.com/music/krizz_kaliko.html</td>\n",
126 |               "      <td>benign</td>\n",
127 |               "    </tr>\n",
128 |               "    <tr>\n",
129 |               "      <th>2</th>\n",
130 |               "      <td>bopsecrets.org/rexroth/cr/1.htm</td>\n",
131 |               "      <td>benign</td>\n",
132 |               "    </tr>\n",
133 |               "    <tr>\n",
134 |               "      <th>3</th>\n",
135 |               "      <td>http://www.garage-pirenne.be/index.php?option=...</td>\n",
136 |               "      <td>defacement</td>\n",
137 |               "    </tr>\n",
138 |               "    <tr>\n",
139 |               "      <th>4</th>\n",
140 |               "      <td>http://adventure-nicaragua.net/index.php?optio...</td>\n",
141 |               "      <td>defacement</td>\n",
142 |               "    </tr>\n",
143 |               "  </tbody>\n",
144 |               "</table>\n",
145 |               "</div>\n",
146 |               "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-bf599edd-369b-4ef0-9a3f-3237751941ea')\"\n",
147 |               "              title=\"Convert this dataframe to an interactive table.\"\n",
148 |               "              style=\"display:none;\">\n",
149 |               "        \n",
150 |               "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
151 |               "       width=\"24px\">\n",
152 |               "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
153 |               "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
154 |               "  </svg>\n",
155 |               "      </button>\n",
156 |               "      \n",
157 |               "  <style>\n",
158 |               "    .colab-df-container {\n",
159 |               "      display:flex;\n",
160 |               "      flex-wrap:wrap;\n",
161 |               "      gap: 12px;\n",
162 |               "    }\n",
163 |               "\n",
164 |               "    .colab-df-convert {\n",
165 |               "      background-color: #E8F0FE;\n",
166 |               "      border: none;\n",
167 |               "      border-radius: 50%;\n",
168 |               "      cursor: pointer;\n",
169 |               "      display: none;\n",
170 |               "      fill: #1967D2;\n",
171 |               "      height: 32px;\n",
172 |               "      padding: 0 0 0 0;\n",
173 |               "      width: 32px;\n",
174 |               "    }\n",
175 |               "\n",
176 |               "    .colab-df-convert:hover {\n",
177 |               "      background-color: #E2EBFA;\n",
178 |               "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
179 |               "      fill: #174EA6;\n",
180 |               "    }\n",
181 |               "\n",
182 |               "    [theme=dark] .colab-df-convert {\n",
183 |               "      background-color: #3B4455;\n",
184 |               "      fill: #D2E3FC;\n",
185 |               "    }\n",
186 |               "\n",
187 |               "    [theme=dark] .colab-df-convert:hover {\n",
188 |               "      background-color: #434B5C;\n",
189 |               "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
190 |               "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
191 |               "      fill: #FFFFFF;\n",
192 |               "    }\n",
193 |               "  </style>\n",
194 |               "\n",
195 |               "      <script>\n",
196 |               "        const buttonEl =\n",
197 |               "          document.querySelector('#df-bf599edd-369b-4ef0-9a3f-3237751941ea button.colab-df-convert');\n",
198 |               "        buttonEl.style.display =\n",
199 |               "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
200 |               "\n",
201 |               "        async function convertToInteractive(key) {\n",
202 |               "          const element = document.querySelector('#df-bf599edd-369b-4ef0-9a3f-3237751941ea');\n",
203 |               "          const dataTable =\n",
204 |               "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
205 |               "                                                     [key], {});\n",
206 |               "          if (!dataTable) return;\n",
207 |               "\n",
208 |               "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
209 |               "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
210 |               "            + ' to learn more about interactive tables.';\n",
211 |               "          element.innerHTML = '';\n",
212 |               "          dataTable['output_type'] = 'display_data';\n",
213 |               "          await google.colab.output.renderOutput(dataTable, element);\n",
214 |               "          const docLink = document.createElement('div');\n",
215 |               "          docLink.innerHTML = docLinkHtml;\n",
216 |               "          element.appendChild(docLink);\n",
217 |               "        }\n",
218 |               "      </script>\n",
219 |               "    </div>\n",
220 |               "  </div>\n",
221 |               "  "
222 |             ],
223 |             "text/plain": [
224 |               "                                                 url        type\n",
225 |               "0                                   br-icloud.com.br    phishing\n",
226 |               "1                mp3raid.com/music/krizz_kaliko.html      benign\n",
227 |               "2                    bopsecrets.org/rexroth/cr/1.htm      benign\n",
228 |               "3  http://www.garage-pirenne.be/index.php?option=...  defacement\n",
229 |               "4  http://adventure-nicaragua.net/index.php?optio...  defacement"
230 |             ]
231 |           },
232 |           "execution_count": 19,
233 |           "metadata": {},
234 |           "output_type": "execute_result"
235 |         }
236 |       ],
237 |       "source": [
238 |         "print(df.shape)\n",
239 |         "df.head()"
240 |       ]
241 |     },
242 |     {
243 |       "cell_type": "code",
244 |       "execution_count": 20,
245 |       "metadata": {
246 |         "colab": {
247 |           "base_uri": "https://localhost:8080/"
248 |         },
249 |         "id": "e8N-0lSiCNHx",
250 |         "outputId": "3609e1e8-2b79-453c-a002-06f4807bbde2"
251 |       },
252 |       "outputs": [
253 |         {
254 |           "data": {
255 |             "text/plain": [
256 |               "benign        428103\n",
257 |               "defacement     96457\n",
258 |               "phishing       94111\n",
259 |               "malware        32520\n",
260 |               "Name: type, dtype: int64"
261 |             ]
262 |           },
263 |           "execution_count": 20,
264 |           "metadata": {},
265 |           "output_type": "execute_result"
266 |         }
267 |       ],
268 |       "source": [
269 |         "df.type.value_counts()"
270 |       ]
271 |     },
272 |     {
273 |       "cell_type": "code",
274 |       "execution_count": 25,
275 |       "metadata": {
276 |         "id": "f1xUP9DAI7Vn"
277 |       },
278 |       "outputs": [],
279 |       "source": [
280 |         "X = df['url'].values\n",
281 |         "y = df['type'].values"
282 |       ]
283 |     },
284 |     {
285 |       "cell_type": "code",
286 |       "execution_count": 26,
287 |       "metadata": {
288 |         "id": "Z55h_e9S2ivF"
289 |       },
290 |       "outputs": [],
291 |       "source": [
292 |         "from sklearn.model_selection import train_test_split\n",
293 |         "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)"
294 |       ]
295 |     },
296 |     {
297 |       "cell_type": "code",
298 |       "execution_count": 27,
299 |       "metadata": {
300 |         "id": "bvvXc3sLJukN"
301 |       },
302 |       "outputs": [],
303 |       "source": [
304 |         "#@title converting the textual data to numerical\n",
305 |         "from sklearn.feature_extraction.text import TfidfVectorizer\n",
306 |         "\n",
307 |         "tfidf = TfidfVectorizer()\n",
308 |         "\n",
309 |         "X_train = tfidf.fit_transform(X_train)\n",
310 |         "X_test = tfidf.transform(X_test)"
311 |       ]
312 |     },
313 |     {
314 |       "cell_type": "code",
315 |       "execution_count": 36,
316 |       "metadata": {
317 |         "id": "KV2fohj91UpZ"
318 |       },
319 |       "outputs": [],
320 |       "source": [
321 |         "from sklearn.linear_model import LogisticRegression\n",
322 |         "from sklearn.model_selection import GridSearchCV"
323 |       ]
324 |     },
325 |     {
326 |       "cell_type": "code",
327 |       "execution_count": 38,
328 |       "metadata": {
329 |         "colab": {
330 |           "base_uri": "https://localhost:8080/"
331 |         },
332 |         "id": "-jy8Q4x67mSn",
333 |         "outputId": "715d48e7-5409-4087-d313-2ae1c58d929f"
334 |       },
335 |       "outputs": [
336 |         {
337 |           "name": "stdout",
338 |           "output_type": "stream",
339 |           "text": [
340 |             "Fitting 5 folds for each of 14 candidates, totalling 70 fits\n",
341 |             "[CV] END ................................C=0.001, penalty=l1; total time=   0.0s\n",
342 |             "[CV] END ................................C=0.001, penalty=l1; total time=   0.0s\n",
343 |             "[CV] END ................................C=0.001, penalty=l1; total time=   0.0s\n",
344 |             "[CV] END ................................C=0.001, penalty=l1; total time=   0.0s\n",
345 |             "[CV] END ................................C=0.001, penalty=l1; total time=   0.0s\n",
346 |             "[CV] END ................................C=0.001, penalty=l2; total time=  25.8s\n",
347 |             "[CV] END ................................C=0.001, penalty=l2; total time=  25.6s\n",
348 |             "[CV] END ................................C=0.001, penalty=l2; total time=  26.0s\n",
349 |             "[CV] END ................................C=0.001, penalty=l2; total time=  25.5s\n",
350 |             "[CV] END ................................C=0.001, penalty=l2; total time=  26.7s\n",
351 |             "[CV] END .................................C=0.01, penalty=l1; total time=   0.0s\n",
352 |             "[CV] END .................................C=0.01, penalty=l1; total time=   0.0s\n",
353 |             "[CV] END .................................C=0.01, penalty=l1; total time=   0.0s\n",
354 |             "[CV] END .................................C=0.01, penalty=l1; total time=   0.0s\n",
355 |             "[CV] END .................................C=0.01, penalty=l1; total time=   0.0s\n",
356 |             "[CV] END .................................C=0.01, penalty=l2; total time=  51.0s\n",
357 |             "[CV] END .................................C=0.01, penalty=l2; total time=  48.6s\n",
358 |             "[CV] END .................................C=0.01, penalty=l2; total time=  52.3s\n",
359 |             "[CV] END .................................C=0.01, penalty=l2; total time=  51.3s\n",
360 |             "[CV] END .................................C=0.01, penalty=l2; total time=  45.2s\n",
361 |             "[CV] END ..................................C=0.1, penalty=l1; total time=   0.0s\n",
362 |             "[CV] END ..................................C=0.1, penalty=l1; total time=   0.0s\n",
363 |             "[CV] END ..................................C=0.1, penalty=l1; total time=   0.0s\n",
364 |             "[CV] END ..................................C=0.1, penalty=l1; total time=   0.0s\n",
365 |             "[CV] END ..................................C=0.1, penalty=l1; total time=   0.0s\n",
366 |             "[CV] END ..................................C=0.1, penalty=l2; total time= 1.8min\n",
367 |             "[CV] END ..................................C=0.1, penalty=l2; total time= 1.8min\n",
368 |             "[CV] END ..................................C=0.1, penalty=l2; total time= 1.9min\n",
369 |             "[CV] END ..................................C=0.1, penalty=l2; total time= 1.8min\n",
370 |             "[CV] END ..................................C=0.1, penalty=l2; total time= 1.7min\n",
371 |             "[CV] END ..................................C=1.0, penalty=l1; total time=   0.0s\n",
372 |             "[CV] END ..................................C=1.0, penalty=l1; total time=   0.0s\n",
373 |             "[CV] END ..................................C=1.0, penalty=l1; total time=   0.0s\n",
374 |             "[CV] END ..................................C=1.0, penalty=l1; total time=   0.0s\n",
375 |             "[CV] END ..................................C=1.0, penalty=l1; total time=   0.0s\n"
376 |           ]
377 |         },
378 |         {
379 |           "name": "stderr",
380 |           "output_type": "stream",
381 |           "text": [
382 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
383 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
384 |             "\n",
385 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
386 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
387 |             "Please also refer to the documentation for alternative solver options:\n",
388 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
389 |             "  n_iter_i = _check_optimize_result(\n"
390 |           ]
391 |         },
392 |         {
393 |           "name": "stdout",
394 |           "output_type": "stream",
395 |           "text": [
396 |             "[CV] END ..................................C=1.0, penalty=l2; total time= 3.0min\n"
397 |           ]
398 |         },
399 |         {
400 |           "name": "stderr",
401 |           "output_type": "stream",
402 |           "text": [
403 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
404 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
405 |             "\n",
406 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
407 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
408 |             "Please also refer to the documentation for alternative solver options:\n",
409 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
410 |             "  n_iter_i = _check_optimize_result(\n"
411 |           ]
412 |         },
413 |         {
414 |           "name": "stdout",
415 |           "output_type": "stream",
416 |           "text": [
417 |             "[CV] END ..................................C=1.0, penalty=l2; total time= 3.0min\n"
418 |           ]
419 |         },
420 |         {
421 |           "name": "stderr",
422 |           "output_type": "stream",
423 |           "text": [
424 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
425 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
426 |             "\n",
427 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
428 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
429 |             "Please also refer to the documentation for alternative solver options:\n",
430 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
431 |             "  n_iter_i = _check_optimize_result(\n"
432 |           ]
433 |         },
434 |         {
435 |           "name": "stdout",
436 |           "output_type": "stream",
437 |           "text": [
438 |             "[CV] END ..................................C=1.0, penalty=l2; total time= 3.1min\n"
439 |           ]
440 |         },
441 |         {
442 |           "name": "stderr",
443 |           "output_type": "stream",
444 |           "text": [
445 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
446 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
447 |             "\n",
448 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
449 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
450 |             "Please also refer to the documentation for alternative solver options:\n",
451 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
452 |             "  n_iter_i = _check_optimize_result(\n"
453 |           ]
454 |         },
455 |         {
456 |           "name": "stdout",
457 |           "output_type": "stream",
458 |           "text": [
459 |             "[CV] END ..................................C=1.0, penalty=l2; total time= 3.0min\n"
460 |           ]
461 |         },
462 |         {
463 |           "name": "stderr",
464 |           "output_type": "stream",
465 |           "text": [
466 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
467 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
468 |             "\n",
469 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
470 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
471 |             "Please also refer to the documentation for alternative solver options:\n",
472 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
473 |             "  n_iter_i = _check_optimize_result(\n"
474 |           ]
475 |         },
476 |         {
477 |           "name": "stdout",
478 |           "output_type": "stream",
479 |           "text": [
480 |             "[CV] END ..................................C=1.0, penalty=l2; total time= 3.0min\n",
481 |             "[CV] END .................................C=10.0, penalty=l1; total time=   0.0s\n",
482 |             "[CV] END .................................C=10.0, penalty=l1; total time=   0.0s\n",
483 |             "[CV] END .................................C=10.0, penalty=l1; total time=   0.0s\n",
484 |             "[CV] END .................................C=10.0, penalty=l1; total time=   0.0s\n",
485 |             "[CV] END .................................C=10.0, penalty=l1; total time=   0.0s\n"
486 |           ]
487 |         },
488 |         {
489 |           "name": "stderr",
490 |           "output_type": "stream",
491 |           "text": [
492 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
493 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
494 |             "\n",
495 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
496 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
497 |             "Please also refer to the documentation for alternative solver options:\n",
498 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
499 |             "  n_iter_i = _check_optimize_result(\n"
500 |           ]
501 |         },
502 |         {
503 |           "name": "stdout",
504 |           "output_type": "stream",
505 |           "text": [
506 |             "[CV] END .................................C=10.0, penalty=l2; total time= 2.9min\n"
507 |           ]
508 |         },
509 |         {
510 |           "name": "stderr",
511 |           "output_type": "stream",
512 |           "text": [
513 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
514 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
515 |             "\n",
516 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
517 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
518 |             "Please also refer to the documentation for alternative solver options:\n",
519 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
520 |             "  n_iter_i = _check_optimize_result(\n"
521 |           ]
522 |         },
523 |         {
524 |           "name": "stdout",
525 |           "output_type": "stream",
526 |           "text": [
527 |             "[CV] END .................................C=10.0, penalty=l2; total time= 3.0min\n"
528 |           ]
529 |         },
530 |         {
531 |           "name": "stderr",
532 |           "output_type": "stream",
533 |           "text": [
534 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
535 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
536 |             "\n",
537 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
538 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
539 |             "Please also refer to the documentation for alternative solver options:\n",
540 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
541 |             "  n_iter_i = _check_optimize_result(\n"
542 |           ]
543 |         },
544 |         {
545 |           "name": "stdout",
546 |           "output_type": "stream",
547 |           "text": [
548 |             "[CV] END .................................C=10.0, penalty=l2; total time= 3.0min\n"
549 |           ]
550 |         },
551 |         {
552 |           "name": "stderr",
553 |           "output_type": "stream",
554 |           "text": [
555 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
556 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
557 |             "\n",
558 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
559 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
560 |             "Please also refer to the documentation for alternative solver options:\n",
561 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
562 |             "  n_iter_i = _check_optimize_result(\n"
563 |           ]
564 |         },
565 |         {
566 |           "name": "stdout",
567 |           "output_type": "stream",
568 |           "text": [
569 |             "[CV] END .................................C=10.0, penalty=l2; total time= 3.0min\n"
570 |           ]
571 |         },
572 |         {
573 |           "name": "stderr",
574 |           "output_type": "stream",
575 |           "text": [
576 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
577 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
578 |             "\n",
579 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
580 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
581 |             "Please also refer to the documentation for alternative solver options:\n",
582 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
583 |             "  n_iter_i = _check_optimize_result(\n"
584 |           ]
585 |         },
586 |         {
587 |           "name": "stdout",
588 |           "output_type": "stream",
589 |           "text": [
590 |             "[CV] END .................................C=10.0, penalty=l2; total time= 3.0min\n",
591 |             "[CV] END ................................C=100.0, penalty=l1; total time=   0.0s\n",
592 |             "[CV] END ................................C=100.0, penalty=l1; total time=   0.0s\n",
593 |             "[CV] END ................................C=100.0, penalty=l1; total time=   0.0s\n",
594 |             "[CV] END ................................C=100.0, penalty=l1; total time=   0.0s\n",
595 |             "[CV] END ................................C=100.0, penalty=l1; total time=   0.0s\n"
596 |           ]
597 |         },
598 |         {
599 |           "name": "stderr",
600 |           "output_type": "stream",
601 |           "text": [
602 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
603 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
604 |             "\n",
605 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
606 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
607 |             "Please also refer to the documentation for alternative solver options:\n",
608 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
609 |             "  n_iter_i = _check_optimize_result(\n"
610 |           ]
611 |         },
612 |         {
613 |           "name": "stdout",
614 |           "output_type": "stream",
615 |           "text": [
616 |             "[CV] END ................................C=100.0, penalty=l2; total time= 3.0min\n"
617 |           ]
618 |         },
619 |         {
620 |           "name": "stderr",
621 |           "output_type": "stream",
622 |           "text": [
623 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
624 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
625 |             "\n",
626 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
627 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
628 |             "Please also refer to the documentation for alternative solver options:\n",
629 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
630 |             "  n_iter_i = _check_optimize_result(\n"
631 |           ]
632 |         },
633 |         {
634 |           "name": "stdout",
635 |           "output_type": "stream",
636 |           "text": [
637 |             "[CV] END ................................C=100.0, penalty=l2; total time= 3.0min\n"
638 |           ]
639 |         },
640 |         {
641 |           "name": "stderr",
642 |           "output_type": "stream",
643 |           "text": [
644 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
645 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
646 |             "\n",
647 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
648 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
649 |             "Please also refer to the documentation for alternative solver options:\n",
650 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
651 |             "  n_iter_i = _check_optimize_result(\n"
652 |           ]
653 |         },
654 |         {
655 |           "name": "stdout",
656 |           "output_type": "stream",
657 |           "text": [
658 |             "[CV] END ................................C=100.0, penalty=l2; total time= 3.0min\n"
659 |           ]
660 |         },
661 |         {
662 |           "name": "stderr",
663 |           "output_type": "stream",
664 |           "text": [
665 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
666 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
667 |             "\n",
668 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
669 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
670 |             "Please also refer to the documentation for alternative solver options:\n",
671 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
672 |             "  n_iter_i = _check_optimize_result(\n"
673 |           ]
674 |         },
675 |         {
676 |           "name": "stdout",
677 |           "output_type": "stream",
678 |           "text": [
679 |             "[CV] END ................................C=100.0, penalty=l2; total time= 3.0min\n"
680 |           ]
681 |         },
682 |         {
683 |           "name": "stderr",
684 |           "output_type": "stream",
685 |           "text": [
686 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
687 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
688 |             "\n",
689 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
690 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
691 |             "Please also refer to the documentation for alternative solver options:\n",
692 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
693 |             "  n_iter_i = _check_optimize_result(\n"
694 |           ]
695 |         },
696 |         {
697 |           "name": "stdout",
698 |           "output_type": "stream",
699 |           "text": [
700 |             "[CV] END ................................C=100.0, penalty=l2; total time= 3.0min\n",
701 |             "[CV] END ...............................C=1000.0, penalty=l1; total time=   0.0s\n",
702 |             "[CV] END ...............................C=1000.0, penalty=l1; total time=   0.0s\n",
703 |             "[CV] END ...............................C=1000.0, penalty=l1; total time=   0.0s\n",
704 |             "[CV] END ...............................C=1000.0, penalty=l1; total time=   0.0s\n",
705 |             "[CV] END ...............................C=1000.0, penalty=l1; total time=   0.0s\n"
706 |           ]
707 |         },
708 |         {
709 |           "name": "stderr",
710 |           "output_type": "stream",
711 |           "text": [
712 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
713 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
714 |             "\n",
715 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
716 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
717 |             "Please also refer to the documentation for alternative solver options:\n",
718 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
719 |             "  n_iter_i = _check_optimize_result(\n"
720 |           ]
721 |         },
722 |         {
723 |           "name": "stdout",
724 |           "output_type": "stream",
725 |           "text": [
726 |             "[CV] END ...............................C=1000.0, penalty=l2; total time= 3.0min\n"
727 |           ]
728 |         },
729 |         {
730 |           "name": "stderr",
731 |           "output_type": "stream",
732 |           "text": [
733 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
734 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
735 |             "\n",
736 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
737 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
738 |             "Please also refer to the documentation for alternative solver options:\n",
739 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
740 |             "  n_iter_i = _check_optimize_result(\n"
741 |           ]
742 |         },
743 |         {
744 |           "name": "stdout",
745 |           "output_type": "stream",
746 |           "text": [
747 |             "[CV] END ...............................C=1000.0, penalty=l2; total time= 3.0min\n"
748 |           ]
749 |         },
750 |         {
751 |           "name": "stderr",
752 |           "output_type": "stream",
753 |           "text": [
754 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
755 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
756 |             "\n",
757 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
758 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
759 |             "Please also refer to the documentation for alternative solver options:\n",
760 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
761 |             "  n_iter_i = _check_optimize_result(\n"
762 |           ]
763 |         },
764 |         {
765 |           "name": "stdout",
766 |           "output_type": "stream",
767 |           "text": [
768 |             "[CV] END ...............................C=1000.0, penalty=l2; total time= 3.0min\n"
769 |           ]
770 |         },
771 |         {
772 |           "name": "stderr",
773 |           "output_type": "stream",
774 |           "text": [
775 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
776 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
777 |             "\n",
778 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
779 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
780 |             "Please also refer to the documentation for alternative solver options:\n",
781 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
782 |             "  n_iter_i = _check_optimize_result(\n"
783 |           ]
784 |         },
785 |         {
786 |           "name": "stdout",
787 |           "output_type": "stream",
788 |           "text": [
789 |             "[CV] END ...............................C=1000.0, penalty=l2; total time= 3.0min\n"
790 |           ]
791 |         },
792 |         {
793 |           "name": "stderr",
794 |           "output_type": "stream",
795 |           "text": [
796 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
797 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
798 |             "\n",
799 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
800 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
801 |             "Please also refer to the documentation for alternative solver options:\n",
802 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
803 |             "  n_iter_i = _check_optimize_result(\n",
804 |             "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py:372: FitFailedWarning: \n",
805 |             "35 fits failed out of a total of 70.\n",
806 |             "The score on these train-test partitions for these parameters will be set to nan.\n",
807 |             "If these failures are not expected, you can try to debug them by setting error_score='raise'.\n",
808 |             "\n",
809 |             "Below are more details about the failures:\n",
810 |             "--------------------------------------------------------------------------------\n",
811 |             "35 fits failed with the following error:\n",
812 |             "Traceback (most recent call last):\n",
813 |             "  File \"/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py\", line 680, in _fit_and_score\n",
814 |             "    estimator.fit(X_train, y_train, **fit_params)\n",
815 |             "  File \"/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py\", line 1461, in fit\n",
816 |             "    solver = _check_solver(self.solver, self.penalty, self.dual)\n",
817 |             "  File \"/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py\", line 447, in _check_solver\n",
818 |             "    raise ValueError(\n",
819 |             "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
820 |             "\n",
821 |             "  warnings.warn(some_fits_failed_message, FitFailedWarning)\n",
822 |             "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_search.py:969: UserWarning: One or more of the test scores are non-finite: [       nan 0.7389173         nan 0.86208838        nan 0.92426686\n",
823 |             "        nan 0.94845676        nan 0.95565725        nan 0.9561043\n",
824 |             "        nan 0.95504811]\n",
825 |             "  warnings.warn(\n"
826 |           ]
827 |         },
828 |         {
829 |           "name": "stdout",
830 |           "output_type": "stream",
831 |           "text": [
832 |             "[CV] END ...............................C=1000.0, penalty=l2; total time= 3.0min\n",
833 |             "tuned hpyerparameters :(best parameters)  {'C': 100.0, 'penalty': 'l2'}\n",
834 |             "accuracy : 0.9561042957793336\n"
835 |           ]
836 |         },
837 |         {
838 |           "name": "stderr",
839 |           "output_type": "stream",
840 |           "text": [
841 |             "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
842 |             "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
843 |             "\n",
844 |             "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
845 |             "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
846 |             "Please also refer to the documentation for alternative solver options:\n",
847 |             "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
848 |             "  n_iter_i = _check_optimize_result(\n"
849 |           ]
850 |         }
851 |       ],
852 |       "source": [
853 |         "# saga accepts both l1 and l2 penalties; the default lbfgs solver rejects l1, which made half of the grid fail with NaN scores\n",
854 |         "grid={\"C\":np.logspace(-3,3,7), \"penalty\":[\"l1\",\"l2\"]}\n",
855 |         "logreg_cv=GridSearchCV(LogisticRegression(solver=\"saga\", max_iter=300), grid)\n",
856 |         "logreg_cv.fit(X_train,y_train)\n"
857 |       ]
858 |     },
859 |     {
860 |       "cell_type": "code",
861 |       "execution_count": 39,
862 |       "metadata": {
863 |         "colab": {
864 |           "base_uri": "https://localhost:8080/"
865 |         },
866 |         "id": "iuJ46jtVcRY_",
867 |         "outputId": "44929a87-5efa-489a-fbe0-6f1f6a1ab9ff"
868 |       },
869 |       "outputs": [
870 |         {
871 |           "name": "stdout",
872 |           "output_type": "stream",
873 |           "text": [
874 |             "tuned hpyerparameters :(best parameters)  {'C': 100.0, 'penalty': 'l2'}\n",
875 |             "accuracy : 0.9561042957793336\n"
876 |           ]
877 |         }
878 |       ],
879 |       "source": [
880 |         "print(\"tuned hyperparameters :(best parameters) \",logreg_cv.best_params_)\n",
881 |         "print(\"accuracy :\",logreg_cv.best_score_)  # mean cross-validated score of the best candidate"
882 |       ]
883 |     },
884 |     {
885 |       "cell_type": "code",
886 |       "execution_count": 41,
887 |       "metadata": {
888 |         "colab": {
889 |           "base_uri": "https://localhost:8080/"
890 |         },
891 |         "id": "NHi7tEjl3O64",
892 |         "outputId": "d7265bbe-1b30-4463-f3f6-0712be1b347f"
893 |       },
894 |       "outputs": [
895 |         {
896 |           "data": {
897 |             "text/plain": [
898 |               "['tfidf.pkl']"
899 |             ]
900 |           },
901 |           "execution_count": 41,
902 |           "metadata": {},
903 |           "output_type": "execute_result"
904 |         }
905 |       ],
906 |       "source": [
907 |         "import joblib\n",
908 |         "joblib.dump(logreg_cv, 'model.pkl')\n",
909 |         "joblib.dump(tfidf, 'tfidf.pkl')"
910 |       ]
911 |     },
912 |     {
913 |       "cell_type": "code",
914 |       "execution_count": null,
915 |       "metadata": {
916 |         "id": "AjuMBy0IcS7I"
917 |       },
918 |       "outputs": [],
919 |       "source": []
920 |     }
921 |   ],
922 |   "metadata": {
923 |     "accelerator": "GPU",
924 |     "colab": {
925 |       "provenance": []
926 |     },
927 |     "gpuClass": "standard",
928 |     "kernelspec": {
929 |       "display_name": "Python 3",
930 |       "language": "python",
931 |       "name": "python3"
932 |     },
933 |     "language_info": {
934 |       "codemirror_mode": {
935 |         "name": "ipython",
936 |         "version": 3
937 |       },
938 |       "file_extension": ".py",
939 |       "mimetype": "text/x-python",
940 |       "name": "python",
941 |       "nbconvert_exporter": "python",
942 |       "pygments_lexer": "ipython3",
943 |       "version": "3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]"
944 |     },
945 |     "vscode": {
946 |       "interpreter": {
947 |         "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
948 |       }
949 |     }
950 |   },
951 |   "nbformat": 4,
952 |   "nbformat_minor": 0
953 | }
954 | 


--------------------------------------------------------------------------------