"
577 | ],
578 | "text/plain": [
579 | " Dst Port Protocol Timestamp Flow Duration Tot Fwd Pkts \\\n",
580 | "0 0 0 15/02/2018 08:25:18 112641158 3 \n",
581 | "1 22 6 15/02/2018 08:29:05 37366762 14 \n",
582 | "2 47514 6 15/02/2018 08:29:42 543 2 \n",
583 | "3 0 0 15/02/2018 08:28:07 112640703 3 \n",
584 | "4 0 0 15/02/2018 08:30:56 112640874 3 \n",
585 | "\n",
586 | " Tot Bwd Pkts TotLen Fwd Pkts TotLen Bwd Pkts Fwd Pkt Len Max \\\n",
587 | "0 0 0 0 0 \n",
588 | "1 12 2168 2993 712 \n",
589 | "2 0 64 0 64 \n",
590 | "3 0 0 0 0 \n",
591 | "4 0 0 0 0 \n",
592 | "\n",
593 | " Fwd Pkt Len Min Fwd Pkt Len Mean Fwd Pkt Len Std Bwd Pkt Len Max \\\n",
594 | "0 0 0.000000 0.000000 0 \n",
595 | "1 0 154.857143 254.855270 976 \n",
596 | "2 0 32.000000 45.254834 0 \n",
597 | "3 0 0.000000 0.000000 0 \n",
598 | "4 0 0.000000 0.000000 0 \n",
599 | "\n",
600 | " Bwd Pkt Len Min Bwd Pkt Len Mean Bwd Pkt Len Std Flow Byts/s \\\n",
601 | "0 0 0.000000 0.000000 0.000000 \n",
602 | "1 0 249.416667 395.929392 138.117400 \n",
603 | "2 0 0.000000 0.000000 117863.720074 \n",
604 | "3 0 0.000000 0.000000 0.000000 \n",
605 | "4 0 0.000000 0.000000 0.000000 \n",
606 | "\n",
607 | " Flow Pkts/s Flow IAT Mean Flow IAT Std Flow IAT Max Flow IAT Min \\\n",
608 | "0 0.026633 56320579.00 7.042784e+02 56321077 56320081 \n",
609 | "1 0.695806 1494670.48 3.894924e+06 15617415 7 \n",
610 | "2 3683.241252 543.00 0.000000e+00 543 543 \n",
611 | "3 0.026633 56320351.50 3.669884e+02 56320611 56320092 \n",
612 | "4 0.026633 56320437.00 7.198347e+02 56320946 56319928 \n",
613 | "\n",
614 | " Fwd IAT Tot Fwd IAT Mean Fwd IAT Std Fwd IAT Max Fwd IAT Min \\\n",
615 | "0 112641158 5.632058e+07 7.042784e+02 56321077 56320081 \n",
616 | "1 37366762 2.874366e+06 5.104444e+06 15617415 40 \n",
617 | "2 543 5.430000e+02 0.000000e+00 543 543 \n",
618 | "3 112640703 5.632035e+07 3.669884e+02 56320611 56320092 \n",
619 | "4 112640874 5.632044e+07 7.198347e+02 56320946 56319928 \n",
620 | "\n",
621 | " Bwd IAT Tot Bwd IAT Mean Bwd IAT Std Bwd IAT Max Bwd IAT Min \\\n",
622 | "0 0 0.000000e+00 0.000000e+00 0 0 \n",
623 | "1 37366730 3.396975e+06 5.564224e+06 15936762 893 \n",
624 | "2 0 0.000000e+00 0.000000e+00 0 0 \n",
625 | "3 0 0.000000e+00 0.000000e+00 0 0 \n",
626 | "4 0 0.000000e+00 0.000000e+00 0 0 \n",
627 | "\n",
628 | " Fwd PSH Flags Bwd PSH Flags Fwd URG Flags Bwd URG Flags Fwd Header Len \\\n",
629 | "0 0 0 0 0 0 \n",
630 | "1 0 0 0 0 456 \n",
631 | "2 1 0 0 0 64 \n",
632 | "3 0 0 0 0 0 \n",
633 | "4 0 0 0 0 0 \n",
634 | "\n",
635 | " Bwd Header Len Fwd Pkts/s Bwd Pkts/s Pkt Len Min Pkt Len Max \\\n",
636 | "0 0 0.026633 0.000000 0 0 \n",
637 | "1 416 0.374665 0.321141 0 976 \n",
638 | "2 0 3683.241252 0.000000 0 64 \n",
639 | "3 0 0.026633 0.000000 0 0 \n",
640 | "4 0 0.026633 0.000000 0 0 \n",
641 | "\n",
642 | " Pkt Len Mean Pkt Len Std Pkt Len Var FIN Flag Cnt SYN Flag Cnt \\\n",
643 | "0 0.000000 0.000000 0.000000 0 0 \n",
644 | "1 191.148148 320.122898 102478.669516 0 0 \n",
645 | "2 42.666667 36.950417 1365.333333 0 1 \n",
646 | "3 0.000000 0.000000 0.000000 0 0 \n",
647 | "4 0.000000 0.000000 0.000000 0 0 \n",
648 | "\n",
649 | " RST Flag Cnt PSH Flag Cnt ACK Flag Cnt URG Flag Cnt CWE Flag Count \\\n",
650 | "0 0 0 0 0 0 \n",
651 | "1 0 1 0 0 0 \n",
652 | "2 0 0 1 0 0 \n",
653 | "3 0 0 0 0 0 \n",
654 | "4 0 0 0 0 0 \n",
655 | "\n",
656 | " ECE Flag Cnt Down/Up Ratio Pkt Size Avg Fwd Seg Size Avg \\\n",
657 | "0 0 0 0.0 0.000000 \n",
658 | "1 0 0 198.5 154.857143 \n",
659 | "2 0 0 64.0 32.000000 \n",
660 | "3 0 0 0.0 0.000000 \n",
661 | "4 0 0 0.0 0.000000 \n",
662 | "\n",
663 | " Bwd Seg Size Avg Fwd Byts/b Avg Fwd Pkts/b Avg Fwd Blk Rate Avg \\\n",
664 | "0 0.000000 0 0 0 \n",
665 | "1 249.416667 0 0 0 \n",
666 | "2 0.000000 0 0 0 \n",
667 | "3 0.000000 0 0 0 \n",
668 | "4 0.000000 0 0 0 \n",
669 | "\n",
670 | " Bwd Byts/b Avg Bwd Pkts/b Avg Bwd Blk Rate Avg Subflow Fwd Pkts \\\n",
671 | "0 0 0 0 3 \n",
672 | "1 0 0 0 14 \n",
673 | "2 0 0 0 2 \n",
674 | "3 0 0 0 3 \n",
675 | "4 0 0 0 3 \n",
676 | "\n",
677 | " Subflow Fwd Byts Subflow Bwd Pkts Subflow Bwd Byts Init Fwd Win Byts \\\n",
678 | "0 0 0 0 -1 \n",
679 | "1 2168 12 2993 29200 \n",
680 | "2 64 0 0 244 \n",
681 | "3 0 0 0 -1 \n",
682 | "4 0 0 0 -1 \n",
683 | "\n",
684 | " Init Bwd Win Byts Fwd Act Data Pkts Fwd Seg Size Min Active Mean \\\n",
685 | "0 -1 0 0 0.0 \n",
686 | "1 232 8 32 1024353.0 \n",
687 | "2 -1 0 32 0.0 \n",
688 | "3 -1 0 0 0.0 \n",
689 | "4 -1 0 0 0.0 \n",
690 | "\n",
691 | " Active Std Active Max Active Min Idle Mean Idle Std Idle Max \\\n",
692 | "0 0.000000 0 0 56320579.0 7.042784e+02 56321077 \n",
693 | "1 649038.754495 1601183 321569 11431221.0 3.644991e+06 15617415 \n",
694 | "2 0.000000 0 0 0.0 0.000000e+00 0 \n",
695 | "3 0.000000 0 0 56320351.5 3.669884e+02 56320611 \n",
696 | "4 0.000000 0 0 56320437.0 7.198347e+02 56320946 \n",
697 | "\n",
698 | " Idle Min Label \n",
699 | "0 56320081 Benign \n",
700 | "1 8960247 Benign \n",
701 | "2 0 Benign \n",
702 | "3 56320092 Benign \n",
703 | "4 56319928 Benign "
704 | ]
705 | },
706 | "execution_count": 6,
707 | "metadata": {},
708 | "output_type": "execute_result"
709 | }
710 | ],
711 | "source": [
712 | "data.head()"
713 | ]
714 | },
715 | {
716 | "cell_type": "code",
717 | "execution_count": 9,
718 | "metadata": {},
719 | "outputs": [
720 | {
721 | "data": {
722 | "text/plain": [
723 | "array([ 0, 6, 17], dtype=int64)"
724 | ]
725 | },
726 | "execution_count": 9,
727 | "metadata": {},
728 | "output_type": "execute_result"
729 | }
730 | ],
731 | "source": [
732 | "data['Protocol'].unique()"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": null,
738 | "metadata": {},
739 | "outputs": [],
740 | "source": []
741 | }
742 | ],
743 | "metadata": {
744 | "kernelspec": {
745 | "display_name": "Python 3",
746 | "language": "python",
747 | "name": "python3"
748 | },
749 | "language_info": {
750 | "codemirror_mode": {
751 | "name": "ipython",
752 | "version": 3
753 | },
754 | "file_extension": ".py",
755 | "mimetype": "text/x-python",
756 | "name": "python",
757 | "nbconvert_exporter": "python",
758 | "pygments_lexer": "ipython3",
759 | "version": "3.7.9"
760 | }
761 | },
762 | "nbformat": 4,
763 | "nbformat_minor": 4
764 | }
765 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year> <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | <program> Copyright (C) <year> <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/feature_engineering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd\n",
11 | "from scipy.stats import pearsonr\n",
12 | "from sklearn.tree import DecisionTreeClassifier\n",
13 | "from sklearn.feature_selection import chi2, mutual_info_classif, RFE, SelectFromModel, SelectKBest, VarianceThreshold\n",
14 | "from sklearn.linear_model import LogisticRegression\n",
15 | "import warnings"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 2,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "# Show every column when a wide DataFrame is displayed.\n",
25 | "pd.options.display.max_columns = None\n",
26 | "# Make pandas treat +/-inf as missing, so isnull()/fillna() below also cover infinite values.\n",
27 | "# NOTE(review): this option is deprecated in recent pandas releases - confirm against the pinned version.\n",
28 | "pd.set_option('mode.use_inf_as_na', True)\n",
29 | "# Silence every warning for the remainder of the notebook.\n",
30 | "warnings.filterwarnings(action='ignore')"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# Raw string: the backslashes of this Windows path must not be parsed as escape\n",
36 | "# sequences ('\\I', '\\d' and '\\T' are invalid escapes in a normal string literal).\n",
37 | "# NOTE(review): absolute, machine-specific location - adjust DATA_PATH for your environment.\n",
38 | "DATA_PATH = r'G:\\IDS2018\\datasets\\Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv'\n",
39 | "data = pd.read_csv(DATA_PATH)"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "# 数据预处理"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 4,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "# Numerical features: every column whose dtype is not 'object'.\n",
52 | "numerical_fea = data.select_dtypes(exclude=['object']).columns.tolist()\n",
53 | "# Categorical features: all remaining columns, kept in their original order.\n",
54 | "category_fea = [col for col in data.columns if col not in numerical_fea]"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 5,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "def get_numerical_serial_fea(data, feas, max_unique=10):\n",
62 | "    \"\"\"Split features into high-cardinality and low-cardinality groups.\n",
63 | "\n",
64 | "    A feature goes into the low-cardinality ('noserial') group when it has at\n",
65 | "    most ``max_unique`` distinct values, and into the high-cardinality\n",
66 | "    ('serial') group otherwise.\n",
67 | "\n",
68 | "    Parameters\n",
69 | "    ----------\n",
70 | "    data : pandas.DataFrame\n",
71 | "        Frame containing every column named in ``feas``.\n",
72 | "    feas : iterable of str\n",
73 | "        Names of the columns to classify.\n",
74 | "    max_unique : int, default 10\n",
75 | "        Largest number of distinct values a feature may have and still be\n",
76 | "        placed in the low-cardinality group.\n",
77 | "\n",
78 | "    Returns\n",
79 | "    -------\n",
80 | "    tuple of (list, list)\n",
81 | "        ``(numerical_serial_fea, numerical_noserial_fea)``, each in the order\n",
82 | "        given by ``feas``.\n",
83 | "    \"\"\"\n",
84 | "    numerical_serial_fea = []\n",
85 | "    numerical_noserial_fea = []\n",
86 | "    for fea in feas:\n",
87 | "        # nunique() does not count missing values by default.\n",
88 | "        if data[fea].nunique() <= max_unique:\n",
89 | "            numerical_noserial_fea.append(fea)\n",
90 | "        else:\n",
91 | "            numerical_serial_fea.append(fea)\n",
92 | "    return numerical_serial_fea, numerical_noserial_fea\n",
71 | "# Classify the numerical columns of the loaded data set.\n",
72 | "numerical_serial_fea, numerical_noserial_fea = get_numerical_serial_fea(data=data, feas=numerical_fea)"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 36,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/plain": [
82 | "{'Dst Port': 0,\n",
83 | " 'Protocol': 0,\n",
84 | " 'Timestamp': 0,\n",
85 | " 'Flow Duration': 0,\n",
86 | " 'Tot Fwd Pkts': 0,\n",
87 | " 'Tot Bwd Pkts': 0,\n",
88 | " 'TotLen Fwd Pkts': 0,\n",
89 | " 'TotLen Bwd Pkts': 0,\n",
90 | " 'Fwd Pkt Len Max': 0,\n",
91 | " 'Fwd Pkt Len Min': 0,\n",
92 | " 'Fwd Pkt Len Mean': 0,\n",
93 | " 'Fwd Pkt Len Std': 0,\n",
94 | " 'Bwd Pkt Len Max': 0,\n",
95 | " 'Bwd Pkt Len Min': 0,\n",
96 | " 'Bwd Pkt Len Mean': 0,\n",
97 | " 'Bwd Pkt Len Std': 0,\n",
98 | " 'Flow Byts/s': 8027,\n",
99 | " 'Flow Pkts/s': 8027,\n",
100 | " 'Flow IAT Mean': 0,\n",
101 | " 'Flow IAT Std': 0,\n",
102 | " 'Flow IAT Max': 0,\n",
103 | " 'Flow IAT Min': 0,\n",
104 | " 'Fwd IAT Tot': 0,\n",
105 | " 'Fwd IAT Mean': 0,\n",
106 | " 'Fwd IAT Std': 0,\n",
107 | " 'Fwd IAT Max': 0,\n",
108 | " 'Fwd IAT Min': 0,\n",
109 | " 'Bwd IAT Tot': 0,\n",
110 | " 'Bwd IAT Mean': 0,\n",
111 | " 'Bwd IAT Std': 0,\n",
112 | " 'Bwd IAT Max': 0,\n",
113 | " 'Bwd IAT Min': 0,\n",
114 | " 'Fwd PSH Flags': 0,\n",
115 | " 'Bwd PSH Flags': 0,\n",
116 | " 'Fwd URG Flags': 0,\n",
117 | " 'Bwd URG Flags': 0,\n",
118 | " 'Fwd Header Len': 0,\n",
119 | " 'Bwd Header Len': 0,\n",
120 | " 'Fwd Pkts/s': 0,\n",
121 | " 'Bwd Pkts/s': 0,\n",
122 | " 'Pkt Len Min': 0,\n",
123 | " 'Pkt Len Max': 0,\n",
124 | " 'Pkt Len Mean': 0,\n",
125 | " 'Pkt Len Std': 0,\n",
126 | " 'Pkt Len Var': 0,\n",
127 | " 'FIN Flag Cnt': 0,\n",
128 | " 'SYN Flag Cnt': 0,\n",
129 | " 'RST Flag Cnt': 0,\n",
130 | " 'PSH Flag Cnt': 0,\n",
131 | " 'ACK Flag Cnt': 0,\n",
132 | " 'URG Flag Cnt': 0,\n",
133 | " 'CWE Flag Count': 0,\n",
134 | " 'ECE Flag Cnt': 0,\n",
135 | " 'Down/Up Ratio': 0,\n",
136 | " 'Pkt Size Avg': 0,\n",
137 | " 'Fwd Seg Size Avg': 0,\n",
138 | " 'Bwd Seg Size Avg': 0,\n",
139 | " 'Fwd Byts/b Avg': 0,\n",
140 | " 'Fwd Pkts/b Avg': 0,\n",
141 | " 'Fwd Blk Rate Avg': 0,\n",
142 | " 'Bwd Byts/b Avg': 0,\n",
143 | " 'Bwd Pkts/b Avg': 0,\n",
144 | " 'Bwd Blk Rate Avg': 0,\n",
145 | " 'Subflow Fwd Pkts': 0,\n",
146 | " 'Subflow Fwd Byts': 0,\n",
147 | " 'Subflow Bwd Pkts': 0,\n",
148 | " 'Subflow Bwd Byts': 0,\n",
149 | " 'Init Fwd Win Byts': 0,\n",
150 | " 'Init Bwd Win Byts': 0,\n",
151 | " 'Fwd Act Data Pkts': 0,\n",
152 | " 'Fwd Seg Size Min': 0,\n",
153 | " 'Active Mean': 0,\n",
154 | " 'Active Std': 0,\n",
155 | " 'Active Max': 0,\n",
156 | " 'Active Min': 0,\n",
157 | " 'Idle Mean': 0,\n",
158 | " 'Idle Std': 0,\n",
159 | " 'Idle Max': 0,\n",
160 | " 'Idle Min': 0,\n",
161 | " 'Label': 0}"
162 | ]
163 | },
164 | "execution_count": 36,
165 | "metadata": {},
166 | "output_type": "execute_result"
167 | }
168 | ],
169 | "source": [
170 | "# Number of missing values in each column, as a {column: count} dict.\n",
171 | "data.isna().sum().to_dict()"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 6,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "# Fill the missing values of the high-cardinality numerical features with each column's mean.\n",
180 | "serial_column_means = data[numerical_serial_fea].mean()\n",
181 | "data[numerical_serial_fea] = data[numerical_serial_fea].fillna(serial_column_means)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 9,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/plain": [
190 | "{'Dst Port': 0,\n",
191 | " 'Protocol': 0,\n",
192 | " 'Timestamp': 0,\n",
193 | " 'Flow Duration': 0,\n",
194 | " 'Tot Fwd Pkts': 0,\n",
195 | " 'Tot Bwd Pkts': 0,\n",
196 | " 'TotLen Fwd Pkts': 0,\n",
197 | " 'TotLen Bwd Pkts': 0,\n",
198 | " 'Fwd Pkt Len Max': 0,\n",
199 | " 'Fwd Pkt Len Min': 0,\n",
200 | " 'Fwd Pkt Len Mean': 0,\n",
201 | " 'Fwd Pkt Len Std': 0,\n",
202 | " 'Bwd Pkt Len Max': 0,\n",
203 | " 'Bwd Pkt Len Min': 0,\n",
204 | " 'Bwd Pkt Len Mean': 0,\n",
205 | " 'Bwd Pkt Len Std': 0,\n",
206 | " 'Flow Byts/s': 0,\n",
207 | " 'Flow Pkts/s': 0,\n",
208 | " 'Flow IAT Mean': 0,\n",
209 | " 'Flow IAT Std': 0,\n",
210 | " 'Flow IAT Max': 0,\n",
211 | " 'Flow IAT Min': 0,\n",
212 | " 'Fwd IAT Tot': 0,\n",
213 | " 'Fwd IAT Mean': 0,\n",
214 | " 'Fwd IAT Std': 0,\n",
215 | " 'Fwd IAT Max': 0,\n",
216 | " 'Fwd IAT Min': 0,\n",
217 | " 'Bwd IAT Tot': 0,\n",
218 | " 'Bwd IAT Mean': 0,\n",
219 | " 'Bwd IAT Std': 0,\n",
220 | " 'Bwd IAT Max': 0,\n",
221 | " 'Bwd IAT Min': 0,\n",
222 | " 'Fwd PSH Flags': 0,\n",
223 | " 'Bwd PSH Flags': 0,\n",
224 | " 'Fwd URG Flags': 0,\n",
225 | " 'Bwd URG Flags': 0,\n",
226 | " 'Fwd Header Len': 0,\n",
227 | " 'Bwd Header Len': 0,\n",
228 | " 'Fwd Pkts/s': 0,\n",
229 | " 'Bwd Pkts/s': 0,\n",
230 | " 'Pkt Len Min': 0,\n",
231 | " 'Pkt Len Max': 0,\n",
232 | " 'Pkt Len Mean': 0,\n",
233 | " 'Pkt Len Std': 0,\n",
234 | " 'Pkt Len Var': 0,\n",
235 | " 'FIN Flag Cnt': 0,\n",
236 | " 'SYN Flag Cnt': 0,\n",
237 | " 'RST Flag Cnt': 0,\n",
238 | " 'PSH Flag Cnt': 0,\n",
239 | " 'ACK Flag Cnt': 0,\n",
240 | " 'URG Flag Cnt': 0,\n",
241 | " 'CWE Flag Count': 0,\n",
242 | " 'ECE Flag Cnt': 0,\n",
243 | " 'Down/Up Ratio': 0,\n",
244 | " 'Pkt Size Avg': 0,\n",
245 | " 'Fwd Seg Size Avg': 0,\n",
246 | " 'Bwd Seg Size Avg': 0,\n",
247 | " 'Fwd Byts/b Avg': 0,\n",
248 | " 'Fwd Pkts/b Avg': 0,\n",
249 | " 'Fwd Blk Rate Avg': 0,\n",
250 | " 'Bwd Byts/b Avg': 0,\n",
251 | " 'Bwd Pkts/b Avg': 0,\n",
252 | " 'Bwd Blk Rate Avg': 0,\n",
253 | " 'Subflow Fwd Pkts': 0,\n",
254 | " 'Subflow Fwd Byts': 0,\n",
255 | " 'Subflow Bwd Pkts': 0,\n",
256 | " 'Subflow Bwd Byts': 0,\n",
257 | " 'Init Fwd Win Byts': 0,\n",
258 | " 'Init Bwd Win Byts': 0,\n",
259 | " 'Fwd Act Data Pkts': 0,\n",
260 | " 'Fwd Seg Size Min': 0,\n",
261 | " 'Active Mean': 0,\n",
262 | " 'Active Std': 0,\n",
263 | " 'Active Max': 0,\n",
264 | " 'Active Min': 0,\n",
265 | " 'Idle Mean': 0,\n",
266 | " 'Idle Std': 0,\n",
267 | " 'Idle Max': 0,\n",
268 | " 'Idle Min': 0,\n",
269 | " 'Label': 0}"
270 | ]
271 | },
272 | "execution_count": 9,
273 | "metadata": {},
274 | "output_type": "execute_result"
275 | }
276 | ],
277 | "source": [
278 | "# Number of missing values in each column, as a {column: count} dict.\n",
279 | "data.isna().sum().to_dict()"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 7,
284 | "metadata": {},
285 | "outputs": [],
286 | "source": [
287 | "# Parse the 'Timestamp' strings (day/month/year hour:minute:second) into datetime values.\n",
288 | "timestamp_format = '%d/%m/%Y %H:%M:%S'\n",
289 | "data['Timestamp'] = pd.to_datetime(data['Timestamp'], format=timestamp_format)"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 7,
293 | "metadata": {},
294 | "outputs": [],
295 | "source": [
296 | "# Binary target: 0 for 'Benign', 1 for either DoS attack class.\n",
297 | "# The result is assigned back to the column instead of calling\n",
298 | "# replace(..., inplace=True) on data['Label']: that is a chained in-place call\n",
299 | "# on a selection and leaves `data` unchanged when pandas Copy-on-Write is active.\n",
300 | "label_codes = {\n",
301 | "    'Benign': 0,\n",
302 | "    'DoS attacks-GoldenEye': 1,\n",
303 | "    'DoS attacks-Slowloris': 1,\n",
304 | "}\n",
305 | "data['Label'] = data['Label'].replace(label_codes)"
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "# 异常值处理"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 10,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "def find_outliers_by_3segama(data, fea):\n",
315 | "    \"\"\"Flag the values of one feature that violate the 3-sigma rule.\n",
316 | "\n",
317 | "    A value is labelled as an outlier when it lies more than three standard\n",
318 | "    deviations above or below the mean of its column. Values that compare\n",
319 | "    false against both bounds (for example NaN) are labelled as normal.\n",
320 | "\n",
321 | "    Parameters\n",
322 | "    ----------\n",
323 | "    data : pandas.DataFrame\n",
324 | "        Frame holding the feature. It is modified in place.\n",
325 | "    fea : str\n",
326 | "        Name of the numerical column to inspect.\n",
327 | "\n",
328 | "    Returns\n",
329 | "    -------\n",
330 | "    pandas.DataFrame\n",
331 | "        The same ``data`` object, extended with the column\n",
332 | "        ``fea + '_outliers'`` holding '异常值' (outlier) or '正常值' (normal).\n",
333 | "    \"\"\"\n",
334 | "    values = data[fea]\n",
335 | "    data_std = np.std(values)\n",
336 | "    data_mean = np.mean(values)\n",
337 | "    outliers_cut_off = 3 * data_std\n",
338 | "    lower_rule = data_mean - outliers_cut_off\n",
339 | "    upper_rule = data_mean + outliers_cut_off\n",
340 | "    # One vectorised comparison over the whole column instead of calling a\n",
341 | "    # Python lambda once per row.\n",
342 | "    is_outlier = (values > upper_rule) | (values < lower_rule)\n",
343 | "    data[fea + '_outliers'] = np.where(is_outlier, '异常值', '正常值')\n",
344 | "    return data"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 27,
327 | "metadata": {},
328 | "outputs": [
329 | {
330 | "name": "stdout",
331 | "output_type": "stream",
332 | "text": [
333 | "正常值 1039789\n",
334 | "异常值 8786\n",
335 | "Name: Dst Port_outliers, dtype: int64\n",
336 | "Dst Port_outliers\n",
337 | "异常值 0\n",
338 | "正常值 52498\n",
339 | "Name: Label, dtype: int64\n",
340 | "**********\n",
341 | "正常值 991764\n",
342 | "异常值 56811\n",
343 | "Name: Flow Duration_outliers, dtype: int64\n",
344 | "Flow Duration_outliers\n",
345 | "异常值 280\n",
346 | "正常值 52218\n",
347 | "Name: Label, dtype: int64\n",
348 | "**********\n",
349 | "正常值 1046745\n",
350 | "异常值 1830\n",
351 | "Name: Tot Fwd Pkts_outliers, dtype: int64\n",
352 | "Tot Fwd Pkts_outliers\n",
353 | "异常值 0\n",
354 | "正常值 52498\n",
355 | "Name: Label, dtype: int64\n",
356 | "**********\n",
357 | "正常值 1046952\n",
358 | "异常值 1623\n",
359 | "Name: Tot Bwd Pkts_outliers, dtype: int64\n",
360 | "Tot Bwd Pkts_outliers\n",
361 | "异常值 0\n",
362 | "正常值 52498\n",
363 | "Name: Label, dtype: int64\n",
364 | "**********\n",
365 | "正常值 1048549\n",
366 | "异常值 26\n",
367 | "Name: TotLen Fwd Pkts_outliers, dtype: int64\n",
368 | "TotLen Fwd Pkts_outliers\n",
369 | "异常值 0\n",
370 | "正常值 52498\n",
371 | "Name: Label, dtype: int64\n",
372 | "**********\n",
373 | "正常值 1047070\n",
374 | "异常值 1505\n",
375 | "Name: TotLen Bwd Pkts_outliers, dtype: int64\n",
376 | "TotLen Bwd Pkts_outliers\n",
377 | "异常值 0\n",
378 | "正常值 52498\n",
379 | "Name: Label, dtype: int64\n",
380 | "**********\n",
381 | "正常值 1038021\n",
382 | "异常值 10554\n",
383 | "Name: Fwd Pkt Len Max_outliers, dtype: int64\n",
384 | "Fwd Pkt Len Max_outliers\n",
385 | "异常值 0\n",
386 | "正常值 52498\n",
387 | "Name: Label, dtype: int64\n",
388 | "**********\n",
389 | "正常值 1045594\n",
390 | "异常值 2981\n",
391 | "Name: Fwd Pkt Len Min_outliers, dtype: int64\n",
392 | "Fwd Pkt Len Min_outliers\n",
393 | "异常值 98\n",
394 | "正常值 52400\n",
395 | "Name: Label, dtype: int64\n",
396 | "**********\n",
397 | "正常值 1039579\n",
398 | "异常值 8996\n",
399 | "Name: Fwd Pkt Len Mean_outliers, dtype: int64\n",
400 | "Fwd Pkt Len Mean_outliers\n",
401 | "异常值 1178\n",
402 | "正常值 51320\n",
403 | "Name: Label, dtype: int64\n",
404 | "**********\n",
405 | "正常值 1037266\n",
406 | "异常值 11309\n",
407 | "Name: Fwd Pkt Len Std_outliers, dtype: int64\n",
408 | "Fwd Pkt Len Std_outliers\n",
409 | "异常值 634\n",
410 | "正常值 51864\n",
411 | "Name: Label, dtype: int64\n",
412 | "**********\n",
413 | "正常值 1048568\n",
414 | "异常值 7\n",
415 | "Name: Bwd Pkt Len Max_outliers, dtype: int64\n",
416 | "Bwd Pkt Len Max_outliers\n",
417 | "异常值 0\n",
418 | "正常值 52498\n",
419 | "Name: Label, dtype: int64\n",
420 | "**********\n",
421 | "正常值 1033705\n",
422 | "异常值 14870\n",
423 | "Name: Bwd Pkt Len Min_outliers, dtype: int64\n",
424 | "Bwd Pkt Len Min_outliers\n",
425 | "异常值 0\n",
426 | "正常值 52498\n",
427 | "Name: Label, dtype: int64\n",
428 | "**********\n",
429 | "正常值 1022827\n",
430 | "异常值 25748\n",
431 | "Name: Bwd Pkt Len Mean_outliers, dtype: int64\n",
432 | "Bwd Pkt Len Mean_outliers\n",
433 | "异常值 0\n",
434 | "正常值 52498\n",
435 | "Name: Label, dtype: int64\n",
436 | "**********\n",
437 | "正常值 1047631\n",
438 | "异常值 944\n",
439 | "Name: Bwd Pkt Len Std_outliers, dtype: int64\n",
440 | "Bwd Pkt Len Std_outliers\n",
441 | "异常值 0\n",
442 | "正常值 52498\n",
443 | "Name: Label, dtype: int64\n",
444 | "**********\n",
445 | "正常值 1043268\n",
446 | "异常值 5307\n",
447 | "Name: Flow Byts/s_outliers, dtype: int64\n",
448 | "Flow Byts/s_outliers\n",
449 | "异常值 4\n",
450 | "正常值 52494\n",
451 | "Name: Label, dtype: int64\n",
452 | "**********\n",
453 | "正常值 1031972\n",
454 | "异常值 16603\n",
455 | "Name: Flow Pkts/s_outliers, dtype: int64\n",
456 | "Flow Pkts/s_outliers\n",
457 | "异常值 458\n",
458 | "正常值 52040\n",
459 | "Name: Label, dtype: int64\n",
460 | "**********\n",
461 | "正常值 1004984\n",
462 | "异常值 43591\n",
463 | "Name: Flow IAT Mean_outliers, dtype: int64\n",
464 | "Flow IAT Mean_outliers\n",
465 | "异常值 3115\n",
466 | "正常值 49383\n",
467 | "Name: Label, dtype: int64\n",
468 | "**********\n",
469 | "正常值 1033696\n",
470 | "异常值 14879\n",
471 | "Name: Flow IAT Std_outliers, dtype: int64\n",
472 | "Flow IAT Std_outliers\n",
473 | "异常值 3698\n",
474 | "正常值 48800\n",
475 | "Name: Label, dtype: int64\n",
476 | "**********\n",
477 | "正常值 1021705\n",
478 | "异常值 26870\n",
479 | "Name: Flow IAT Max_outliers, dtype: int64\n",
480 | "Flow IAT Max_outliers\n",
481 | "异常值 3778\n",
482 | "正常值 48720\n",
483 | "Name: Label, dtype: int64\n",
484 | "**********\n",
485 | "正常值 1006778\n",
486 | "异常值 41797\n",
487 | "Name: Flow IAT Min_outliers, dtype: int64\n",
488 | "Flow IAT Min_outliers\n",
489 | "异常值 1862\n",
490 | "正常值 50636\n",
491 | "Name: Label, dtype: int64\n",
492 | "**********\n",
493 | "正常值 991938\n",
494 | "异常值 56637\n",
495 | "Name: Fwd IAT Tot_outliers, dtype: int64\n",
496 | "Fwd IAT Tot_outliers\n",
497 | "异常值 281\n",
498 | "正常值 52217\n",
499 | "Name: Label, dtype: int64\n",
500 | "**********\n",
501 | "正常值 1002872\n",
502 | "异常值 45703\n",
503 | "Name: Fwd IAT Mean_outliers, dtype: int64\n",
504 | "Fwd IAT Mean_outliers\n",
505 | "异常值 4781\n",
506 | "正常值 47717\n",
507 | "Name: Label, dtype: int64\n",
508 | "**********\n",
509 | "正常值 1010889\n",
510 | "异常值 37686\n",
511 | "Name: Fwd IAT Std_outliers, dtype: int64\n",
512 | "Fwd IAT Std_outliers\n",
513 | "异常值 239\n",
514 | "正常值 52259\n",
515 | "Name: Label, dtype: int64\n",
516 | "**********\n",
517 | "正常值 1004804\n",
518 | "异常值 43771\n",
519 | "Name: Fwd IAT Max_outliers, dtype: int64\n",
520 | "Fwd IAT Max_outliers\n",
521 | "异常值 3778\n",
522 | "正常值 48720\n",
523 | "Name: Label, dtype: int64\n",
524 | "**********\n",
525 | "正常值 1002826\n",
526 | "异常值 45749\n",
527 | "Name: Fwd IAT Min_outliers, dtype: int64\n",
528 | "Fwd IAT Min_outliers\n",
529 | "异常值 4902\n",
530 | "正常值 47596\n",
531 | "Name: Label, dtype: int64\n",
532 | "**********\n",
533 | "正常值 979526\n",
534 | "异常值 69049\n",
535 | "Name: Bwd IAT Tot_outliers, dtype: int64\n",
536 | "Bwd IAT Tot_outliers\n",
537 | "异常值 7142\n",
538 | "正常值 45356\n",
539 | "Name: Label, dtype: int64\n",
540 | "**********\n",
541 | "正常值 1035909\n",
542 | "异常值 12666\n",
543 | "Name: Bwd IAT Mean_outliers, dtype: int64\n",
544 | "Bwd IAT Mean_outliers\n",
545 | "异常值 7353\n",
546 | "正常值 45145\n",
547 | "Name: Label, dtype: int64\n",
548 | "**********\n",
549 | "正常值 1019297\n",
550 | "异常值 29278\n",
551 | "Name: Bwd IAT Std_outliers, dtype: int64\n",
552 | "Bwd IAT Std_outliers\n",
553 | "异常值 5001\n",
554 | "正常值 47497\n",
555 | "Name: Label, dtype: int64\n",
556 | "**********\n",
557 | "正常值 1007151\n",
558 | "异常值 41424\n",
559 | "Name: Bwd IAT Max_outliers, dtype: int64\n",
560 | "Bwd IAT Max_outliers\n",
561 | "异常值 7196\n",
562 | "正常值 45302\n",
563 | "Name: Label, dtype: int64\n",
564 | "**********\n",
565 | "正常值 1037488\n",
566 | "异常值 11087\n",
567 | "Name: Bwd IAT Min_outliers, dtype: int64\n",
568 | "Bwd IAT Min_outliers\n",
569 | "异常值 7150\n",
570 | "正常值 45348\n",
571 | "Name: Label, dtype: int64\n",
572 | "**********\n",
573 | "正常值 1046973\n",
574 | "异常值 1602\n",
575 | "Name: Fwd Header Len_outliers, dtype: int64\n",
576 | "Fwd Header Len_outliers\n",
577 | "异常值 0\n",
578 | "正常值 52498\n",
579 | "Name: Label, dtype: int64\n",
580 | "**********\n",
581 | "正常值 1046933\n",
582 | "异常值 1642\n",
583 | "Name: Bwd Header Len_outliers, dtype: int64\n",
584 | "Bwd Header Len_outliers\n",
585 | "异常值 0\n",
586 | "正常值 52498\n",
587 | "Name: Label, dtype: int64\n",
588 | "**********\n",
589 | "正常值 1032449\n",
590 | "异常值 16126\n",
591 | "Name: Fwd Pkts/s_outliers, dtype: int64\n",
592 | "Fwd Pkts/s_outliers\n",
593 | "异常值 55\n",
594 | "正常值 52443\n",
595 | "Name: Label, dtype: int64\n",
596 | "**********\n",
597 | "正常值 1046807\n",
598 | "异常值 1768\n",
599 | "Name: Bwd Pkts/s_outliers, dtype: int64\n",
600 | "Bwd Pkts/s_outliers\n",
601 | "异常值 754\n",
602 | "正常值 51744\n",
603 | "Name: Label, dtype: int64\n",
604 | "**********\n",
605 | "正常值 1044348\n",
606 | "异常值 4227\n",
607 | "Name: Pkt Len Min_outliers, dtype: int64\n",
608 | "Pkt Len Min_outliers\n",
609 | "异常值 0\n",
610 | "正常值 52498\n",
611 | "Name: Label, dtype: int64\n",
612 | "**********\n",
613 | "正常值 1048557\n",
614 | "异常值 18\n",
615 | "Name: Pkt Len Max_outliers, dtype: int64\n",
616 | "Pkt Len Max_outliers\n",
617 | "异常值 0\n",
618 | "正常值 52498\n",
619 | "Name: Label, dtype: int64\n",
620 | "**********\n",
621 | "正常值 1029538\n",
622 | "异常值 19037\n",
623 | "Name: Pkt Len Mean_outliers, dtype: int64\n",
624 | "Pkt Len Mean_outliers\n",
625 | "异常值 0\n",
626 | "正常值 52498\n",
627 | "Name: Label, dtype: int64\n",
628 | "**********\n",
629 | "正常值 1029158\n",
630 | "异常值 19417\n",
631 | "Name: Pkt Len Std_outliers, dtype: int64\n",
632 | "Pkt Len Std_outliers\n",
633 | "异常值 0\n",
634 | "正常值 52498\n",
635 | "Name: Label, dtype: int64\n",
636 | "**********\n",
637 | "正常值 1048554\n",
638 | "异常值 21\n",
639 | "Name: Pkt Len Var_outliers, dtype: int64\n",
640 | "Pkt Len Var_outliers\n",
641 | "异常值 0\n",
642 | "正常值 52498\n",
643 | "Name: Label, dtype: int64\n",
644 | "**********\n",
645 | "正常值 1044334\n",
646 | "异常值 4241\n",
647 | "Name: Down/Up Ratio_outliers, dtype: int64\n",
648 | "Down/Up Ratio_outliers\n",
649 | "异常值 0\n",
650 | "正常值 52498\n",
651 | "Name: Label, dtype: int64\n",
652 | "**********\n",
653 | "正常值 1028615\n",
654 | "异常值 19960\n",
655 | "Name: Pkt Size Avg_outliers, dtype: int64\n",
656 | "Pkt Size Avg_outliers\n",
657 | "异常值 2\n",
658 | "正常值 52496\n",
659 | "Name: Label, dtype: int64\n",
660 | "**********\n",
661 | "正常值 1039579\n",
662 | "异常值 8996\n",
663 | "Name: Fwd Seg Size Avg_outliers, dtype: int64\n",
664 | "Fwd Seg Size Avg_outliers\n",
665 | "异常值 1178\n",
666 | "正常值 51320\n",
667 | "Name: Label, dtype: int64\n",
668 | "**********\n",
669 | "正常值 1022827\n",
670 | "异常值 25748\n",
671 | "Name: Bwd Seg Size Avg_outliers, dtype: int64\n",
672 | "Bwd Seg Size Avg_outliers\n",
673 | "异常值 0\n",
674 | "正常值 52498\n",
675 | "Name: Label, dtype: int64\n",
676 | "**********\n",
677 | "正常值 1046745\n",
678 | "异常值 1830\n",
679 | "Name: Subflow Fwd Pkts_outliers, dtype: int64\n",
680 | "Subflow Fwd Pkts_outliers\n",
681 | "异常值 0\n",
682 | "正常值 52498\n",
683 | "Name: Label, dtype: int64\n",
684 | "**********\n",
685 | "正常值 1048549\n",
686 | "异常值 26\n",
687 | "Name: Subflow Fwd Byts_outliers, dtype: int64\n",
688 | "Subflow Fwd Byts_outliers\n",
689 | "异常值 0\n",
690 | "正常值 52498\n",
691 | "Name: Label, dtype: int64\n",
692 | "**********\n",
693 | "正常值 1046952\n",
694 | "异常值 1623\n",
695 | "Name: Subflow Bwd Pkts_outliers, dtype: int64\n",
696 | "Subflow Bwd Pkts_outliers\n",
697 | "异常值 0\n",
698 | "正常值 52498\n",
699 | "Name: Label, dtype: int64\n",
700 | "**********\n",
701 | "正常值 1047070\n",
702 | "异常值 1505\n",
703 | "Name: Subflow Bwd Byts_outliers, dtype: int64\n",
704 | "Subflow Bwd Byts_outliers\n",
705 | "异常值 0\n",
706 | "正常值 52498\n",
707 | "Name: Label, dtype: int64\n",
708 | "**********\n",
709 | "正常值 1025928\n",
710 | "异常值 22647\n",
711 | "Name: Init Fwd Win Byts_outliers, dtype: int64\n",
712 | "Init Fwd Win Byts_outliers\n",
713 | "异常值 0\n",
714 | "正常值 52498\n",
715 | "Name: Label, dtype: int64\n",
716 | "**********\n",
717 | "正常值 966856\n",
718 | "异常值 81719\n",
719 | "Name: Init Bwd Win Byts_outliers, dtype: int64\n",
720 | "Init Bwd Win Byts_outliers\n",
721 | "异常值 0\n",
722 | "正常值 52498\n",
723 | "Name: Label, dtype: int64\n",
724 | "**********\n",
725 | "正常值 1044073\n",
726 | "异常值 4502\n",
727 | "Name: Fwd Act Data Pkts_outliers, dtype: int64\n",
728 | "Fwd Act Data Pkts_outliers\n",
729 | "异常值 0\n",
730 | "正常值 52498\n",
731 | "Name: Label, dtype: int64\n",
732 | "**********\n",
733 | "正常值 1033732\n",
734 | "异常值 14843\n",
735 | "Name: Fwd Seg Size Min_outliers, dtype: int64\n",
736 | "Fwd Seg Size Min_outliers\n",
737 | "异常值 2232\n",
738 | "正常值 50266\n",
739 | "Name: Label, dtype: int64\n",
740 | "**********\n",
741 | "正常值 1034611\n",
742 | "异常值 13964\n",
743 | "Name: Active Mean_outliers, dtype: int64\n",
744 | "Active Mean_outliers\n",
745 | "异常值 5598\n",
746 | "正常值 46900\n",
747 | "Name: Label, dtype: int64\n",
748 | "**********\n"
749 | ]
750 | },
751 | {
752 | "name": "stdout",
753 | "output_type": "stream",
754 | "text": [
755 | "正常值 1034351\n",
756 | "异常值 14224\n",
757 | "Name: Active Std_outliers, dtype: int64\n",
758 | "Active Std_outliers\n",
759 | "异常值 4935\n",
760 | "正常值 47563\n",
761 | "Name: Label, dtype: int64\n",
762 | "**********\n",
763 | "正常值 1032734\n",
764 | "异常值 15841\n",
765 | "Name: Active Max_outliers, dtype: int64\n",
766 | "Active Max_outliers\n",
767 | "异常值 5557\n",
768 | "正常值 46941\n",
769 | "Name: Label, dtype: int64\n",
770 | "**********\n",
771 | "正常值 1035242\n",
772 | "异常值 13333\n",
773 | "Name: Active Min_outliers, dtype: int64\n",
774 | "Active Min_outliers\n",
775 | "异常值 5463\n",
776 | "正常值 47035\n",
777 | "Name: Label, dtype: int64\n",
778 | "**********\n",
779 | "正常值 996157\n",
780 | "异常值 52418\n",
781 | "Name: Idle Mean_outliers, dtype: int64\n",
782 | "Idle Mean_outliers\n",
783 | "异常值 4229\n",
784 | "正常值 48269\n",
785 | "Name: Label, dtype: int64\n",
786 | "**********\n",
787 | "正常值 1037445\n",
788 | "异常值 11130\n",
789 | "Name: Idle Std_outliers, dtype: int64\n",
790 | "Idle Std_outliers\n",
791 | "异常值 4788\n",
792 | "正常值 47710\n",
793 | "Name: Label, dtype: int64\n",
794 | "**********\n",
795 | "正常值 988121\n",
796 | "异常值 60454\n",
797 | "Name: Idle Max_outliers, dtype: int64\n",
798 | "Idle Max_outliers\n",
799 | "异常值 8686\n",
800 | "正常值 43812\n",
801 | "Name: Label, dtype: int64\n",
802 | "**********\n",
803 | "正常值 996490\n",
804 | "异常值 52085\n",
805 | "Name: Idle Min_outliers, dtype: int64\n",
806 | "Idle Min_outliers\n",
807 | "异常值 4277\n",
808 | "正常值 48221\n",
809 | "Name: Label, dtype: int64\n",
810 | "**********\n"
811 | ]
812 | }
813 | ],
814 | "source": [
815 | "for fea in numerical_serial_fea:\n",
816 | "    data = find_outliers_by_3segama(data, fea)\n",
817 | "    print(data[f'{fea}_outliers'].value_counts())\n",
818 | "    print(data.groupby(f'{fea}_outliers')['Label'].sum())\n",
819 | "    print('*' * 10)"
820 | ]
821 | },
822 | {
823 | "cell_type": "markdown",
824 | "metadata": {},
825 | "source": [
826 | "# 特征选择"
827 | ]
828 | },
829 | {
830 | "cell_type": "code",
831 | "execution_count": 47,
832 | "metadata": {},
833 | "outputs": [
834 | {
835 | "name": "stdout",
836 | "output_type": "stream",
837 | "text": [
838 | "Selected: ['Dst Port' 'Flow Duration' 'Tot Fwd Pkts' 'Tot Bwd Pkts'\n",
839 | " 'TotLen Fwd Pkts' 'TotLen Bwd Pkts' 'Fwd Pkt Len Max' 'Fwd Pkt Len Min'\n",
840 | " 'Fwd Pkt Len Mean' 'Fwd Pkt Len Std' 'Bwd Pkt Len Max' 'Bwd Pkt Len Min'\n",
841 | " 'Bwd Pkt Len Mean' 'Bwd Pkt Len Std' 'Flow Byts/s' 'Flow Pkts/s'\n",
842 | " 'Flow IAT Mean' 'Flow IAT Std' 'Flow IAT Max' 'Flow IAT Min'\n",
843 | " 'Fwd IAT Tot' 'Fwd IAT Mean' 'Fwd IAT Std' 'Fwd IAT Max' 'Fwd IAT Min'\n",
844 | " 'Bwd IAT Tot' 'Bwd IAT Mean' 'Bwd IAT Std' 'Bwd IAT Max' 'Bwd IAT Min'\n",
845 | " 'Fwd Header Len' 'Bwd Header Len' 'Fwd Pkts/s' 'Bwd Pkts/s' 'Pkt Len Min'\n",
846 | " 'Pkt Len Max' 'Pkt Len Mean' 'Pkt Len Std' 'Pkt Len Var' 'Pkt Size Avg'\n",
847 | " 'Fwd Seg Size Avg' 'Bwd Seg Size Avg' 'Subflow Fwd Pkts'\n",
848 | " 'Subflow Fwd Byts' 'Subflow Bwd Pkts' 'Subflow Bwd Byts'\n",
849 | " 'Init Fwd Win Byts' 'Init Bwd Win Byts' 'Fwd Act Data Pkts'\n",
850 | " 'Fwd Seg Size Min' 'Active Mean' 'Active Std' 'Active Max' 'Active Min'\n",
851 | " 'Idle Mean' 'Idle Std' 'Idle Max' 'Idle Min']\n",
852 | "Deleted: ['Down/Up Ratio']\n"
853 | ]
854 | }
855 | ],
856 | "source": [
857 | "selector = VarianceThreshold(threshold=3).fit(data[numerical_serial_fea])\n",
858 | "features_mask = selector.get_support(indices=True)\n",
859 | "selected_features = np.array(numerical_serial_fea)[features_mask]\n",
860 | "deleted_features = [name for name in numerical_serial_fea if name not in selected_features]\n",
861 | "print('Selected:', selected_features)\n",
862 | "print('Deleted: ', deleted_features)"
863 | ]
864 | },
865 | {
866 | "cell_type": "code",
867 | "execution_count": 12,
868 | "metadata": {},
869 | "outputs": [
870 | {
871 | "data": {
872 | "text/plain": [
873 | "[('Fwd Seg Size Min', (0.4827216248545034, 0.0)),\n",
874 | " ('Bwd IAT Mean', (0.30192251129739955, 0.0)),\n",
875 | " ('Init Fwd Win Byts', (0.2684559690822708, 0.0)),\n",
876 | " ('Bwd IAT Min', (0.2500177147030524, 0.0)),\n",
877 | " ('Flow IAT Std', (0.21858340497356774, 0.0)),\n",
878 | " ('Bwd IAT Max', (0.19606408350148533, 0.0)),\n",
879 | " ('Bwd IAT Std', (0.18719441138249784, 0.0)),\n",
880 | " ('Idle Max', (0.18100280839192964, 0.0)),\n",
881 | " ('Fwd Pkt Len Std', (0.16886516326958462, 0.0)),\n",
882 | " ('Idle Std', (0.15481787043716266, 0.0))]"
883 | ]
884 | },
885 | "execution_count": 12,
886 | "metadata": {},
887 | "output_type": "execute_result"
888 | }
889 | ],
890 | "source": [
891 | "pearsonr_result = [\n",
892 | "    (name, pearsonr(data[name], data['Label'])) for name in numerical_serial_fea\n",
893 | "]\n",
894 | "sorted(pearsonr_result, key=lambda pair: abs(pair[1][0]), reverse=True)[:10]"
895 | ]
896 | },
897 | {
898 | "cell_type": "code",
899 | "execution_count": 21,
900 | "metadata": {},
901 | "outputs": [
902 | {
903 | "name": "stdout",
904 | "output_type": "stream",
905 | "text": [
906 | "Dst Port\n",
907 | "(array([3.99751145e+08]), array([0.]))\n",
908 | "Flow Duration\n",
909 | "(array([3.59172003e+11]), array([0.]))\n",
910 | "Tot Fwd Pkts\n",
911 | "(array([3778.26711379]), array([0.]))\n",
912 | "Tot Bwd Pkts\n",
913 | "(array([147047.64832782]), array([0.]))\n",
914 | "TotLen Fwd Pkts\n",
915 | "(array([634718.39574114]), array([0.]))\n",
916 | "TotLen Bwd Pkts\n",
917 | "(array([2.24665013e+08]), array([0.]))\n",
918 | "Fwd Pkt Len Max\n",
919 | "(array([3126399.65719183]), array([0.]))\n",
920 | "Fwd Pkt Len Min\n",
921 | "(array([672290.30465466]), array([0.]))\n",
922 | "Fwd Pkt Len Mean\n",
923 | "(array([1008178.14621648]), array([0.]))\n",
924 | "Fwd Pkt Len Std\n",
925 | "(array([4910010.25013736]), array([0.]))\n",
926 | "Bwd Pkt Len Max\n",
927 | "(array([2426276.39917122]), array([0.]))\n",
928 | "Bwd Pkt Len Min\n",
929 | "(array([1859688.16916162]), array([0.]))\n",
930 | "Bwd Pkt Len Mean\n",
931 | "(array([14682.06813255]), array([0.]))\n",
932 | "Bwd Pkt Len Std\n",
933 | "(array([6324182.09067248]), array([0.]))\n",
934 | "Flow Byts/s\n",
935 | "(array([1.5768512e+10]), array([0.]))\n",
936 | "Flow Pkts/s\n",
937 | "(array([1.00582909e+09]), array([0.]))\n",
938 | "Flow IAT Mean\n",
939 | "(array([2.96727475e+11]), array([0.]))\n",
940 | "Flow IAT Std\n",
941 | "(array([9.98169939e+11]), array([0.]))\n",
942 | "Flow IAT Max\n",
943 | "(array([1.02174605e+12]), array([0.]))\n",
944 | "Flow IAT Min\n",
945 | "(array([2.28890909e+10]), array([0.]))\n",
946 | "Fwd IAT Tot\n",
947 | "(array([2.86848641e+11]), array([0.]))\n",
948 | "Fwd IAT Mean\n",
949 | "(array([9.49019226e+11]), array([0.]))\n",
950 | "Fwd IAT Std\n",
951 | "(array([4.93833027e+10]), array([0.]))\n",
952 | "Fwd IAT Max\n",
953 | "(array([9.7532279e+11]), array([0.]))\n",
954 | "Fwd IAT Min\n",
955 | "(array([1.06993311e+12]), array([0.]))\n",
956 | "Bwd IAT Tot\n",
957 | "(array([3.67104733e+11]), array([0.]))\n",
958 | "Bwd IAT Mean\n",
959 | "(array([3.57822439e+12]), array([0.]))\n",
960 | "Bwd IAT Std\n",
961 | "(array([5.31186971e+11]), array([0.]))\n",
962 | "Bwd IAT Max\n",
963 | "(array([1.79577127e+12]), array([0.]))\n",
964 | "Bwd IAT Min\n",
965 | "(array([4.16347887e+12]), array([0.]))\n",
966 | "Fwd Header Len\n",
967 | "(array([1134722.69775687]), array([0.]))\n",
968 | "Bwd Header Len\n",
969 | "(array([1334952.53878541]), array([0.]))\n",
970 | "Fwd Pkts/s\n",
971 | "(array([1.3340166e+09]), array([0.]))\n",
972 | "Bwd Pkts/s\n",
973 | "(array([1.27108414e+08]), array([0.]))\n",
974 | "Pkt Len Min\n",
975 | "(array([763088.03979813]), array([0.]))\n",
976 | "Pkt Len Max\n",
977 | "(array([3235866.47097282]), array([0.]))\n",
978 | "Pkt Len Mean\n",
979 | "(array([79343.84160227]), array([0.]))\n",
980 | "Pkt Len Std\n",
981 | "(array([2061704.30901782]), array([0.]))\n",
982 | "Pkt Len Var\n",
983 | "(array([2.48367375e+08]), array([0.]))\n",
984 | "Down/Up Ratio\n",
985 | "(array([3734.48793754]), array([0.]))\n",
986 | "Pkt Size Avg\n",
987 | "(array([40190.97203183]), array([0.]))\n",
988 | "Fwd Seg Size Avg\n",
989 | "(array([1008178.14621648]), array([0.]))\n",
990 | "Bwd Seg Size Avg\n",
991 | "(array([14682.06813255]), array([0.]))\n",
992 | "Subflow Fwd Pkts\n",
993 | "(array([3778.26711379]), array([0.]))\n",
994 | "Subflow Fwd Byts\n",
995 | "(array([634718.39574114]), array([0.]))\n",
996 | "Subflow Bwd Pkts\n",
997 | "(array([147047.64832782]), array([0.]))\n",
998 | "Subflow Bwd Byts\n",
999 | "(array([2.24665013e+08]), array([0.]))\n",
1000 | "Fwd Act Data Pkts\n",
1001 | "(array([870.91484755]), array([2.06394786e-191]))\n",
1002 | "Fwd Seg Size Min\n",
1003 | "(array([823283.21674079]), array([0.]))\n",
1004 | "Active Mean\n",
1005 | "(array([1.42494306e+11]), array([0.]))\n",
1006 | "Active Std\n",
1007 | "(array([3.00722196e+10]), array([0.]))\n",
1008 | "Active Max\n",
1009 | "(array([1.15913032e+11]), array([0.]))\n",
1010 | "Active Min\n",
1011 | "(array([1.15899376e+11]), array([0.]))\n",
1012 | "Idle Mean\n",
1013 | "(array([1.11834743e+12]), array([0.]))\n",
1014 | "Idle Std\n",
1015 | "(array([5.2196894e+11]), array([0.]))\n",
1016 | "Idle Max\n",
1017 | "(array([1.64438117e+12]), array([0.]))\n",
1018 | "Idle Min\n",
1019 | "(array([8.75959539e+11]), array([0.]))\n"
1020 | ]
1021 | }
1022 | ],
1023 | "source": [
1024 | "# Run chi2 on one feature at a time to find the features it cannot be applied to\n",
1025 | "for fea in (name for name in numerical_serial_fea if name not in ('Init Fwd Win Byts', 'Init Bwd Win Byts')):\n",
1026 | "    print(fea)\n",
1027 | "    print(chi2(data[fea].values.reshape(-1, 1), data['Label'].values.reshape(-1, 1)))"
1028 | ]
1029 | },
1030 | {
1031 | "cell_type": "code",
1032 | "execution_count": 49,
1033 | "metadata": {},
1034 | "outputs": [
1035 | {
1036 | "name": "stdout",
1037 | "output_type": "stream",
1038 | "text": [
1039 | "Selected: ['Bwd IAT Mean' 'Bwd IAT Max' 'Bwd IAT Min' 'Idle Mean' 'Idle Max']\n"
1040 | ]
1041 | }
1042 | ],
1043 | "source": [
1044 | "chi2_test_fea = [name for name in numerical_serial_fea if name not in ('Init Fwd Win Byts', 'Init Bwd Win Byts')]\n",
1045 | "selector = SelectKBest(score_func=chi2, k=5)\n",
1046 | "selector.fit(data[chi2_test_fea], data['Label'])\n",
1047 | "features_mask = selector.get_support(indices=True)\n",
1048 | "selected_features = np.asarray(chi2_test_fea)[features_mask]\n",
1049 | "print('Selected:', selected_features)"
1050 | ]
1051 | },
1052 | {
1053 | "cell_type": "code",
1054 | "execution_count": 50,
1055 | "metadata": {},
1056 | "outputs": [
1057 | {
1058 | "name": "stdout",
1059 | "output_type": "stream",
1060 | "text": [
1061 | "Selected: ['Flow IAT Max' 'Fwd Header Len' 'Pkt Len Max' 'Init Fwd Win Byts'\n",
1062 | " 'Fwd Seg Size Min']\n"
1063 | ]
1064 | }
1065 | ],
1066 | "source": [
1067 | "selector = SelectKBest(score_func=mutual_info_classif, k=5)\n",
1068 | "selector.fit(data[numerical_serial_fea], data['Label'])\n",
1069 | "features_mask = selector.get_support(indices=True)\n",
1070 | "selected_features = np.asarray(numerical_serial_fea)[features_mask]\n",
1071 | "print('Selected:', selected_features)"
1072 | ]
1073 | },
1074 | {
1075 | "cell_type": "code",
1076 | "execution_count": 56,
1077 | "metadata": {},
1078 | "outputs": [
1079 | {
1080 | "name": "stdout",
1081 | "output_type": "stream",
1082 | "text": [
1083 | "Selected: ['Dst Port' 'Fwd Pkts/s' 'Down/Up Ratio' 'Init Fwd Win Byts'\n",
1084 | " 'Fwd Seg Size Min']\n"
1085 | ]
1086 | }
1087 | ],
1088 | "source": [
1089 | "# Keep model inputs only: skip the timestamp, the target, and the string-valued '<feature>_outliers' columns added by find_outliers_by_3segama.\n",
1090 | "features = [col for col in data.columns if col not in ('Timestamp', 'Label') and not str(col).endswith('_outliers')]\n",
1091 | "selector = RFE(DecisionTreeClassifier(), n_features_to_select=5, step=1).fit(data[features], data['Label'])\n",
1092 | "features_mask = selector.get_support(indices=True)\n",
1093 | "selected_features = np.array(features)[features_mask]\n",
1094 | "print('Selected:', selected_features)"
1095 | ]
1096 | },
1097 | {
1098 | "cell_type": "code",
1099 | "execution_count": 52,
1100 | "metadata": {},
1101 | "outputs": [
1102 | {
1103 | "name": "stdout",
1104 | "output_type": "stream",
1105 | "text": [
1106 | "Selected: ['Dst Port' 'Flow Duration' 'Flow Byts/s' 'Flow Pkts/s' 'Flow IAT Max'\n",
1107 | " 'Fwd IAT Tot' 'Fwd IAT Mean' 'Fwd IAT Std' 'Fwd IAT Max' 'Fwd IAT Min'\n",
1108 | " 'Bwd IAT Tot' 'Bwd IAT Mean' 'Bwd IAT Std' 'Bwd IAT Max' 'Bwd IAT Min'\n",
1109 | " 'Fwd Pkts/s' 'Pkt Len Var' 'Idle Mean' 'Idle Std' 'Idle Max' 'Idle Min']\n"
1110 | ]
1111 | }
1112 | ],
1113 | "source": [
1114 | "selector = SelectFromModel(estimator=LogisticRegression(penalty='l2', C=10))\n",
1115 | "selector.fit(data[features], data['Label'])\n",
1116 | "features_mask = selector.get_support(indices=True)\n",
1117 | "selected_features = np.asarray(features)[features_mask]\n",
1118 | "print('Selected:', selected_features)"
1119 | ]
1120 | },
1121 | {
1122 | "cell_type": "code",
1123 | "execution_count": 55,
1124 | "metadata": {},
1125 | "outputs": [
1126 | {
1127 | "name": "stdout",
1128 | "output_type": "stream",
1129 | "text": [
1130 | "Selected: ['Dst Port' 'Fwd Seg Size Min']\n"
1131 | ]
1132 | }
1133 | ],
1134 | "source": [
1135 | "selector = SelectFromModel(estimator=DecisionTreeClassifier())\n",
1136 | "selector.fit(data[features], data['Label'])\n",
1137 | "features_mask = selector.get_support(indices=True)\n",
1138 | "selected_features = np.asarray(features)[features_mask]\n",
1139 | "print('Selected:', selected_features)"
1140 | ]
1141 | },
1142 | {
1143 | "cell_type": "code",
1144 | "execution_count": null,
1145 | "metadata": {},
1146 | "outputs": [],
1147 | "source": []
1148 | }
1149 | ],
1150 | "metadata": {
1151 | "kernelspec": {
1152 | "display_name": "Python 3",
1153 | "language": "python",
1154 | "name": "python3"
1155 | },
1156 | "language_info": {
1157 | "codemirror_mode": {
1158 | "name": "ipython",
1159 | "version": 3
1160 | },
1161 | "file_extension": ".py",
1162 | "mimetype": "text/x-python",
1163 | "name": "python",
1164 | "nbconvert_exporter": "python",
1165 | "pygments_lexer": "ipython3",
1166 | "version": "3.7.9"
1167 | }
1168 | },
1169 | "nbformat": 4,
1170 | "nbformat_minor": 4
1171 | }
1172 |
--------------------------------------------------------------------------------
/draw_pics.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import matplotlib.pyplot as plt"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "lgb_accuracy_score_list = [0.999980926495482, 0.9999856948716115, 0.9999666213670935, 0.999990463247741, 0.9999570846148345]\n",
19 | "lgb_f1_score_list = [0.9998095419483859, 0.9998571496595401, 0.9996667142789125, 0.9999047619047619, 0.9995716121662145]\n",
20 | "lgb_auc_score_list = [0.99999999856567, 0.9999999985656699, 0.9999999684475853, 0.9999999885263947, 0.9999999792040903]\n",
21 | "\n",
22 | "xgb_accuracy_score_list = [0.999961852990964, 0.999980926495482, 0.9999475478625754, 0.9999380111103163, 0.9999427794864459]\n",
23 | "xgb_f1_score_list = [0.9996189750428652, 0.9998095056672064, 0.999476165531692, 0.99938074596294, 0.9994285714285714]\n",
24 | "xgb_auc_score_list = [0.9999995013312458, 0.9999994998969156, 0.999999535318984, 0.9999995109375728, 0.9999994994639676]"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 3,
30 | "metadata": {},
31 | "outputs": [
32 | {
33 | "data": {
34 | "image/png": "\n",
35 | "text/plain": [
36 | ""
37 | ]
38 | },
39 | "metadata": {
40 | "needs_background": "light"
41 | },
42 | "output_type": "display_data"
43 | }
44 | ],
45 | "source": [
46 | "# One entry per panel: subplot position, y-axis label, y-axis limits, then the LightGBM and XGBoost score lists.\n",
47 | "panels = (\n",
48 | "    (131, 'Accuracy', (0.9999, 1.0),\n",
49 | "     lgb_accuracy_score_list, xgb_accuracy_score_list),\n",
50 | "    (132, 'F1 Score', (0.999, 1.0),\n",
51 | "     lgb_f1_score_list, xgb_f1_score_list),\n",
52 | "    (133, 'AUC', (0.999999, 1.0),\n",
53 | "     lgb_auc_score_list, xgb_auc_score_list),\n",
54 | ")\n",
55 | "# The x axis uses the string labels '0'..'4', one per fold.\n",
56 | "fold_labels = [str(fold) for fold in range(5)]\n",
57 | "\n",
58 | "plt.figure(figsize=(18, 4))\n",
59 | "for position, y_label, y_limits, lgb_scores, xgb_scores in panels:\n",
60 | "    plt.subplot(position)\n",
61 | "    plt.plot(fold_labels, lgb_scores, label='LightGBM', marker='o')\n",
62 | "    plt.plot(fold_labels, xgb_scores, label='XGBoost', marker='s')\n",
63 | "    # Every panel has its own y-range ending at 1.0.\n",
64 | "    plt.ylim(*y_limits)\n",
65 | "    plt.xlabel('$K^{th}$ Fold')\n",
66 | "    plt.ylabel(y_label)\n",
67 | "    plt.grid()\n",
68 | "    plt.legend()\n",
69 | "\n",
70 | "# Write the three-panel figure to disk, cropped tightly around its content.\n",
71 | "plt.savefig('./pics/acc_f1_auc_lgb_xgb.png', bbox_inches='tight')"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 69,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "top_ten_important_features = [('Fwd_Seg_Size_Min', 9338911.301903933), ('Dst_Port', 1057550.6179607362), ('Fwd_Header_Len', 410228.37797785224), ('Init_Fwd_Win_Byts', 89286.21361017204), ('Flow_Byts/s', 77586.27008461952), ('Flow_IAT_Max', 55537.81665795727), ('Pkt_Len_Max', 45023.481026887894), ('Fwd_IAT_Min', 36902.22046112176), ('Fwd_Pkts/s', 17839.881527069956), ('Flow_Pkts/s', 17816.451269016834)]"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 70,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# Split the (feature, importance) pairs into two parallel lists,\n",
90 | "# walking the ranking backwards so both lists come out in reverse order.\n",
91 | "x = [entry[0] for entry in reversed(top_ten_important_features)]\n",
92 | "y = [entry[1] for entry in reversed(top_ten_important_features)]"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 72,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "data": {
102 | "image/png": "\n",
103 | "text/plain": [
104 | ""
105 | ]
106 | },
107 | "metadata": {
108 | "needs_background": "light"
109 | },
110 | "output_type": "display_data"
111 | }
112 | ],
113 | "source": [
114 | "fig, ax = plt.subplots(figsize=(8, 6))\n",
115 | "ax.barh(x, y)\n",
116 | "ax.set_xlabel('Importance')\n",
117 | "ax.set_ylabel('Features')\n",
118 | "ax.grid()\n",
119 | "fig.savefig('./pics/feature_importance.png', bbox_inches='tight')"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": []
128 | }
129 | ],
130 | "metadata": {
131 | "kernelspec": {
132 | "display_name": "Python 3",
133 | "language": "python",
134 | "name": "python3"
135 | },
136 | "language_info": {
137 | "codemirror_mode": {
138 | "name": "ipython",
139 | "version": 3
140 | },
141 | "file_extension": ".py",
142 | "mimetype": "text/x-python",
143 | "name": "python",
144 | "nbconvert_exporter": "python",
145 | "pygments_lexer": "ipython3",
146 | "version": "3.7.9"
147 | }
148 | },
149 | "nbformat": 4,
150 | "nbformat_minor": 4
151 | }
152 |
--------------------------------------------------------------------------------