├── .gitattributes
├── .idea
│   ├── .gitignore
│   ├── Bplusplus.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── B++ CV model
│   └── CV_model_notes.md
├── README.md
├── Run_model.py
├── Species_included_in_pretrained_model.xlsx
├── collect_images.py
├── data
│   └── names.csv
├── requirements.txt
├── train_validate.py
└── yolov8n-cls.pt

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml

--------------------------------------------------------------------------------
/B++ CV model/CV_model_notes.md:
--------------------------------------------------------------------------------
# B++ CV Model

The CV model as presented in the paper can be downloaded from:
https://drive.google.com/file/d/1wxAIdSzx5nhTOk4izc0RIycoecSdug_Q/view?usp=sharing

To run or use the model, please consult the Ultralytics documentation.
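As a quick start, a minimal Ultralytics sketch along these lines should load and run the downloaded weights (the paths below are placeholders):

```python
from ultralytics import YOLO

# Placeholder paths: point these at the downloaded .pt file and a test image
model = YOLO("path/to/bplusplus_model.pt")
results = model("path/to/insect.jpg")

# For classification models, Results.probs holds the class probabilities
top1 = results[0].probs.top1
print(results[0].names[top1])
```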
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# B++ repository

[![DOI](https://zenodo.org/badge/765250194.svg)](https://zenodo.org/badge/latestdoi/765250194)
[![PyPi version](https://img.shields.io/pypi/v/bplusplus.svg)](https://pypi.org/project/bplusplus/)
[![Python versions](https://img.shields.io/pypi/pyversions/bplusplus.svg)](https://pypi.org/project/bplusplus/)
[![License](https://img.shields.io/pypi/l/bplusplus.svg)](https://pypi.org/project/bplusplus/)
[![Downloads](https://static.pepy.tech/badge/bplusplus)](https://pepy.tech/project/bplusplus)
[![Downloads](https://static.pepy.tech/badge/bplusplus/month)](https://pepy.tech/project/bplusplus)
[![Downloads](https://static.pepy.tech/badge/bplusplus/week)](https://pepy.tech/project/bplusplus)

This repo can be used to quickly generate YOLOv8 models for biodiversity monitoring, relying on Ultralytics and a GBIF dataset.
All code is tested on Windows 10 with Python 3.11, without a GPU. A GPU accelerates the steps below; Ultralytics automatically selects an available GPU if one is present.

# New release
We have released a new version here: [github.com/Tvenver/Bplusplus/tree/package](https://github.com/Tvenver/Bplusplus/tree/package).
We also launched a package, which can be installed directly from PyPI: [pypi.org/project/bplusplus](https://pypi.org/project/bplusplus/).

# How does it work?

To create your own custom CV model:
1. Enter the scientific names of your species in the names.csv file, in the data folder.
2. Download the GBIF repository of your choosing, or download a prepared dataset linking to 16M images of many insect species: https://doi.org/10.15468/dl.dk9czq
3. Update the two hard-coded paths in collect_images.py (the pd.read_table calls that read occurrence.txt and multimedia.txt) so they point to the unzipped GBIF download.
4. In collect_images.py, note the sampling step, which caps the download at 150 images per species - comment it out if you want every image (with many species, the full download takes considerably longer).
5. Run collect_images.py: it reads the names, iterates through them, and attempts to download images from the GBIF data repository.
6. As an indication, for about 8 insect species and roughly 4,000 images, the entire operation takes around 20 minutes, depending on your internet speed and hardware.
7. Run train_validate.py: it shuffles the images into a train and a validation set (see the layout sketch below), and Ultralytics takes care of the training.
8. You can tweak various training parameters; see the Ultralytics YOLOv8 documentation for details.
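After step 7, the data folder contains the split that Ultralytics expects for classification training: one subfolder per species under train/ and val/. Illustratively, with the first two species from names.csv:

```
data/
├── train/
│   ├── Nabis rugosus/
│   └── Forficula auricularia/
└── val/
    ├── Nabis rugosus/
    └── Forficula auricularia/
```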
You have created a YOLOv8 model for image classification.

![Figure 9](https://github.com/user-attachments/assets/a01f513b-0609-412d-a633-3aee1e5dded6)

To use the pretrained model:
There is also a pretrained YOLOv8 classification model, covering 2584 species, included in this repo under B++ CV Model. The included species are listed in a separate file.
1. Download the pretrained model from the Google Drive link listed in the folder B++ CV Model.
2. Take the Run_model.py script, specify the path to the downloaded .pt file, and run the model.

# Citation

All information in this GitHub is available under the MIT license, as long as credit is given to the authors.

**Venverloo, T., Duarte, F., B++: Towards Real-Time Monitoring of Insect Species. MIT Senseable City Laboratory, AMS Institute.**

--------------------------------------------------------------------------------
/Run_model.py:
--------------------------------------------------------------------------------
from ultralytics import YOLO

# Load a custom model (placeholder path: point this at your best.pt file)
path_to_model = "path/to/best.pt"
model = YOLO(path_to_model)

# Predict with the model (the source can be a folder, an image, or a video)
path_to_image = "path/to/folder, image, or video"
results = model(path_to_image, save_txt=True)  # predictions are also saved as .txt files
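
# Optional, hedged sketch (not in the original script): for classification
# models, each Results object carries a Probs object listing the most likely
# classes (top5) and their confidences (top5conf).
for r in results:
    for i, conf in zip(r.probs.top5, r.probs.top5conf):
        print(f"{r.names[int(i)]}: {float(conf):.3f}")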
--------------------------------------------------------------------------------
/Species_included_in_pretrained_model.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tvenver/Bplusplus/c955a424835eb69e8e1630a881334ab8bce872ac/Species_included_in_pretrained_model.xlsx

--------------------------------------------------------------------------------
/collect_images.py:
--------------------------------------------------------------------------------
import csv
import os

import pandas as pd
import requests

# Step 0: create folders to store images in; folder names come from a .csv file
# where each row contains one species name

def create_folders_from_csv(csv_file, directory_path):
    # Check if the folder path exists, if not, create it
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)

    # Open the CSV file and read the names
    with open(csv_file, 'r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)  # Skip the header
        for row in csv_reader:
            name = row[0].strip()  # Assuming the name is in the first column
            print(name)
            folder_name = os.path.join(directory_path, name)
            # Create a folder with the name from the CSV
            os.makedirs(folder_name, exist_ok=True)

# Provide the path to your CSV file and the folder where you want to create subfolders
csv_file = os.path.join('data', "names.csv")
directory_path = os.path.join('data', 'dataset')

create_folders_from_csv(csv_file, directory_path)

# Step 1: filter the occurrence dataset to only include species of interest,
# then download images afterwards

# Set variables
batch_size = 100000

# Specify the path to the occurrence.txt file from your GBIF download
csv_reader = pd.read_table("C:/Users/titusvenverloo/Downloads/beedata/occurrence1.txt", chunksize=batch_size)
folders = [f for f in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, f))]
col = ['gbifID', 'species']
final_df = pd.DataFrame()  # Initialize the final DataFrame

# Iterate through the batches
for batch_df in csv_reader:
    batch_df = batch_df[col]
    batch_df = batch_df[batch_df['species'].isin(folders)]
    # Print the first few rows of each batch to double-check
    print(batch_df.head())
    final_df = pd.concat([final_df, batch_df], ignore_index=True)

# Output the final DataFrame to csv, mapping gbifID to species
final_df.to_csv(os.path.join('data', 'occurrence_filtered.csv'), index=False)

# Step 1.b: load the links to multimedia and left-join them with the filtered
# occurrence data. Specify the path to the multimedia.txt file from your GBIF download.
df1 = pd.read_table("C:/Users/titusvenverloo/Downloads/beedata/multimedia.txt", chunksize=batch_size)
folders = [f for f in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, f))]
df2 = final_df

# Define a function to perform the left join on a specific chunk
def left_join_chunk(chunk1, chunk2, key_column):
    return pd.merge(chunk1, chunk2, on=key_column, how='left')

# Iterate over the chunks and left-join them
final_df = pd.DataFrame()  # Initialize the final DataFrame

for chunk in df1:
    # Perform the left join on the chunk and keep only the species of interest
    joined_chunk = left_join_chunk(chunk, df2, 'gbifID')
    filtered_df = joined_chunk[joined_chunk['species'].isin(folders)]
    print(filtered_df)

    # Append the joined chunk to the final DataFrame
    final_df = pd.concat([final_df, filtered_df], ignore_index=True)

# Save the final DataFrame, which now holds a multimedia link per occurrence
final_df.to_csv(os.path.join('data', 'filtered_insect_species.csv'), index=False)

# Step 1.c: download the images into the folders created in Step 0
df = final_df
df['ID_name'] = df.index + 1

# Sampling step: caps the download at 150 images per species, or all images if
# a species has fewer (in our case this reduced the set from 60k to 12k images).
# Comment out this block to download every image instead.
def sample_minimum(group):
    # Sample at most 150 images per species
    return group.sample(n=min(150, len(group)), random_state=42)  # random_state for reproducibility

print('Start sampling per group')
sampled = df.groupby('species').apply(sample_minimum).reset_index(drop=True)
sampled.to_csv(os.path.join('data', 'sampled_super_small.csv'), index=False)
df = sampled

# Function to download a single insect image
def down_image(url, species, ID_name):
    directory = os.path.join('data/dataset', f"{species}")
    os.makedirs(directory, exist_ok=True)
    image_response = requests.get(url, timeout=60)
    if image_response.status_code != 200:
        print(f"{species}{ID_name} skipped (HTTP {image_response.status_code}).")
        return
    image_name = f"{species}{ID_name}.jpg"  # Modify the naming convention as needed
    image_path = os.path.join(directory, image_name)
    with open(image_path, "wb") as f:
        f.write(image_response.content)
    print(f"{species}{ID_name} downloaded successfully.")

df.apply(lambda row: down_image(row['identifier'], row['species'], row['ID_name']), axis=1)

--------------------------------------------------------------------------------
/data/names.csv:
--------------------------------------------------------------------------------
names
Nabis rugosus
Forficula auricularia
Calosoma inquisitor
Bombus veteranus
Glyphotaelius pellucidus
Notoxus monoceros
Cacoxenus indagator
Chorthippus mollis
Trioza remota

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests==2.25.1
pandas==2.1.4
ultralytics==8.0.195

--------------------------------------------------------------------------------
/train_validate.py:
--------------------------------------------------------------------------------
from ultralytics import YOLO
import os
import shutil
import random

# Define paths
dataset_path = os.path.join('data', 'dataset')  # Path to your dataset
train_path = os.path.join('data', 'train')      # Path to the training folder
val_path = os.path.join('data', 'val')          # Path to the validation folder

# Define the ratio for splitting the dataset
split_ratio = 0.8  # 80% for training, 20% for validation

# Create training and validation directories if they don't exist
os.makedirs(train_path, exist_ok=True)
os.makedirs(val_path, exist_ok=True)

# Walk through the dataset directory
for root, dirs, files in os.walk(dataset_path):
    for label in dirs:
        label_path = os.path.join(root, label)
        images = [f for f in os.listdir(label_path) if os.path.isfile(os.path.join(label_path, f))]

        # Shuffle the images
        random.shuffle(images)

        # Calculate the split index
        split_index = int(len(images) * split_ratio)

        # Split the images into training and validation sets
        train_images = images[:split_index]
        val_images = images[split_index:]

        # Create destination folders if they don't exist
        train_label_path = os.path.join(train_path, label)
        val_label_path = os.path.join(val_path, label)
        os.makedirs(train_label_path, exist_ok=True)
        os.makedirs(val_label_path, exist_ok=True)

        # Move images to the appropriate folders
        for image in train_images:
            src = os.path.join(label_path, image)
            dst = os.path.join(train_label_path, image)
            shutil.move(src, dst)

        for image in val_images:
            src = os.path.join(label_path, image)
            dst = os.path.join(val_label_path, image)
            shutil.move(src, dst)

print("Dataset splitting completed successfully.")
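
# Hedged sanity check (not in the original script): Ultralytics classification
# training expects matching class folders under data/train/ and data/val/,
# which the split above should have produced.
assert sorted(os.listdir(train_path)) == sorted(os.listdir(val_path)), \
    "train/ and val/ should contain the same class folders"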

# Load the pretrained YOLOv8 nano classification model as the starting point
model = YOLO('yolov8n-cls.pt')

# Define parameters for YOLO training; keep an eye on epochs, batch, and imgsz
# so you don't exceed your system's resources (memory, CPU, GPU, ...).
# Folder for training: bplusplus/data/train
# Folder for validation: bplusplus/data/val
# Specify the path to the folder that contains the train and val folders
data = "C:/Users/titusvenverloo/Documents/GitHub/Bplusplus/data/"
results = model.train(data=data, epochs=5, batch=16, imgsz=224)

# batch is set to 1 to prevent a resizing bug; during training this bug doesn't
# emerge. A workaround for a larger batch size would be a resizing step in advance.
model.val(batch=1)

--------------------------------------------------------------------------------
/yolov8n-cls.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tvenver/Bplusplus/c955a424835eb69e8e1630a881334ab8bce872ac/yolov8n-cls.pt
--------------------------------------------------------------------------------