├── .gitignore ├── README.md ├── chromesearch.py ├── chromesearchre.py ├── detecttext.py ├── font └── NotoSansCJK-Regular.ttc ├── input ├── test.mp4 ├── test1.mp4 ├── test2.mp4 └── test3.mp4 ├── main.py ├── mangarecog.py ├── readme.md ├── result ├── origin │ ├── IMG_1285.jpeg │ ├── IMG_1286.jpeg │ ├── IMG_1287.jpeg │ ├── IMG_1288.jpeg │ └── IMG_1289.jpeg └── result │ ├── Compare.jpg │ ├── photo_2025-02-19_04-59-55.jpg │ ├── photo_2025-02-19_04-59-59.jpg │ ├── photo_2025-02-19_05-00-04.jpg │ ├── photo_2025-02-19_05-00-09.jpg │ └── photo_2025-02-19_05-00-12.jpg ├── testchromesearch.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /dist 3 | /output 4 | /venv 5 | /__pycache__ 6 | 7 | main.spec -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | OCR project 2 | -------------------------------------------------------------------------------- /chromesearch.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.chrome.service import Service 3 | from selenium.webdriver.chrome.options import Options 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.common.keys import Keys 6 | import time 7 | 8 | # Path to your ChromeDriver 9 | # chromedriver_path = './chromedriver' # Update this path 10 | 11 | # Initialize the Chrome driver 12 | # driver = webdriver.Chrome(chromedriver_path) 13 | 14 | def google_search(search_query): 15 | 16 | service = Service() 17 | # options = webdriver.ChromeOptions() 18 | chrome_options = Options() 19 | # chrome_options.add_argument('--headless') 20 | chrome_options.add_argument('--no-sandbox') 21 | chrome_options.add_argument('--disable-dev-shm-usage') 22 | chrome_options.add_argument("--window-size=1280,768") 23 | driver = webdriver.Chrome(service=service, options=chrome_options) 24 | 25 | driver.get("https://www.google.com") 26 | 27 | # Locate the search box using its name attribute value 28 | search_box = driver.find_element(By.NAME, "q") 29 | 30 | search_box.clear() 31 | 32 | # Type the search query 33 | search_box.send_keys(search_query) 34 | 35 | # Press Enter 36 | search_box.send_keys(Keys.RETURN) 37 | 38 | # Wait for a few seconds to see the results 39 | time.sleep(5) 40 | 41 | # Optionally, print the title of the page 42 | print(driver.title) -------------------------------------------------------------------------------- /chromesearchre.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.chrome.service import Service 3 | from selenium.webdriver.chrome.options import Options 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.common.keys import Keys 6 | import time 7 | 8 | # Path to your ChromeDriver 9 | # chromedriver_path = './chromedriver' # Update this path 10 | 11 | # Initialize the Chrome driver 12 | # driver = webdriver.Chrome(chromedriver_path) 13 | 14 | def google_search(search_query): 15 | 16 | service = Service() 17 | # options = webdriver.ChromeOptions() 18 | chrome_options = Options() 19 | # chrome_options.add_argument('--headless') 20 | # chrome_options.add_argument('--no-sandbox') 21 | # chrome_options.add_argument('--disable-dev-shm-usage') 22 | # chrome_options.add_argument("--window-size=1280,768") 23 | # chrome_options.add_argument("--disable-background-networking") 24 | chrome_options.add_argument("--disable-background-timer-throttling") 25 | chrome_options.add_argument("--disable-backgrounding-occluded-windows") 26 | chrome_options.add_argument("--disable-client-side-phishing-detection") 27 | chrome_options.add_argument("--disable-default-apps") 28 | chrome_options.add_argument("--disable-dev-shm-usage") 29 | chrome_options.add_argument("--disable-gpu") 30 | chrome_options.add_argument("--disable-hang-monitor") 31 | chrome_options.add_argument("--disable-popup-blocking") 32 | chrome_options.add_argument("--disable-prompt-on-repost") 33 | chrome_options.add_argument("--disable-sync") 34 | chrome_options.add_argument("--disable-translate") 35 | chrome_options.add_argument("--metrics-recording-only") 36 | chrome_options.add_argument("--no-first-run") 37 | chrome_options.add_argument("--safebrowsing-disable-auto-update") 38 | driver = webdriver.Chrome(service=service, options=chrome_options) 39 | 40 | driver.get("https://www.google.com") 41 | 42 | # Locate the search box using its name attribute value 43 | search_box = driver.find_element(By.NAME, "q") 44 | 45 | search_box.clear() 46 | 47 | # Type the search query 48 | search_box.send_keys(search_query) 49 | 50 | # Press Enter 51 | search_box.send_keys(Keys.RETURN) 52 | 53 | # Wait for a few seconds to see the results 54 | time.sleep(60) 55 | 56 | # Optionally, print the title of the page 57 | print(driver.title) -------------------------------------------------------------------------------- /detecttext.py: -------------------------------------------------------------------------------- 1 | import easyocr 2 | import cv2 3 | from pathlib import Path 4 | from PIL import Image, ImageDraw, ImageFont 5 | import numpy as np 6 | import re 7 | import mangarecog 8 | 9 | output_dir = Path('./output/cropped_images') 10 | output_dir.mkdir(parents=True, exist_ok=True) 11 | 12 | font_path = './font/NotoSansCJK-Regular.ttc' 13 | 14 | reader = easyocr.Reader(['ja']) 15 | 16 | def display_japanese_text(image, text, position, font_path, font_size=20): 17 | # Convert OpenCV image (BGR) to PIL image (RGB) 18 | image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) 19 | 20 | # Load the font 21 | font = ImageFont.truetype(font_path, font_size) 22 | 23 | # Create a drawing context 24 | draw = ImageDraw.Draw(image_pil) 25 | 26 | # Draw the text on the image 27 | draw.text(position, text, font=font, fill=(255, 0, 0)) # Green text color 28 | 29 | # Convert back to OpenCV image (RGB to BGR) 30 | image = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR) 31 | return image 32 | 33 | def detect_text(image_path): 34 | result = reader.readtext(image_path) 35 | return result 36 | 37 | def is_japanese(text): 38 | # Regular expression to match Hiragana, Katakana, Kanji, and Japanese punctuation 39 | pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\uFF66-\uFF9F\u3000-\u303F]+') 40 | return bool(pattern.search(text)) 41 | 42 | def remove_non_japanese(text_blocks): 43 | # Filter out any non-Japanese text blocks 44 | japanese_text_blocks = [] 45 | 46 | for (bbox, text, prob) in text_blocks: 47 | # Check if the text block is Japanese 48 | if is_japanese(text): 49 | japanese_text_blocks.append((bbox, text, prob)) 50 | else: 51 | print(f"Removed non-Japanese text: {text}") # Optional: print or log non-Japanese text 52 | 53 | return japanese_text_blocks 54 | 55 | def categorize_by_box_size(text_blocks): 56 | # Initialize variables to store the main text 57 | title = "" 58 | author = "" 59 | publisher = "" 60 | other = "" 61 | 62 | # Calculate the average bounding box size 63 | avg_box_size = np.mean([((bbox[2][0] - bbox[0][0]) * (bbox[2][1] - bbox[0][1])) for (bbox, _, _) in text_blocks]) 64 | 65 | # Now check each bounding box's area and categorize based on size 66 | for (bbox, text, prob) in text_blocks: 67 | # Calculate the area of the current bounding box 68 | width = bbox[2][0] - bbox[0][0] 69 | height = bbox[2][1] - bbox[0][1] 70 | area = width * height 71 | 72 | # Categorize based on box area size relative to the average box size 73 | if area > avg_box_size * 1.2: # 1.5x the average area as a threshold for "larger" boxes 74 | # Assuming larger boxes are the title 75 | # if len(title) < len(text): # Prefer the longest text as the title 76 | title += (text + " ") 77 | elif "著者" in text or "作家" in text: 78 | # If the text contains "author" keywords, it's likely the author 79 | author += (text + " ") 80 | elif "出版社" in text or "出版" in text: 81 | # If the text contains "publisher" keywords, it's likely the publisher 82 | publisher += (text + " ") 83 | else: 84 | # If the text doesn't fit into the categories above, consider it as potential author or other info 85 | if len(author) == 0: 86 | author += (text + " ") 87 | elif len(publisher) == 0: 88 | publisher += (text + " ") 89 | else: 90 | other += text + " " # Accumulate other text into a single string 91 | 92 | return title, author, publisher, other 93 | 94 | def show_textregion(image, result): 95 | combined_text = "" 96 | text_blocks = [] 97 | 98 | for i, (bbox, text, prob) in enumerate(result): 99 | # Ensure the bounding box coordinates are in integer format 100 | (top_left, top_right, bottom_right, bottom_left) = bbox 101 | 102 | # Convert the bounding box coordinates to integers 103 | top_left = tuple(map(int, top_left)) 104 | bottom_right = tuple(map(int, bottom_right)) 105 | 106 | # Draw rectangle around the detected text region 107 | cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2) 108 | 109 | # Crop the image using array slicing (top-left to bottom-right) 110 | cropped_image = image[top_left[1]:bottom_right[1], top_left[0]:bottom_right[0]] 111 | 112 | # Save the cropped image for debugging or further use 113 | cropped_path = output_dir / f'cropped_{i}.jpg' 114 | cv2.imwrite(str(cropped_path), cropped_image) 115 | print(f"Cropped image saved to {cropped_path}") 116 | 117 | # Use MangaOCR to recognize text in the cropped image 118 | mocrtext = mangarecog.recognize_text(cropped_path) 119 | print(f"Recognized text (MangaOCR): {mocrtext}") 120 | 121 | # Accumulate the recognized text into the combined_text string 122 | combined_text += mocrtext.strip() + " " # Add space between each text block 123 | 124 | # Save the text block for categorization later 125 | text_blocks.append((bbox, mocrtext, prob)) 126 | 127 | # Display the recognized text on the image 128 | image = display_japanese_text(image, mocrtext, top_left, font_path) 129 | cv2.imshow("Text Detection", image) # Display the image with text regions 130 | 131 | # Filter out non-Japanese text blocks 132 | text_blocks = remove_non_japanese(text_blocks) 133 | 134 | # Categorize the text blocks based on their box size 135 | title, author, publisher, other = categorize_by_box_size(text_blocks) 136 | print(f"Title: {title}") 137 | print(f"Author: {author}") 138 | print(f"Publisher: {publisher}") 139 | print(f"Other: {other}") 140 | 141 | print(f"Combined Text: {combined_text}") 142 | 143 | return title, author, publisher, other, combined_text -------------------------------------------------------------------------------- /font/NotoSansCJK-Regular.ttc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/font/NotoSansCJK-Regular.ttc -------------------------------------------------------------------------------- /input/test.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/input/test.mp4 -------------------------------------------------------------------------------- /input/test1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/input/test1.mp4 -------------------------------------------------------------------------------- /input/test2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/input/test2.mp4 -------------------------------------------------------------------------------- /input/test3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/input/test3.mp4 -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import detecttext 3 | import util 4 | import chromesearch 5 | 6 | def process_video(): 7 | # Set the source 8 | source = 0 # Uncomment this to use the webcam 9 | # source = "./input/test2.mp4" # Path to the video file 10 | 11 | # Capture the video 12 | cap = cv2.VideoCapture(source) 13 | 14 | if not cap.isOpened(): 15 | print("Error: Unable to open the video source.") 16 | return 17 | 18 | # Loop to read and display video frames 19 | while True: 20 | ret, frame = cap.read() # Read a frame from the video source 21 | title="" 22 | author="" 23 | publisher="" 24 | other="" 25 | combined_text = "" # Initialize the search text 26 | if not ret: 27 | print("End of video or error reading the frame.") 28 | cap.set(cv2.CAP_PROP_POS_FRAMES, 0) 29 | continue 30 | 31 | if cv2.waitKey(1) & 0xFF == ord('c'): 32 | result = detecttext.detect_text(frame) # Detect text in the frame 33 | image = frame.copy() 34 | title, author, publisher, other, combined_text = detecttext.show_textregion(image, result) # Show the text region in the frame 35 | # title = util.get_longest_word(searchtext) 36 | # Combine title, author, and publisher (and any other info) 37 | search_query = f"{title}" 38 | chromesearch.google_search(search_query) # Search the text in Google 39 | 40 | # Display the frame in a window 41 | cv2.imshow("Video Frame", frame) 42 | 43 | # Exit on pressing the 'q' key 44 | if cv2.waitKey(1) & 0xFF == ord('q'): 45 | break 46 | 47 | # Release the video capture object and close display windows 48 | cap.release() 49 | cv2.destroyAllWindows() 50 | 51 | # Call the function 52 | process_video() 53 | -------------------------------------------------------------------------------- /mangarecog.py: -------------------------------------------------------------------------------- 1 | from manga_ocr import MangaOcr 2 | 3 | mocr = MangaOcr() 4 | 5 | def recognize_text(image_path): 6 | return mocr(image_path) -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | OCR project 2 | 3 | This is the Face recognition project. 4 | -------------------------------------------------------------------------------- /result/origin/IMG_1285.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/result/origin/IMG_1285.jpeg -------------------------------------------------------------------------------- /result/origin/IMG_1286.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/result/origin/IMG_1286.jpeg -------------------------------------------------------------------------------- /result/origin/IMG_1287.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/result/origin/IMG_1287.jpeg -------------------------------------------------------------------------------- /result/origin/IMG_1288.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/result/origin/IMG_1288.jpeg -------------------------------------------------------------------------------- /result/origin/IMG_1289.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/result/origin/IMG_1289.jpeg -------------------------------------------------------------------------------- /result/result/Compare.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/result/result/Compare.jpg -------------------------------------------------------------------------------- /result/result/photo_2025-02-19_04-59-55.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/result/result/photo_2025-02-19_04-59-55.jpg -------------------------------------------------------------------------------- /result/result/photo_2025-02-19_04-59-59.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/result/result/photo_2025-02-19_04-59-59.jpg -------------------------------------------------------------------------------- /result/result/photo_2025-02-19_05-00-04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/result/result/photo_2025-02-19_05-00-04.jpg -------------------------------------------------------------------------------- /result/result/photo_2025-02-19_05-00-09.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/result/result/photo_2025-02-19_05-00-09.jpg -------------------------------------------------------------------------------- /result/result/photo_2025-02-19_05-00-12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WTEngineer/OCR_Project/ab84e30cd4b39d4e8235362b5904d6efa77efd53/result/result/photo_2025-02-19_05-00-12.jpg -------------------------------------------------------------------------------- /testchromesearch.py: -------------------------------------------------------------------------------- 1 | import util 2 | import chromesearchre 3 | 4 | searchtext = "apple app" 5 | 6 | title = util.get_longest_word(searchtext) 7 | chromesearchre.google_search(title) -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def get_longest_word(text): 4 | # Use regex to find words, considering letters and digits 5 | words = re.findall(r'\b\w+\b', text) 6 | 7 | # Return the longest word or None if no words found 8 | return max(words, key=len) if words else None --------------------------------------------------------------------------------