Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions llmware/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3519,12 +3519,23 @@ def ocr_images_in_library(self, add_to_library=False, chunk_size=400, min_size=1
doc_update_list = {}
new_text_created = 0

# tesseract-supported image formats (excludes vector formats like .emf, .wmf, .svg)
supported_ocr_formats = {'.png', '.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.bmp', '.ppm', '.pgm', '.pbm', '.webp'}

# iterate through the image blocks found
for i, block in enumerate(image_blocks):

# "external_files" points to the image name that will be found in the image_path above for the library
img_name = block["external_files"]

# skip unsupported image formats (e.g., .emf, .wmf, .svg)
if img_name:
_, ext = os.path.splitext(img_name.lower())
if ext not in supported_ocr_formats:
if realtime_progress:
logger.info(f"Parser - ocr_images_in_library - skipping unsupported format: {img_name}")
continue

# each doc_ID is unique for the library collection
doc_id = block["doc_ID"]

Expand Down
38 changes: 38 additions & 0 deletions tests/library/test_ocr_format_filtering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Tests for OCR unsupported format filtering.

Verifies fix for GitHub issue #1108:
.emf and other unsupported image formats should be skipped during OCR
processing to prevent tesseract errors.
"""

import os


def test_supported_format_detection():
    """Check that every extension on the OCR allow-list is accepted.

    The allow-list is written out locally so the test is self-contained.
    One sample filename is provided per allow-listed extension, and the
    test also verifies that the samples cover the whole set, so adding an
    extension to the set without a matching sample fails loudly.
    """
    supported_ocr_formats = {'.png', '.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.bmp', '.ppm', '.pgm', '.pbm', '.webp'}

    supported_files = [
        'image.png', 'photo.jpg', 'picture.jpeg', 'icon.gif', 'scan.tiff', 'doc.bmp',
        'fax.tif', 'pixmap.ppm', 'gray.pgm', 'mono.pbm', 'web.webp',
    ]

    seen_extensions = set()
    for filename in supported_files:
        # Same extraction as the filter: lowercase first, then split off the suffix.
        _, ext = os.path.splitext(filename.lower())
        assert ext in supported_ocr_formats, f"{filename} should be supported"
        seen_extensions.add(ext)

    # Guard against the sample list silently lagging behind the allow-list.
    missing = supported_ocr_formats - seen_extensions
    assert not missing, f"no sample file for extensions: {sorted(missing)}"


def test_unsupported_format_detection():
    """Check that names outside the OCR allow-list are rejected.

    Besides plain vector/icon formats, this covers edge cases of the
    ``os.path.splitext``-based check:

    * a name with no extension yields an empty suffix;
    * a compound name is judged by its final suffix only;
    * a dotfile such as ``.png`` is treated by ``splitext`` as a bare name
      with an empty suffix, so it is not recognised as a PNG.
    """
    supported_ocr_formats = {'.png', '.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.bmp', '.ppm', '.pgm', '.pbm', '.webp'}

    unsupported_files = [
        'vector.emf', 'drawing.wmf', 'graphic.svg', 'icon.ico',
        'noextension',       # suffix is ''
        'archive.png.emf',   # only the last suffix ('.emf') counts
        '.png',              # leading-dot name: splitext returns ('.png', '')
    ]
    for filename in unsupported_files:
        # Same extraction as the filter: lowercase first, then split off the suffix.
        _, ext = os.path.splitext(filename.lower())
        assert ext not in supported_ocr_formats, f"{filename} should not be supported"


def test_case_insensitive_extension():
    """Upper- and mixed-case extensions must match the lowercase allow-list."""
    mixed_case_names = ('IMAGE.PNG', 'Photo.JPG', 'image.Png')
    allowed = {'.png', '.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.bmp', '.ppm', '.pgm', '.pbm', '.webp'}

    for name in mixed_case_names:
        # Lowercasing the whole name before splitting normalises the suffix.
        suffix = os.path.splitext(name.lower())[1]
        assert suffix in allowed, f"{name} should be supported (case-insensitive)"