llmware-ai · majiayu000 · Dec 30, 2025
diff --git a/llmware/parsers.py b/llmware/parsers.py
@@ -3519,12 +3519,23 @@ def ocr_images_in_library(self, add_to_library=False, chunk_size=400, min_size=1
         doc_update_list = {}
         new_text_created = 0
 
+        #   tesseract-supported image formats (excludes vector formats like .emf, .wmf, .svg)
+        supported_ocr_formats = {'.png', '.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.bmp', '.ppm', '.pgm', '.pbm', '.webp'}
+
         #   iterate through the image blocks found
         for i, block in enumerate(image_blocks):
 
             #   "external_files" points to the image name that will be found in the image_path above for the library
             img_name = block["external_files"]
 
+            #   skip unsupported image formats (e.g., .emf, .wmf, .svg)
+            if img_name:
+                _, ext = os.path.splitext(img_name.lower())
+                if ext not in supported_ocr_formats:
+                    if realtime_progress:
+                        logger.info(f"Parser - ocr_images_in_library - skipping unsupported format: {img_name}")
+                    continue
+
             #   each doc_ID is unique for the library collection
             doc_id = block["doc_ID"]
 

diff --git a/tests/library/test_ocr_format_filtering.py b/tests/library/test_ocr_format_filtering.py
@@ -0,0 +1,38 @@
+"""Tests for OCR unsupported format filtering.
+
+Verifies fix for GitHub issue #1108:
+.emf and other unsupported image formats should be skipped during OCR
+processing to prevent tesseract errors.
+"""
+
+import os
+
+
+def test_supported_format_detection():
+    """Check that a sample file for every supported OCR extension is recognised."""
+    supported_ocr_formats = {'.png', '.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.bmp', '.ppm', '.pgm', '.pbm', '.webp'}
+    #   one sample per extension, so no entry of the supported set goes unexercised
+    supported_files = ['image.png', 'photo.jpg', 'picture.jpeg', 'icon.gif', 'scan.tiff', 'page.tif', 'doc.bmp', 'map.ppm', 'gray.pgm', 'mask.pbm', 'web.webp']
+    for filename in supported_files:
+        _, ext = os.path.splitext(filename.lower())
+        assert ext in supported_ocr_formats, f"{filename} should be supported"
+
+
+def test_unsupported_format_detection():
+    """Vector and icon formats must fall outside the supported OCR extension set."""
+    ocr_extensions = {'.png', '.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.bmp', '.ppm', '.pgm', '.pbm', '.webp'}
+
+    #   extension is taken from the lower-cased name via splitext
+    for filename in ('vector.emf', 'drawing.wmf', 'graphic.svg', 'icon.ico'):
+        suffix = os.path.splitext(filename.lower())[1]
+        assert suffix not in ocr_extensions, f"{filename} should not be supported"
+
+
+def test_case_insensitive_extension():
+    """Upper- and mixed-case extensions must still match the supported set."""
+    ocr_extensions = {'.png', '.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.bmp', '.ppm', '.pgm', '.pbm', '.webp'}
+
+    #   the name is lower-cased before splitext, so the original casing is irrelevant
+    for filename in ('IMAGE.PNG', 'Photo.JPG', 'image.Png'):
+        suffix = os.path.splitext(filename.lower())[1]
+        assert suffix in ocr_extensions, f"{filename} should be supported (case-insensitive)"