diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 414640d6..6d0e9926 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -5351,6 +5351,10 @@ def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
coords = coords[:4]
if len(coords) == 4:
l, t, r, b = map(float, coords)
+ eps = 1 / 500
+ # Ignore degenerate boxes: coords are not yet divided by 500, so eps is in raw units.
+ if r - l < eps or b - t < eps:
+ return None
return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
return None
diff --git a/test/data/doc/2408.09869v3_enriched.dt.json b/test/data/doc/2408.09869v3_enriched.dt.json
index 5b3c9eaf..53aa324e 100644
--- a/test/data/doc/2408.09869v3_enriched.dt.json
+++ b/test/data/doc/2408.09869v3_enriched.dt.json
@@ -4960,22 +4960,7 @@
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [
- {
- "page_no": 9,
- "bbox": {
- "l": 670.7520000000001,
- "t": 887.0400000000001,
- "r": 670.7520000000001,
- "b": 899.7119999999999,
- "coord_origin": "TOPLEFT"
- },
- "charspan": [
- 0,
- 1
- ]
- }
- ],
+ "prov": [],
"orig": ",",
"text": ","
},
@@ -5041,22 +5026,7 @@
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [
- {
- "page_no": 9,
- "bbox": {
- "l": 692.784,
- "t": 899.7119999999999,
- "r": 692.784,
- "b": 912.3839999999999,
- "coord_origin": "TOPLEFT"
- },
- "charspan": [
- 0,
- 1
- ]
- }
- ],
+ "prov": [],
"orig": ".",
"text": "."
},
@@ -15297,4 +15267,4 @@
"page_no": 9
}
}
-}
+}
\ No newline at end of file
diff --git a/test/data/doc/2408.09869v3_enriched.out.dt.json b/test/data/doc/2408.09869v3_enriched.out.dt.json
index fec32692..651cb82c 100644
--- a/test/data/doc/2408.09869v3_enriched.out.dt.json
+++ b/test/data/doc/2408.09869v3_enriched.out.dt.json
@@ -4960,22 +4960,7 @@
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [
- {
- "page_no": 9,
- "bbox": {
- "l": 670.7520000000001,
- "t": 887.0400000000001,
- "r": 670.7520000000001,
- "b": 899.7119999999999,
- "coord_origin": "TOPLEFT"
- },
- "charspan": [
- 0,
- 1
- ]
- }
- ],
+ "prov": [],
"orig": ",",
"text": ","
},
@@ -5041,22 +5026,7 @@
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [
- {
- "page_no": 9,
- "bbox": {
- "l": 692.784,
- "t": 899.7119999999999,
- "r": 692.784,
- "b": 912.3839999999999,
- "coord_origin": "TOPLEFT"
- },
- "charspan": [
- 0,
- 1
- ]
- }
- ],
+ "prov": [],
"orig": ".",
"text": "."
},
@@ -16044,4 +16014,4 @@
"page_no": 9
}
}
-}
+}
\ No newline at end of file
diff --git a/test/data/doc/defect_bbox_page.dt b/test/data/doc/defect_bbox_page.dt
new file mode 100644
index 00000000..c93c7a8e
--- /dev/null
+++ b/test/data/doc/defect_bbox_page.dt
@@ -0,0 +1,4 @@
+Assistant:
+This is valid text with a zero-height bounding box.
+This is valid text with a negative-width bounding box.
+
\ No newline at end of file
diff --git a/test/data/doc/defect_bbox_page.dt.json b/test/data/doc/defect_bbox_page.dt.json
new file mode 100644
index 00000000..e176211f
--- /dev/null
+++ b/test/data/doc/defect_bbox_page.dt.json
@@ -0,0 +1,66 @@
+{
+ "body": {
+ "children": [
+ {
+ "$ref": "#/texts/0"
+ },
+ {
+ "$ref": "#/texts/1"
+ }
+ ],
+ "content_layer": "body",
+ "label": "unspecified",
+ "name": "_root_",
+ "self_ref": "#/body"
+ },
+ "form_items": [],
+ "furniture": {
+ "children": [],
+ "content_layer": "furniture",
+ "label": "unspecified",
+ "name": "_root_",
+ "self_ref": "#/furniture"
+ },
+ "groups": [],
+ "key_value_items": [],
+ "name": "Document",
+ "pages": {
+ "1": {
+ "page_no": 1,
+ "size": {
+ "height": 1.0,
+ "width": 1.0
+ }
+ }
+ },
+ "pictures": [],
+ "schema_name": "DoclingDocument",
+ "tables": [],
+ "texts": [
+ {
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "orig": "This is valid text with a zero-height bounding box.",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "prov": [],
+ "self_ref": "#/texts/0",
+ "text": "This is valid text with a zero-height bounding box."
+ },
+ {
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "orig": "This is valid text with a negative-width bounding box.",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "prov": [],
+ "self_ref": "#/texts/1",
+ "text": "This is valid text with a negative-width bounding box."
+ }
+ ],
+ "version": "1.8.0"
+}
\ No newline at end of file
diff --git a/test/test_doctags_load.py b/test/test_doctags_load.py
index 5355c2d1..c24989d4 100644
--- a/test/test_doctags_load.py
+++ b/test/test_doctags_load.py
@@ -168,3 +168,17 @@ def test_doctags_inline():
exp_file=exp,
actual=deser_doc.export_to_dict(),
)
+
+
+def test_doctags_handle_defect_bbox():
+ """Texts whose fixture boxes are zero-height / negative-width load with empty prov."""
+ doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+ [Path("test/data/doc/defect_bbox_page.dt")], None
+ )
+
+ doc = DoclingDocument.load_from_doctags(doctags_doc)
+ exp = "test/data/doc/defect_bbox_page.dt.json"
+ verify(
+ exp_file=exp,
+ actual=doc.export_to_dict(),
+ )