diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 414640d6..6d0e9926 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -5351,6 +5351,10 @@ def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]: coords = coords[:4] if len(coords) == 4: l, t, r, b = map(float, coords) + eps = 1 / 500 + # Ignore bounding boxes with width or height of < 0.2% of the image width or height. + if r - l < eps or b - t < eps: + return None return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500) return None diff --git a/test/data/doc/2408.09869v3_enriched.dt.json b/test/data/doc/2408.09869v3_enriched.dt.json index 5b3c9eaf..53aa324e 100644 --- a/test/data/doc/2408.09869v3_enriched.dt.json +++ b/test/data/doc/2408.09869v3_enriched.dt.json @@ -4960,22 +4960,7 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 670.7520000000001, - "t": 887.0400000000001, - "r": 670.7520000000001, - "b": 899.7119999999999, - "coord_origin": "TOPLEFT" - }, - "charspan": [ - 0, - 1 - ] - } - ], + "prov": [], "orig": ",", "text": "," }, @@ -5041,22 +5026,7 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 692.784, - "t": 899.7119999999999, - "r": 692.784, - "b": 912.3839999999999, - "coord_origin": "TOPLEFT" - }, - "charspan": [ - 0, - 1 - ] - } - ], + "prov": [], "orig": ".", "text": "." }, @@ -15297,4 +15267,4 @@ "page_no": 9 } } -} +} \ No newline at end of file diff --git a/test/data/doc/2408.09869v3_enriched.out.dt.json b/test/data/doc/2408.09869v3_enriched.out.dt.json index fec32692..651cb82c 100644 --- a/test/data/doc/2408.09869v3_enriched.out.dt.json +++ b/test/data/doc/2408.09869v3_enriched.out.dt.json @@ -4960,22 +4960,7 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 670.7520000000001, - "t": 887.0400000000001, - "r": 670.7520000000001, - "b": 899.7119999999999, - "coord_origin": "TOPLEFT" - }, - "charspan": [ - 0, - 1 - ] - } - ], + "prov": [], "orig": ",", "text": "," }, @@ -5041,22 +5026,7 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 692.784, - "t": 899.7119999999999, - "r": 692.784, - "b": 912.3839999999999, - "coord_origin": "TOPLEFT" - }, - "charspan": [ - 0, - 1 - ] - } - ], + "prov": [], "orig": ".", "text": "." }, @@ -16044,4 +16014,4 @@ "page_no": 9 } } -} +} \ No newline at end of file diff --git a/test/data/doc/defect_bbox_page.dt b/test/data/doc/defect_bbox_page.dt new file mode 100644 index 00000000..c93c7a8e --- /dev/null +++ b/test/data/doc/defect_bbox_page.dt @@ -0,0 +1,4 @@ +Assistant: +This is valid text with a zero-height bounding box. +This is valid text with a negative-width bounding box. + \ No newline at end of file diff --git a/test/data/doc/defect_bbox_page.dt.json b/test/data/doc/defect_bbox_page.dt.json new file mode 100644 index 00000000..e176211f --- /dev/null +++ b/test/data/doc/defect_bbox_page.dt.json @@ -0,0 +1,66 @@ +{ + "body": { + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + } + ], + "content_layer": "body", + "label": "unspecified", + "name": "_root_", + "self_ref": "#/body" + }, + "form_items": [], + "furniture": { + "children": [], + "content_layer": "furniture", + "label": "unspecified", + "name": "_root_", + "self_ref": "#/furniture" + }, + "groups": [], + "key_value_items": [], + "name": "Document", + "pages": { + "1": { + "page_no": 1, + "size": { + "height": 1.0, + "width": 1.0 + } + } + }, + "pictures": [], + "schema_name": "DoclingDocument", + "tables": [], + "texts": [ + { + "children": [], + "content_layer": "body", + "label": "text", + "orig": "This is valid text with a zero-height bounding box.", + "parent": { + "$ref": "#/body" + }, + "prov": [], + "self_ref": "#/texts/0", + "text": "This is valid text with a zero-height bounding box." + }, + { + "children": [], + "content_layer": "body", + "label": "text", + "orig": "This is valid text with a negative-width bounding box.", + "parent": { + "$ref": "#/body" + }, + "prov": [], + "self_ref": "#/texts/1", + "text": "This is valid text with a negative-width bounding box." + } + ], + "version": "1.8.0" +} \ No newline at end of file diff --git a/test/test_doctags_load.py b/test/test_doctags_load.py index 5355c2d1..c24989d4 100644 --- a/test/test_doctags_load.py +++ b/test/test_doctags_load.py @@ -168,3 +168,17 @@ def test_doctags_inline(): exp_file=exp, actual=deser_doc.export_to_dict(), ) + + +def test_doctags_handle_defect_bbox(): + + doctags_doc = DocTagsDocument.from_doctags_and_image_pairs( + [Path("test/data/doc/defect_bbox_page.dt")], None + ) + + doc = DoclingDocument.load_from_doctags(doctags_doc) + exp = "test/data/doc/defect_bbox_page.dt.json" + verify( + exp_file=exp, + actual=doc.export_to_dict(), + )