Skip to content

Commit b2095b3

Browse files
authored
fix: use in-OTSL DocTags for rich table cells (#375)
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
1 parent 1d04154 commit b2095b3

File tree

3 files changed

+21
-12
lines changed

3 files changed

+21
-12
lines changed

docling_core/types/doc/document.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,9 @@ def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> s
351351
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
352352

353353
if doc is not None:
354-
doc_serializer = MarkdownDocSerializer(doc=doc)
354+
doc_serializer = kwargs.pop(
355+
"doc_serializer", MarkdownDocSerializer(doc=doc)
356+
)
355357
ser_res = doc_serializer.serialize(item=self.ref.resolve(doc=doc), **kwargs)
356358
return ser_res.text
357359
else:
@@ -1692,6 +1694,9 @@ def export_to_otsl(
16921694
# Headers (column, row, section row):
16931695
# "ched", "rhed", "srow"
16941696

1697+
from docling_core.transforms.serializer.doctags import DocTagsDocSerializer
1698+
1699+
doc_serializer = DocTagsDocSerializer(doc=doc)
16951700
body = []
16961701
nrows = self.data.num_rows
16971702
ncols = self.data.num_cols
@@ -1705,7 +1710,9 @@ def export_to_otsl(
17051710
for i in range(nrows):
17061711
for j in range(ncols):
17071712
cell: TableCell = self.data.grid[i][j]
1708-
content = cell._get_text(doc=doc, **kwargs).strip()
1713+
content = cell._get_text(
1714+
doc=doc, doc_serializer=doc_serializer, **kwargs
1715+
).strip()
17091716
rowspan, rowstart = (
17101717
cell.row_span,
17111718
cell.start_row_offset_idx,

examples/rich_table_cells.ipynb

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,9 @@
117117
"output_type": "stream",
118118
"text": [
119119
"<doctag><title>Rich tables</title>\n",
120-
"<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>- list item 1\n",
121-
"- list item 2<nl><fcel>cell 2,0<fcel>cell 2,1<nl></otsl>\n",
120+
"<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel><unordered_list><list_item>list item 1</list_item>\n",
121+
"<list_item>list item 2</list_item>\n",
122+
"</unordered_list><nl><fcel>cell 2,0<fcel>cell 2,1<nl></otsl>\n",
122123
"</doctag>\n"
123124
]
124125
}
@@ -218,8 +219,9 @@
218219
"name": "stdout",
219220
"output_type": "stream",
220221
"text": [
221-
"<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>- list item 1\n",
222-
"- list item 2<nl><fcel>cell 2,0<fcel>cell 2,1<nl></otsl>\n"
222+
"<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel><unordered_list><list_item>list item 1</list_item>\n",
223+
"<list_item>list item 2</list_item>\n",
224+
"</unordered_list><nl><fcel>cell 2,0<fcel>cell 2,1<nl></otsl>\n"
223225
]
224226
}
225227
],
@@ -237,8 +239,9 @@
237239
"name": "stdout",
238240
"output_type": "stream",
239241
"text": [
240-
"<fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>- list item 1\n",
241-
"- list item 2<nl><fcel>cell 2,0<fcel>cell 2,1<nl>\n"
242+
"<fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel><unordered_list><list_item>list item 1</list_item>\n",
243+
"<list_item>list item 2</list_item>\n",
244+
"</unordered_list><nl><fcel>cell 2,0<fcel>cell 2,1<nl>\n"
242245
]
243246
}
244247
],

test/data/doc/rich_table.out.dt

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
<doctag><title>Rich tables</title>
2-
<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>*text in italic*<nl><fcel>- list item 1
3-
- list item 2<fcel>cell 2,1<nl><fcel>cell 3,0<fcel>| inner cell 0,0 | inner cell 0,1 | inner cell 0,2 |
4-
|------------------|------------------|------------------|
5-
| inner cell 1,0 | inner cell 1,1 | inner cell 1,2 |<nl></otsl>
2+
<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel><text>text in italic</text><nl><fcel><unordered_list><list_item>list item 1</list_item>
3+
<list_item>list item 2</list_item>
4+
</unordered_list><fcel>cell 2,1<nl><fcel>cell 3,0<fcel><otsl><fcel>inner cell 0,0<fcel>inner cell 0,1<fcel>inner cell 0,2<nl><fcel>inner cell 1,0<fcel>inner cell 1,1<fcel>inner cell 1,2<nl></otsl><nl></otsl>
65
</doctag>

0 commit comments

Comments
 (0)