From a35bb874d343e285707889d23471aa1c327b59c0 Mon Sep 17 00:00:00 2001 From: Samuel Conjard <samuel.conjard@univ-grenoble-alpes.fr> Date: Mon, 6 Jan 2025 12:09:57 +0100 Subject: [PATCH 1/2] HTML Conversion : Allow semicolon recognition --- src/ptf/utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ptf/utils.py b/src/ptf/utils.py index 9bedf991..eac53d1b 100644 --- a/src/ptf/utils.py +++ b/src/ptf/utils.py @@ -448,7 +448,8 @@ def create_citation_link_and_new_html(key, label, tooltip_html=""): def build_key_regex_with_allowed_tags(key): """ - Builds a regular expression pattern to match a citation key, allowing specific HTML tags around it. + Builds a regular expression pattern to match a citation key, allowing specific HTML tags around it, + and ignoring semicolons and commas in the text. This function constructs a regex pattern that matches the citation key while permitting certain HTML tags (`<span class="italique">`, `<strong>`, etc.) to surround it. This ensures that keys @@ -467,7 +468,7 @@ def build_key_regex_with_allowed_tags(key): "(?:<span class=\"italique\">|</span>|<strong>|</strong>)*Author(?:<span class=\"italique\">|</span>|<strong>|</strong>)* et(?:<span class=\"italique\">|</span>|<strong>|</strong>)* al\\." """ allowed_tag_pattern = r'(?:<span class="italique">|</span>|<strong>|</strong>)*' - key_pattern = allowed_tag_pattern.join(map(re.escape, key)) + key_pattern = allowed_tag_pattern.join(map(lambda x: f"{re.escape(x)}[;,]?", key)) return f"{allowed_tag_pattern}{key_pattern}{allowed_tag_pattern}" @@ -526,9 +527,11 @@ def create_interlink_for_citation(html_text, bibliography_dict): for transform in transformations: transformed_key = transform(key) - # Build regex for the key with extended match for trailing years and separators + # Build regex for the key with extended match for trailing years and separators (commas and semicolons) key_regex = build_key_regex_with_allowed_tags(transformed_key) - extended_key_regex = f"({key_regex}(?:,? ?(?:\\d{{4}}))*(?:,? ?\\d{{4}})?)" + extended_key_regex = ( + f"({key_regex}(?:[;,;]? ?(?:\\d{{4}}))*(?:[;,;]? ?\\d{{4}})?)" + ) pattern = re.compile(extended_key_regex, re.IGNORECASE) -- GitLab From bf96c2f27330babc2d8db39a2e0f1df30675c625 Mon Sep 17 00:00:00 2001 From: Samuel Conjard <samuel.conjard@univ-grenoble-alpes.fr> Date: Mon, 6 Jan 2025 14:52:29 +0100 Subject: [PATCH 2/2] Better pattern --- src/ptf/tests/test_html_conversion.py | 2 +- src/ptf/utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ptf/tests/test_html_conversion.py b/src/ptf/tests/test_html_conversion.py index 3e8d39e1..8b80c19a 100644 --- a/src/ptf/tests/test_html_conversion.py +++ b/src/ptf/tests/test_html_conversion.py @@ -26,6 +26,6 @@ class InterlinkCreationTestCase(TestCase): "cited_in_article": False, } } - expected_html = 'Mauris lacinia tortor molestie purus luctus, <span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" >Author et al., 2024</a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span>; aliquet fermentum odio porttitor' + expected_html = 'Mauris lacinia tortor molestie purus luctus, <span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" >Author et al., 2024</a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span>; aliquet fermentum odio porttitor' returned_html = create_interlink_for_citation(input_html, input_bibliography_dict) assert returned_html == expected_html diff --git a/src/ptf/utils.py b/src/ptf/utils.py index eac53d1b..5b99d0ab 100644 --- a/src/ptf/utils.py +++ b/src/ptf/utils.py @@ -469,6 +469,7 @@ def build_key_regex_with_allowed_tags(key): """ allowed_tag_pattern = r'(?:<span class="italique">|</span>|<strong>|</strong>)*' key_pattern = allowed_tag_pattern.join(map(lambda x: f"{re.escape(x)}[;,]?", key)) + key_pattern = key_pattern.rstrip("[;,]?") return f"{allowed_tag_pattern}{key_pattern}{allowed_tag_pattern}" -- GitLab