From a35bb874d343e285707889d23471aa1c327b59c0 Mon Sep 17 00:00:00 2001
From: Samuel Conjard <samuel.conjard@univ-grenoble-alpes.fr>
Date: Mon, 6 Jan 2025 12:09:57 +0100
Subject: [PATCH 1/2] HTML Conversion : Allow semicolon recognition

---
 src/ptf/utils.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/ptf/utils.py b/src/ptf/utils.py
index 9bedf991..eac53d1b 100644
--- a/src/ptf/utils.py
+++ b/src/ptf/utils.py
@@ -448,7 +448,8 @@ def create_citation_link_and_new_html(key, label, tooltip_html=""):
 
 def build_key_regex_with_allowed_tags(key):
     """
-    Builds a regular expression pattern to match a citation key, allowing specific HTML tags around it.
+    Builds a regular expression pattern to match a citation key, allowing specific HTML tags around it,
+    and ignoring semicolons and commas in the text.
 
     This function constructs a regex pattern that matches the citation key while permitting certain
     HTML tags (`<span class="italique">`, `<strong>`, etc.) to surround it. This ensures that keys
@@ -467,7 +468,7 @@ def build_key_regex_with_allowed_tags(key):
             "(?:<span class=\"italique\">|</span>|<strong>|</strong>)*Author(?:<span class=\"italique\">|</span>|<strong>|</strong>)* et(?:<span class=\"italique\">|</span>|<strong>|</strong>)* al\\."
     """
     allowed_tag_pattern = r'(?:<span class="italique">|</span>|<strong>|</strong>)*'
-    key_pattern = allowed_tag_pattern.join(map(re.escape, key))
+    key_pattern = allowed_tag_pattern.join(map(lambda x: f"{re.escape(x)}[;,]?", key))
     return f"{allowed_tag_pattern}{key_pattern}{allowed_tag_pattern}"
 
 
@@ -526,9 +527,11 @@ def create_interlink_for_citation(html_text, bibliography_dict):
         for transform in transformations:
             transformed_key = transform(key)
 
-            # Build regex for the key with extended match for trailing years and separators
+            # Build the key regex, extended to match trailing years and separators: "," ";" or the "&#59;" entity
             key_regex = build_key_regex_with_allowed_tags(transformed_key)
-            extended_key_regex = f"({key_regex}(?:,? ?(?:\\d{{4}}))*(?:,? ?\\d{{4}})?)"
+            extended_key_regex = (
+                f"({key_regex}(?:(?:[;,]|&#59;)? ?(?:\\d{{4}}))*(?:(?:[;,]|&#59;)? ?\\d{{4}})?)"
+            )
 
             pattern = re.compile(extended_key_regex, re.IGNORECASE)
 
-- 
GitLab


From bf96c2f27330babc2d8db39a2e0f1df30675c625 Mon Sep 17 00:00:00 2001
From: Samuel Conjard <samuel.conjard@univ-grenoble-alpes.fr>
Date: Mon, 6 Jan 2025 14:52:29 +0100
Subject: [PATCH 2/2] Better pattern

---
 src/ptf/tests/test_html_conversion.py | 2 +-
 src/ptf/utils.py                      | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ptf/tests/test_html_conversion.py b/src/ptf/tests/test_html_conversion.py
index 3e8d39e1..8b80c19a 100644
--- a/src/ptf/tests/test_html_conversion.py
+++ b/src/ptf/tests/test_html_conversion.py
@@ -26,6 +26,6 @@ class InterlinkCreationTestCase(TestCase):
                 "cited_in_article": False,
             }
         }
-        expected_html = 'Mauris lacinia tortor molestie purus luctus, <span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" >Author et al., 2024</a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 
0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span>; aliquet fermentum odio porttitor'
+        expected_html = 'Mauris lacinia tortor molestie purus luctus, <span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" ><span class="tooltipPCJ"><a id="1" href="#r1" onclick="highlightReference(\'r1\', 3000, 500)" >Author et al., 2024</a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 
0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span></a><span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden"><a href=#1></a> <span class="citation-author">A. Author; B. Foo; C. D. Bar</span><span class="citation-document-title"> Lorem ipsum sid dolor amet: Consectetur adipiscing elit</span><span class="citation-publication-title">, SITE</span><span class="citation-volume">, Volume 1</span> (2024) no. 1, pp. 0000-0000 | <a href=https://doi.org/00.0000/site.0000.0000 target="_blank">DOI</a></span></span>; aliquet fermentum odio porttitor'
         returned_html = create_interlink_for_citation(input_html, input_bibliography_dict)
         assert returned_html == expected_html
diff --git a/src/ptf/utils.py b/src/ptf/utils.py
index eac53d1b..5b99d0ab 100644
--- a/src/ptf/utils.py
+++ b/src/ptf/utils.py
@@ -469,6 +469,7 @@ def build_key_regex_with_allowed_tags(key):
     """
     allowed_tag_pattern = r'(?:<span class="italique">|</span>|<strong>|</strong>)*'
     key_pattern = allowed_tag_pattern.join(map(lambda x: f"{re.escape(x)}[;,]?", key))
+    key_pattern = key_pattern.removesuffix("[;,]?")  # strip the suffix, not a char set
     return f"{allowed_tag_pattern}{key_pattern}{allowed_tag_pattern}"
 
 
-- 
GitLab