diff --git a/src/ptf/tests/test_html_conversion.py b/src/ptf/tests/test_html_conversion.py index ee7f1b3db98c5a1a8768bac5e573e8bbac206c9c..068141b4d167f19d2f83edc88b884b8b7d95d510 100644 --- a/src/ptf/tests/test_html_conversion.py +++ b/src/ptf/tests/test_html_conversion.py @@ -321,16 +321,21 @@ class InterlinkCreationTestCase(TestCase): "key": "Author et al., 2025", # Second to last date "valid_cases": [ "Lorem ipsum Author et al., 2025 amet, consectetur", - "Lorem ipsum Author et al., 2024</a>, 2025 amet, consectetur", - "Lorem ipsum Author et al., 2024</a>dolor sit amet </span>, 2025 amet, consectetur", - "<tooltipPCJ>Lorem ipsum Author et al., 2024</a>dolor sit amet </span>, 2025 amet, consectetur", - "Lorem ipsum Author et al., 2024</a>, 2025, 2026 amet, consectetur", - "Lorem ipsum Author et al., 2023</a>, 2024</a>, 2025 amet, consectetur", + "Lorem ipsum Author et al., 2024</a> </span></span>, 2025 amet, consectetur", + "Lorem ipsum Author et al., 2024</a>Foo</span></span>dolor sit amet </span>, 2025 amet, consectetur", + "<tooltipPCJ>Lorem ipsum Author et al., 2024</a> </span></span>dolor sit amet </span>, 2025 amet, consectetur", + "Lorem ipsum Author et al., 2024</a>Foo</span></span>, 2025, 2026 amet, consectetur", + "Lorem ipsum Author et al., 2023</a>Foo</span></span>, 2024</a>Bar</span></span>, 2025 amet, consectetur", ], "invalid_cases": [ - "Lorem ipsum Author et al., 2024, 2025 amet, consectetur", # No </a> tag - "Lorem ipsum Author et al., 2024,</a> 2025 amet, consectetur", # Misplaced </a> tag + "Lorem ipsum Author et al., 2024, 2025 amet, consectetur", # No </a> nor </span></span> tag + "Lorem ipsum Author et al., 2024</a>, 2025 amet, consectetur", # No </span></span> enclosing tag + "Lorem ipsum Author et al., 2024</span></span>, 2025 amet, consectetur", # No </a> opening tag + "Lorem ipsum Author et al., 2024,</a> 2025 amet, consectetur", # Misplaced </a> opening tag + "Lorem ipsum Author et al., 2024</a>,</span></span> 2025 amet, consectetur", # Misplaced </span></span> closing tag + "Lorem ipsum Author et al., 2024</a>, </span></span>2025 amet, consectetur", # Misplaced </span></span> closing tag #2 "<tooltipPCJ>Lorem ipsum Author et al., 2024, 2025 amet, consectetur", # No matching </a> tag + "Lorem ipsum Author et al., 2024</a></span></span>, 2025 amet, consectetur", # Nothing between a and span tags ], }, ] diff --git a/src/ptf/utils.py b/src/ptf/utils.py index bdd416ef105acb035ae66b38104d83ac1cfaad48..84f58c56615a59b1b444685696b913f729c6e39e 100644 --- a/src/ptf/utils.py +++ b/src/ptf/utils.py @@ -462,10 +462,12 @@ def build_key_regex_with_allowed_tags(key: str) -> str: if date_index == -1: return "" - separator = r"[;,]? ?" + separator = r"[;,]?\s?" return ( - f"({separator.join(tagged_words[:date_index])}" - r"[;,]? ?\(?(?:([1-2][0-9]{3})\<\/a\>.*?[;,]? ?)*" + f"({separator.join(tagged_words[:date_index])}" # <strong>Author, <strong>et;<strong> al.; <strong> + f"{separator}" + r"\(?(?:([1-2][0-9]{3})\<\/a\>.*?<\/span><\/span>)*" # Optional <a>2024</span></span> + f"{separator}" f"{words[date_index]}" r")" )