From a311601c74e9478e5a687db6fb369bfcb7f6af37 Mon Sep 17 00:00:00 2001
From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr>
Date: Tue, 21 Jan 2025 16:25:13 +0000
Subject: [PATCH 1/8] feat : csis crawler

---
 src/crawler/by_source/csis_crawler.py         |  215 ++
 src/crawler/factory.py                        |    2 +
 .../tests/data/by_source/CSIS_test_data.json  | 1648 +++++++++++++++++
 .../data_generation/generate_test_data.py     |   13 +-
 src/crawler/tests/fixtures.py                 |    2 +
 5 files changed, 1876 insertions(+), 4 deletions(-)
 create mode 100644 src/crawler/by_source/csis_crawler.py
 create mode 100644 src/crawler/tests/data/by_source/CSIS_test_data.json

diff --git a/src/crawler/by_source/csis_crawler.py b/src/crawler/by_source/csis_crawler.py
new file mode 100644
index 0000000..d357177
--- /dev/null
+++ b/src/crawler/by_source/csis_crawler.py
@@ -0,0 +1,215 @@
+from urllib.parse import urljoin
+
+import regex
+from bs4 import BeautifulSoup, Tag
+from ptf.model_data import (
+    ContributorDict,
+    create_abstract,
+    create_articledata,
+    create_contributor,
+    create_issuedata,
+    create_subj,
+)
+
+from crawler.base_crawler import BaseCollectionCrawler
+from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
+
+
+class CsisCrawler(BaseCollectionCrawler):
+    """Crawler for "Computer Science and Information Systems" (www.comsis.org)."""
+
+    source_name = "Computer Science and Information Systems"
+    source_domain = "CSIS"
+    source_website = "http://www.comsis.org/"
+
+    # Matches issue headings of the form "Volume 3, Issue 2 (December 2006)".
+    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"
+
+    def parse_collection_content(self, content):
+        """Return one issue stub per ``#content > p`` tag of the archive page.
+
+        Each stub gets a temporary pid (``<collection_id>_TEMPPID_<index>``);
+        parse_issue_content assigns the definitive pid once the issue is parsed.
+        """
+        xissues = []
+        soup = BeautifulSoup(content, "html.parser")
+        col_issue_tags = soup.select("#content > p")
+        for index, tag in enumerate(col_issue_tags):
+            xissue = self.parse_col_issue_tag(tag)
+            xissue.pid = self.collection_id + "_TEMPPID_" + str(index)
+            xissues.append(xissue)
+        return xissues
+
+    def parse_col_issue_tag(self, col_issue_tag: Tag):
+        """Build an issue stub from one archive entry; only its url is set here."""
+        issue_title = col_issue_tag.select_one("a.hidden")
+        if not issue_title:
+            raise ValueError("Couldn't parse issue link")
+        issue_href = issue_title.get("href")
+        if not isinstance(issue_href, str):
+            raise ValueError("Couldn't parse issue href")
+        xissue = create_issuedata()
+        xissue.url = urljoin(self.source_website, issue_href)
+        return xissue
+
+    def parse_issue_content(self, content, xissue):
+        """Fill in xissue (volume, number, year, pid, articles) from an issue page.
+
+        Raises ValueError when the page does not have the expected structure.
+        """
+        soup = BeautifulSoup(content, "html.parser")
+
+        content_tag = soup.select_one("#content")
+        if not content_tag:
+            raise ValueError("Couldn't find issue content")
+        title_tag = content_tag.select_one("h1")
+        if not title_tag:
+            raise ValueError("Couldn't find issue title")
+
+        title_search = regex.search(self.issue_re, title_tag.text)
+        if not title_search:
+            raise ValueError("Couldn't parse issue title")
+        title_group = title_search.groupdict()
+
+        xissue.number = title_group["number"]
+        xissue.volume = title_group["volume"]
+        xissue.year = title_group["year"]
+
+        xissue.pid = self.get_issue_pid(
+            self.collection_id, title_group["year"], title_group["volume"], title_group["number"]
+        )
+
+        for index, article_tag in enumerate(content_tag.select("p")):
+            if article_tag.text == "Editorial":
+                continue
+
+            article_title = article_tag.select_one("a.hidden")
+            if not article_title:
+                raise ValueError("Couldn't parse article link")
+            article_href = article_title.get("href")
+            if not isinstance(article_href, str):
+                raise ValueError("Couldn't parse article href")
+
+            xarticle = create_articledata()
+            xarticle.url = urljoin(self.source_website, article_href)
+            xarticle.pid = "a" + str(index)
+            xissue.articles.append(xarticle)
+
+    def parse_article_content(self, content, xissue, xarticle, url, pid):
+        """Fill in xarticle from an article page and return it.
+
+        Title, authors and the pdf link are mandatory; abstract, DOI and key
+        words are read when the matching section is present. ValueError is
+        raised when the page does not have the expected structure.
+        """
+        xarticle.pid = pid
+
+        soup = BeautifulSoup(content, "html.parser")
+        content_tag = soup.select_one("#content")
+        if not content_tag:
+            raise ValueError("Couldn't parse article content")
+        id_tag = content_tag.select_one("p.id")
+        if id_tag:
+            id_tag.decompose()
+
+        # Title
+        title_tag = content_tag.select_one(".title")
+        if not title_tag:
+            raise ValueError("Couldn't find title")
+        xarticle.title_tex = title_tag.text
+        title_tag.decompose()
+
+        # Authors
+        authors_tag = content_tag.select_one(".authors")
+        if not authors_tag:
+            raise ValueError("Couldn't find authors")
+        current_contributor: ContributorDict | None = None
+        for c in authors_tag.children:
+            if isinstance(c, str):
+                author_str = cleanup_str(c)
+                if author_str == "":
+                    continue
+                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
+                current_contributor = create_contributor(role="author", string_name=author_str)
+                xarticle.contributors.append(current_contributor)
+                continue
+
+            if not isinstance(c, Tag):
+                continue
+            if current_contributor is None:
+                raise ValueError("Couldn't find author")
+
+            if c.name == "sup":
+                # affiliations
+                continue
+            if c.name == "a":
+                orcid_href = c.get("href")
+                if not isinstance(orcid_href, str):
+                    print("Couldn't parse contributor orcid")
+                    continue
+                if not orcid_href.startswith("https://orcid.org/"):
+                    print(
+                        "Couldn't parse contributor orcid : orcid must start with https://orcid.org/"
+                    )
+                    continue
+                current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
+        authors_tag.decompose()
+
+        # Affiliations
+        affiliations_tag = content_tag.select_one("ol")
+        if affiliations_tag:
+            affiliations_tag.decompose()
+
+        # What is left must be a flat sequence of h3 headers, each followed by
+        # p paragraphs; the last paragraph seen under a header wins.
+        current_header: str | None = None
+        categories: dict[str, Tag] = {}
+        for tag in content_tag.find_all(recursive=False):
+            if tag.name == "h3":
+                current_header = tag.text
+                continue
+            if tag.name == "p":
+                if current_header is None:
+                    raise ValueError("Couldn't parse article content")
+                categories[current_header] = tag
+                continue
+            raise ValueError("Found foreign tag in article content")
+        del current_header
+
+        # Abstract
+        if "Abstract" in categories:
+            xabstract = create_abstract(
+                tag="abstract", value_tex=categories["Abstract"].text, lang="en"
+            )
+            xarticle.abstracts.append(xabstract)
+
+        # PDF
+        if "Full text" not in categories:
+            raise ValueError("Couldn't find full text section")
+        pdf_tag = categories["Full text"].select_one("a.download")
+        if not pdf_tag:
+            raise ValueError("Couldn't find pdf url")
+        pdf_url = pdf_tag.get("href")
+        if not isinstance(pdf_url, str):
+            raise ValueError("Couldn't parse pdf url")
+        add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
+
+        # DOI
+        if "Digital Object Identifier (DOI)" in categories:
+            doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
+            if not doi_tag:
+                raise ValueError("Couldn't find doi url")
+            doi_url = doi_tag.get("href")
+            if not isinstance(doi_url, str):
+                raise ValueError("Couldn't parse doi url")
+            if not doi_url.startswith("https://doi.org/"):
+                raise ValueError("Malformed DOI url")
+            doi_url = doi_url.removeprefix("https://doi.org/")
+            xarticle.doi = doi_url
+
+        # Keywords
+        if "Key words" in categories:
+            keywords = categories["Key words"].text.split(", ")
+            for k in keywords:
+                xarticle.kwds.append(create_subj(value=k, lang="en"))
+        return xarticle
diff --git a/src/crawler/factory.py b/src/crawler/factory.py
index 5aa2ab0..8c880c2 100644
--- a/src/crawler/factory.py
+++ b/src/crawler/factory.py
@@ -6,6 +6,7 @@ from .by_source.ami_crawler import AmiCrawler
 from .by_source.amp_crawler import AmpCrawler
 from .by_source.arsia_crawler import ArsiaCrawler
 from .by_source.bdim_crawler import BdimCrawler
+from .by_source.csis_crawler import CsisCrawler
 from .by_source.da_crawler import DaCrawler
 from .by_source.dmlbul_crawler import DmlbulCrawler
 from .by_source.dmlcz_crawler import DmlczCrawler
@@ -28,6 +29,7 @@ crawler_classes = (
     AmpCrawler,
     ArsiaCrawler,
     BdimCrawler,
+    CsisCrawler,
     DaCrawler,
     DmlbulCrawler,
     DmlczCrawler,
diff --git a/src/crawler/tests/data/by_source/CSIS_test_data.json b/src/crawler/tests/data/by_source/CSIS_test_data.json
new file mode 100644
index 0000000..d17e144
--- /dev/null
+++ b/src/crawler/tests/data/by_source/CSIS_test_data.json
@@ -0,0 +1,1648 @@
+{
+    "crawl_article": {
+        "CSIS": {
+            "CSIS": {
+                "CSIS_TEMPPID_61": {
+                    "a2": {
+                        "input": {
+                            "article": {
+                                "abstracts": [],
+                                "bibitems": [],
+                                "contributors": [],
+                                "date_published_iso_8601_date_str":
null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a2", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-3210" + }, + "issue": { + "articles": [ + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a1", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-2407" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a2", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-3210" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a3", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-1703" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a4", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-0419" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + 
"pid": "a5", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-2003" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a6", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-0702" + } + ], + "number": "", + "pid": "CSIS_TEMPPID_61", + "url": "http://www.comsis.org/archive.php?show=vol0302", + "volume": "", + "year": "" + }, + "urls": { + "http://www.comsis.org/archive.php?show=pprnst-3210": "eJylV91u2zYUvg/QdzjVzVLAluw47dLM0drayeIiTYI5WdergZIoiyklqiRlx8Xu9xp5lmAPtkNSsuUkLloMuYhJnv+f7xwNn48vRlefLo8h0zmHy+t3Z5MReN0g+DgYBcH4agx/nl59OIO+34OplizWQXB87oGXaV0eBsFisfAXA1/IWXD1e3BrpPQNW/2zqyyPn+jEC3eGVsltzgt19ISA/uvXrx0f0iIxJQny5FQTMMRd+qVi8yNvJApNC929WpbUg9idjjxNb3VgmH+BOCNSUX1U6bR74EGAUjTTnIYjkU8nU/gb8EdZaSrv76Yxo0VM7+9IkdzfTYpUyJxoJgp8WipNczUMHPPOkLPiM0jKjzyll5yqjFLtgUY7avWxUh5kkqZHnqQqsFR9394aIwLn0s4wEskSzyqWrNRtATdkTtytB0rGTszi619aCK5Z6d8oLxwGjuL7BSjOEvqQN2FzYMmRR7gLd3M2NlJps9UPfyxOyIBOoqAQhs+7XXCioNtty88JKzYUKjQuIlZjGb6NRIUOZRRuRCULwodBiQ8Vt+EPh6QOLysSeuuXWemFpyKnUJIZHQYEHUSqTVJTIQRr0BKP3AHY2oEtbITlyvG8xV+AboOKRblNizWIFTPHMqlP36GnFJxhbGtdxwnTQjLCobnf7pUSUrMqbxyzpb2+3sIXCSKTWtl7F2Jwd1sYclKQ2cqxD/XJ8bRYApuiMjwREkilMyHV1tStQtIEa3UB+Mu2tGRRpa2MJ42qwUPcoCd56leF8knsSxWs6iKIRa4YlvwlKbEGVRXlTKntaVgT1EnHIMYc6339gGUpRTXL7u8u3k+3pqVcSjbLVuVWH0FLUqgULTG+bmFOxKLggiSOd1yfQNH4Qf20gy1tk20LNpFxxubUSPxVZWJxxJUu6MIzrDrDGJlcEufrtmA/kjEXfK/f20f3KikRfAHjU21rjBa3iao9bHVlzuhiuzM/kPU/UJDtWSezlX/1hPJzuviWTlXPqZTENBLis49aAtdwdQsFHkIey2drzE0jvyxmHhCOs+nknReEUNP+pOCkFvRN2ErLuoYQoZVtDFXS2ECDDfe2bBVCs5TFrf5C7xwLtN8ehaEF3DUkP0DuetQaoDZE745/m5w7kmf4Nywh5kThZGc466/HI+j19v2fBwf+yz0TWxwnDYGdp4h1c8Ir1/YCM2VmDZFMmWMKU2oKB44LRBuq4FqZQ
jVz4Wx6CR8oAkxiB85abY06Bta+ChhXN7mYs3ioqjLs49zDf7YgTglTGYN3hLWfrIWChyaUY4p26NwUNhrSjECoJyBMSQEn2M0xw3mASxHRFK4LrGqpmF4OIxmEN2jBG5WqyqdJ1YHMqvQjwt7McPxxUz829MPAqMwGOPZwUcK5hC4N0KXwI040rCJjAkE8zPGQ0UKhEshFQrkthy8VKTCUGEG8putgGi/jjWAqF0zqgunDVdbIYQoiomgCFt5cdHMbXVdxdrS3hDteZGHxGqeZ04LlDaQspSCoCwUbeWQ2k3RGGhIsZbNAoTzjmXbs1JASjQ8pR6iDtCos4BGO8exApUjE3E/jGaK5HRcmFU+4NjF+oHKMXmmEGaNWrqYV50s7I5kypQwLpjNr52R6Aa/7e69AaVSCo825L1K9INIkQyQVSmsH4rJlCE5FvkShxiITeop5MVnZtG4j2sawjWeIsLyynMjPttRx44Pd6TFevnDhQWbClYCEmh0uQinM5aw0E853PTYIL6uI1y2+uX24ylrhRPAUrPcGvT0ET8Er3KkGHZhY2NiD3TGNaR5hF+z1eq9eWOwwhf4JHTA+t5QeWhL7OplOz/G433vV7ff6B7B7UeAKTV/YR8uCK7Q8hHp3GbV2F+fLCeYLzF7bsr7u9mZWNvt2maTWEwSqQunuAAcUrrsGkmsGTtP2TmzpVwDdzFqE6bdz7FAScWrCezk+scYOaR5eom32fiziyqLDiQ3uMMBHh6bO6lOxwPRBzDRt7G7QqAPv/Y7Bng6c+ofwfxHQf4xOpkXaC1W9nHcAk+qblJ4LH/Y6MOh3X77yYbfOZxk+23GwDsfnY7Cw3p4INfbXE6H1YNb5B3MiFQItwm+W0muEPHz06pt2bkKcihXXS+u78wbNvpIzDDgiK70hcBFJkmAc//2HGFcwx70eOjRnCMpJByMmI0Y60NpdGdfi0C0Fb1b7ghc+vHHpW1u6YVdTqQlEy8e1ChUuH9JVCXFfh9gKCNbU21xbYtzUDCig7hy57SdvTamCaNkt4m6RBPt+D5eJUU1r1BlieKtXUNs9F4W5ptIsA3gaU8nmllwBsmP6McIFcRAKZ04FGujawdb7I/lnjcn2m9V8Jkj0qrtgic4Oe3XfNLsQ85/0ZdOJg4PbQd+2WBBuCS/unAiuWAz3d81rq7JcpTyuODNDXHm6z2jzXY2f/eHOfwO/sKc=" + } + }, + "metadata": { + "collection_url": "http://www.comsis.org/archive.php" + }, + "output": { + "abstracts": [ + { + "lang": "en", + "tag": "abstract", + "value_tex": "We present a comprehensive model for quantitative evaluation and comparison of search engines. The model is based on the LSP method for system evaluation. The basic contribution of our approach is the aggregation of all relevant attributes that reflect functionality, usability, and performance of search engines. In this respect our model is fully consistent with the ISO 9126 standard for software product evaluation. Performance analysis of competitive search engines is based on our search engine benchmarking tool (SEben) that is also described in the paper." 
+ } + ], + "bibitems": [], + "contributors": [ + { + "role": "author", + "string_name": "Jozo Dujmovic" + }, + { + "role": "author", + "string_name": "Haishi Bai" + } + ], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [ + { + "location": "http://www.comsis.org/pdf.php?id=nst-3210", + "rel": "article-pdf" + }, + { + "location": "http://www.comsis.org/archive.php?show=pprnst-3210", + "rel": "source" + } + ], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "CSIS_TEMPPID_61_a2", + "streams": [ + { + "location": "http://www.comsis.org/pdf.php?id=nst-3210", + "mimetype": "application/pdf", + "rel": "full-text", + "text": "Full Text" + } + ], + "title_tex": "Evaluation ond Comparison of Search Engines Using the LSP Method", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-3210" + } + }, + "a5": { + "input": { + "article": { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a5", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-2003" + }, + "issue": { + "articles": [ + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a1", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-2407" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a2", + "streams": [], + "title_tex": "", + "trans_title_html": "", + 
"trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-3210" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a3", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-1703" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a4", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-0419" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a5", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-2003" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a6", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-0702" + } + ], + "number": "", + "pid": "CSIS_TEMPPID_61", + "url": "http://www.comsis.org/archive.php?show=vol0302", + "volume": "", + "year": "" + }, + "urls": { + "http://www.comsis.org/archive.php?show=pprnst-2003": 
"eJylWM9y27gZv2dm3+FbXprMSKRspxnbldXYlt1oJ2u7ldPtnjogCYmwQYIFQMnK9LC3Pe0D9JZnyeyT9En6A0BJtB1ldqfjg0ni+/v7/kLDb8fX57c/3lxQYUtJNx/O3k/OKeonyQ8H50kyvh3TP97dfv+e9uIBTa0WmU2Si6uIosLa+jhJlstlvDyIlZ4nt39LHpyUPcfWPvaN54lzm0ejF0Ov5KGUlTn5goC9o6OjwAdaEHOWg6fklpEj7vN/NWJxEp2ryvLK9m9XNY8oC28nkeUPNnHMf6KsYNpwe9LYWf8wogRSrLCSj85VOZ1M6d+Eh7qxXH/+NM0ErzL++ROr8s+fJtVM6ZJZoSocrYzlpRkmgfnFUIrqnjSXJ5GxK8lNwbmNyMKOVn1mTESF5rOTSHOTeKq92H91RiTBpRfDVOUrvJtMi9p2BdyxBQtfIzI6C2KWH/9plZJW1PGdiUbDJFD8dgFGipw/5c3FgkR+EjEZ4F6/Oxu59tHaG/0+nMAAJyFoRMNv+30Koqjf78ovmageKTQwLmVeYz06TVUDhwpOd6rRFZPDpMZBIz38oyFr4RVVzh/iuqij0TtVcqrZnA8TBgdB9ZjUZQhDDnri8/BCYuvADjYmShN4TvFEcJtMpupdWrxBopoHlkn79hv01EoKYNvqusiFVVowSevvu70ySlvRlGvHfGpvP+/gSxXTeavsuwAxhW87GEpWsfnGse/bt8DTYUl8iOrRpdLEGlsobXaGbgPJGqzNB8KTL2kt0sZ6GV80qm0e6g6elLO4qUzMslibZJMXSaZKI5DyN6xGDpomLYUxu8OwJWiDDhAziXzfHiAttWrmxedP199Nd4alXmkxLzbp1r6S1awyM1jifN3BnKtlJRXLA++4fSPDsyf50wVb+yLbBTbTWSEW3En8synU8kQaW/Fl5FhtAYxcLFnwdRfYz2QslNzfG7yGe43WaL4EfJpdhdHhdqj6l52uLARf7nbmd0T97xDkazbI7MTffEH5FV9+Tadp59SMZTxV6j6GliQUXFtCSYSWJ8r5tufO0riu5hExidl0eRYlI2pp/2DoshX01bY1q9scQoc2vjBMzTPXGjzcu6JVKStmIuvUF7wLLNQ9ewZDp3G3LflJ525HrWvUjujs4i+Tq0DyDf6GNWWSGUx2gVn/YXxOg8Hr+PXRvgMWs2R96oep71clSuoHntJpXcvWKDq1aM/35GZ76LlTnjVa2BV9MC5VT6c38dXFrZ81W41tw4lGZ+qOVQB6IX79Rfz689A09WgPQw//vLQxd+dTUT459DYqOXJIIjiNhEI1o2s9Z5X46E0D7u0MNMNUJ6Ob6zP6437PP6dObXwHtZnI3s4x4qTLkR7lBqqytzOEIp27ZF01HvNh4pQVB5h32JAwkODQARwaOTzYFg/jij/4Lwypijur3HQslbGUs6ayDhXLzL0hq3K26lHKM9YYT7lhNoWYWZppVZJUqDCSfMGRVaCZTK/pejqBxJxLyOiqD2Q9D11HbNaWvRG2CXSioskt8WohtKpKnMXrSIEeXc9gnEHtrJFUcqxnlTClMxjlWbkVAS5BNPPBR643VkgOT7AKCAsaLH6oKTJu0t9Xail5PveWiLKW3OlzKDzBbuN9TLcF0Kv9IJipDF44LDfq2BwrCeB8in2PuIBh2tnpSCiHFdgdUl7xGexKVxhVUrruDO21FgtmeXfik/K8uTAslRxR0nPuUcO0pwnix902lsIaF1K7VCGs2bY0uu605h7T9K/vaVLdhangY3OulTFIa6if+v0OBoWowe+UGZ57f32V/Pen/wALDuAr10v4A559Wrsg7oYQ5oJUOqQIM8D55bJhHWW81hr5n/NnMD6KA/Ri3PAQSkBWZQIR9ImIQlBfiSItC5EVVLJ7Tk3tEasxuBxnzme8gjGd1KqRL0gLnh/DbEjwIVoinEhpBoxdtJBjcyDDskyhjnqtJtCBE0HCnEKF4fLxEo2vatF295l
q/spjC4ddRvGHjNf+EMpz6QS8XHpHnY2Vm0M1SBmMFx3DAF+noXVt9zsQaDST4iMItwqkmvvlK2j3gjxIrzy05hGiBZc1aSZMsMO1c1sw26YR7C6Yr8DM546T6HZHlBqEytV2CeOOKOXWui1qHcnQ0w9GN026CdSjVTd0s81QSr60QwwOBvuY1Eo2yIWDHk38jNqnl2Oe8TKFvv3B4M0rP6hcl/2RwweEu6P02JP408l0eoXX14M3/b3B3iG9vK4QC/4qNGvHgvuaPqZ2UT7vLMrBl8tGov/hEtWxvp0v68Vsfbmr85n3BFMRjaMPGw5wt3Lzv2WQfNa9gHn6zTawXuywE5wuMCp8c0Dx3YwvvbFDXo5uYJv/Pka/cg2OLj24wwSHYXQHq9+FWsxQ+2u7N+OvR2dxr512PRrH3vn/Y+rGm3vzehR64u7+3t4Fe4Swxi6oVyqm/R4dHvSP3sT0so1oPfrmRdgi6OJqTH6L6C4g7arRLiCdA3d7fLKWzJSCRbgi19FayNPDqP3SjU53zq8He49u9RyQuxZ9x+g61Sx3YP7MnCvYeQcDOLQQNGU5gOU6FaxHnauSkFYdhx307WY9jUZPv4QAbi19ZNc6V3M3YJ5lKzVuYIY8YeHHCITRtb/o8Zac4WLgeliYJsb/wtJSmiRd9ausX+XJ63iA3fW8pW3zw7iE8MWPkPavVOU+c+12T7yNuZ90IDcEdoQfCFfrHel9UAEDQ0H4jH8m//3aZP8TibuVanjVX4rcFseDtnLWq7eIv+jLYycODx8O9nyRJaMd8OKKg9GDZPj8aX3ayayQKc8zDtt3m57hVxv3M44tsTD/D2YfmVs=" + } + }, + "metadata": { + "collection_url": "http://www.comsis.org/archive.php" + }, + "output": { + "abstracts": [ + { + "lang": "en", + "tag": "abstract", + "value_tex": "Web applications security is one of the most daunting tasks today, because of security shift from lower levels of ISO OSI model to application level, and because of current situation in IT environment. ASP.NET offers powerful mechanisms to render these attacks futile, but it requires some knowledge of implementing Web application security. This paper focuses on attacks against Web applications, either to gain direct benefit by collecting private information or to disable target sites. It describes the two most common Web application attacks: SQL Injection and Cross Site Scripting, and is based on author’s perennial experience in Web application security. It explains how to use ASP.NET to provide Web applications security. 
There are some principles of strong Web application security which make up the part of defense mechanisms presented: executing with least privileged account, securing sensitive data (connection string) and proper exception handling (where the new approach is presented using ASP.NET mechanisms for centralized exception logging and presentation). These principles help raise the bar that attacker has to cross and consequently contribute to better security." + } + ], + "bibitems": [], + "contributors": [ + { + "role": "author", + "string_name": "Bojan Jovičić" + }, + { + "role": "author", + "string_name": "Dejan Simić" + } + ], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [ + { + "location": "http://www.comsis.org/pdf.php?id=nst-2003", + "rel": "article-pdf" + }, + { + "location": "http://www.comsis.org/archive.php?show=pprnst-2003", + "rel": "source" + } + ], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "CSIS_TEMPPID_61_a5", + "streams": [ + { + "location": "http://www.comsis.org/pdf.php?id=nst-2003", + "mimetype": "application/pdf", + "rel": "full-text", + "text": "Full Text" + } + ], + "title_tex": "Common Web Application Attack Types and Security Using ASP.NET", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-2003" + } + } + }, + "CSIS_TEMPPID_65": { + "a4": { + "input": { + "article": { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a4", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1204" + }, + "issue": { + "articles": [ + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + 
"lang": "und", + "pid": "a1", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1201" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a2", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1202" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a3", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1203" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a4", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1204" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a5", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1205" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a6", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + 
"translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1206" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a7", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1207" + } + ], + "number": "", + "pid": "CSIS_TEMPPID_65", + "url": "http://www.comsis.org/archive.php?show=vol0102", + "volume": "", + "year": "" + }, + "urls": { + "http://www.comsis.org/archive.php?show=pprnnn-1204": "eJylWN1u28gVvg+w73CWF20CSKRkewMnK7ObSHbjwHGM2Gm7V8WIHInjDDnszFCydlugd32AXvUuzxLsk/RJ+s0MKdM/CnZRBEY4P+f/nO+c0eTb2fvp1Y8Xx1TYUtLFx9dnp1OKhkny5/1pksyuZvSXN1fvzmgcj+jSapHZJDk+jygqrK1fJsl6vY7X+7HSy+TqQ3LjuIwdWfs5NJ4mzm0epU8mXshNKStz9AiD8YsXLwId7uIyZzloSm4ZuctD/rdGrI6iqaosr+zwalPziLKwOoosv7GJI/6esoJpw+1RYxfDw4gScLHCSp5OVXl5ekl/J3zUjeX6y+fLTPAq418+syr/8vm0WihdMitUhaONsbw0kyQQP5lIUX0izeVRZOxGclNwbiOy0KMVnxkTUaH54ijS3CT+1jj2u06JJJj0ZDJX+QZrk2lR2z6Da7ZiYTcio7PAZv3TX61S0oo6vjZROknCjV/PwEiR8/u0uViRyI8iJoO7u7XTkWsfrXH62/wEAhgJRilNvh0OKbCi4bDPv2SiuiPQQLk58xLr9NVcNTCo4HStGl0xOUlqHDTSuz+dsNa9osr5TVwXdZS+USWnmi35JGEwELfuXnUZwpCD/vI0LEjcGrCDjInSBJpX+CKYTSZT9S4pXiFRLQPJabv6FXJqJQV828o6zoVVWjBJ3f5uq4zSVjRlZ5hP7dvtHXRzxXTeCnsbXExhbwdBySq23Br2rl0Fmh5J4kNUpydKE2tsobTZGbqtSzpnbTcIX76ktZg31vN4VKkWPNQ1LCkXcVOZmGWxNsk2L5JMlUYg5S9YjRw0zbwUxuwOw+2FNuhwYiaR77cHSEutmmXx5fP7t5c7w1JvtFgW23Rrl2Q1q8wCmjhbdxDnal1JxfJAO2tXZHh2L3/6zta+yHY5m+msECvuOP7BFGp9JI2t+DpypLaAj1wsWbB1l7Mf8FgpuTceHcC8RmuAL8E/za7C6FE7r/rFTlNWgq93G/Mbov4nMPI1G3j24m8eEX7O11+Tado+tWAZnyv1KYaUJBRcW0JJBMgT5fIWcxfzuK6WETGJ3nTyOkpSau/+3tBJy+irsLWo2xwCQhtfGKbmmYMG7+5d0aqUFQuR9eoL1gUS6p89cEMPuFtIvofcbat1QO0uvT7+4+l5uPIN/k1qyiQz6OwCvf7jbErPD8fxfjx6fuBci27Snft26hGrVsbl35WvjQ4DDKmFb9BC8pzeoqHRhVZLzYDCa2ELbM03Gfft5lZoizlR+kG5DkNnvFpKbiemqdMx2h7+G9BVIbjWG3Bvarbh/UOfLMcYVui1bipu4aDeqbd
AydR5+gTaZpyuuOTIA/rvP/9NH343o5lYCY8vc52ke4cYQjiKi/KGkIvNDU2LX/5T8QG94xvJqgEFNv72z9rrHMug8wBA4/VEnnk9B8ShWDxvFfvHDwtPa4MGLht9ICeJ07DYRxPF2IUuBxftw0UpzDZdkaMjWa3yJuOmdeSAGBm1sGumOYFzyddKf/IJ5/qwKGvJS8Q9ALRzUxYi59eIlX0YvuxO+Oo2fDG9U8ZSLoxF2BthChf9BWe2Qck4uqAROU3Gz0hY49TlzhYvS4tOjwGJmMe4gQKvQYx9Qzwoau4oBSU6+cTu8DP0FMNloWCQK7FnA1oXIitoyStckXLju2lTQjWAo6GSlwrJ4zwwvfhIVmDusAWrblVw5vSF4aMuvDpqfg0UN9976r1gmndj5fDzUcOggVqbbRxybsTy6/4Hcjqp22BuBRiaMwNH+B7GXeplFjhyq0Cpci5j8okimYWLMH8Ij+yEXFLIS++NCkaAj1VusNHMjfdNhuiBmVPMY73l7U6AqUEYnTqdkOKi4shn13ccoEE508DrzHR2VcsBRi0m/YcrDIs/fx+MOJrPyi36Vsdd4ojABILygBUK9mrCLOZTbkde3stgMmGgHVBjglgwAPKZ1t+OEF1FNcAjLuXwU4VOjRyyfIlA/sS6uLTsd3CPAyrupxfNXLaYfHdcDMW7BfbksT48Go/20O2URJrSeECnHuf36Om5WvFyDtv3RqODZx7sHdT8yAHr0K0n9KW/4k9PLy/PsTwYPR+OR+NDevq+QhT4M3/oSfDm0S+pHTanvWEz2HLSSEnuIdLTvgXobrjpHkh1vvCWoLNUVTUc77mBwvfQlkDyRf8R4+9vO2o3HKGvvloxIdncQxtdzE68shNephfQze/PVNY4YKAT79xJgsPQ/oLWb9TapXSG1O30Puug+EM86NoF+gcWXXsY0HHsPfH/NLF4+xKl9oHlk7w/EV92yYggxy7E5yqmvQEd7sNn38X0tA1wnX7zJDRmOj6fkW/M/Z7edu+2p/cOfLu82+kXSkElvDrrqGNy/zBqd/rBSjHXNNJunO2tOdD7Si8RAcAVv2b0fq5Zrlbil38x2h8QxsjRCBatBF2yfECXXM8FG1Dv9SGkVS/DWPfDduKL0vs7IZ63mt7Rq0vdnOabh8lLDcZHHdKGhfc9agP1zqO7g2eGWduiAiG7BLX/0aK9aZL5ZlhlwypPDuIRxsFpe9eJc5fplQ2PGsR0eK4qt821G+ewmgEPV/66IZAj/vBw5eMPHD0LIqBgqA9fAA/4n3Uq+18d3ENPw6rhWuS2eDlqC6mbZkX8qC13jTg8vNkf+5pL0h3uxUABrEUyfPncnfYyK2TKw4xDZ2vTM/wQ4n4ZsSVm0P8BzixdQA==" + } + }, + "metadata": { + "collection_url": "http://www.comsis.org/archive.php" + }, + "output": { + "abstracts": [ + { + "lang": "en", + "tag": "abstract", + "value_tex": "This article introduces Jabyce, a software framework for the implementation and composition of transformations of compiled Java programs. Most distinguishing features of Jabyce are 1) its interaction orientation, i.e. 
it represents elements of transformed programs as interactions (method calls), which generally consumes less memory and CPU time than representing programs as graphs of objects; and 2) its component orientation, i.e. it allows for the design and composition of transformers as software components based on the Fractal component model. This latter point is strongly connected to infra-structural and architectural issues, and software engineering aspects such as composing, scaling, maintaining and evolving transformers. Jabyce is compared with other existing compiled Java programs transformation systems, using an extension of a previous well-known categorization of program transformation systems." + } + ], + "bibitems": [], + "contributors": [ + { + "role": "author", + "string_name": "Romain Lenglet" + }, + { + "role": "author", + "string_name": "Thierry Coupaye" + }, + { + "role": "author", + "string_name": "Eric Bruneton" + } + ], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [ + { + "location": "http://www.comsis.org/pdf.php?id=nnn-1204", + "rel": "article-pdf" + }, + { + "location": "http://www.comsis.org/archive.php?show=pprnnn-1204", + "rel": "source" + } + ], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "CSIS_TEMPPID_65_a4", + "streams": [ + { + "location": "http://www.comsis.org/pdf.php?id=nnn-1204", + "mimetype": "application/pdf", + "rel": "full-text", + "text": "Full Text" + } + ], + "title_tex": "Composing Transformations of Compiled Java Programs with Jabyce", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1204" + } + }, + "a7": { + "input": { + "article": { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a7", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + 
"translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1207" + }, + "issue": { + "articles": [ + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a1", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1201" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a2", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1202" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a3", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1203" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a4", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1204" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a5", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1205" + }, + { 
+ "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a6", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1206" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a7", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1207" + } + ], + "number": "", + "pid": "CSIS_TEMPPID_65", + "url": "http://www.comsis.org/archive.php?show=vol0102", + "volume": "", + "year": "" + }, + "urls": { + "http://www.comsis.org/archive.php?show=pprnnn-1207": "eJy1WNty47YZvt+ZvMMf3nR3RiQl2012HZnNWrK7yvigWdlpc9WBSEiEDQIsAEpWp/d9DT+LJw+WHwdK9EGZ5KKzF2uA//H7j9Dw2/H16OaX6RmUpuIwvT29mIwgitP0H4ejNB3fjOGfX24uL2CQ9GFmFMtNmp5dRRCVxtTHabper5P1YSLVMr35mj5YKQPLFv6MteNJClNE2buhU/JQcaFP3hAw+PTpk+dDWiSmpECeihoCljim/27Y6iQaSWGoMPHNpqYR5P50Ehn6YFLL/APkJVGampPGLOKPEaQoxTDDaTaS1Wwyg/8C/lE3hqqnx1nOqMjp0yMRxdPjRCykqohhUuCnjTa00sPUM78bcibuQVF+Emmz4VSXlJoIDNoR1OdaR1AqujiJFNWpoxok7tYakXqX3g3nstjgWeeK1aYr4I6siL+NQKvci1n/519GSm5YndzpKBumnuKPC9CcFfQlb8FWwIqTiHAPd3u2NlLlojXI/hxOyIBOoqAMht/GMXhREMdd+RVh4plCjcbNidNYZ5/nskGHSgp3slGC8GFa44eGO/izIQnwMlHQh6Qu6yj7IisKNVnSYUrQQaR6TmozhGAOOuKRPwDbObCHjbBKe57P+Beg26BzWe/T4gxiYulZJuH0B/TUkjPENug6K5iRihEO7f1+r7RUhjVV65hL7d31Hr65JKoIyn7yEIO/28NQEUGWW8cuw8nzdFhSF6I6O5cKSGNKqfTe0G0hacHaXgD+5UpasXljnIw3jQrNQ96hJ9UiaYROSJ4onW7zIs1lpRmm/JTUmIO6mVdM6/1h2BGEoCOIOcd8333AtFSyWZZPj9c/zfaGpd4otiy36RaOYBQReoGWWF/3MBdyLbgkhecdhxNomr/Iny7YyhXZPrCJyku2olbi33Qp1ydcG0HXkWU1JWJkY0m8r/vAfiVjJfnBoH+E7jVKYfMFxKfZVxgdbouqO+x1ZcXoer8zfyLqP6MgV7NeZif++g3lV3T9ezp1mFMLktO5lPcJakl9wYUSSiNseaxa7nruYp
7UYhkB4Tibzk+jNINA+xcN50HQ77atRR1yCDu0doWha5rb1uDg3hctIQ1bsLxTX+idZ4Hut1cwdBp3aMkvOncYtbZRW6LTs79PrjzJN/hvWEPOicbJznDW345H8N1fk/7AwoqTpP3mRmmU3ci1bTlA4JJirygkl8sNGAln2jBsBRQntDYgF3A9v8P0j68Vjh9DC5jJhUFeCmO6olzWlc2/qZKWyo+gnSGhD0XZV1JIzckKLhP4Su7liv36v6Fu6gzNs/85IyXPLJBngqqlrK3EexOfoVKOxiRwYYpkOFdpdtpwukJwLllJGCcwbWomCAwOenBK+VJhOfZgRtWcEUevKuVU5j9S3GGsPJs/DvphapWWhzj2cFHCuYQOHKID2azJc6r1ouHg+i91biIcBHQLQO2d7sGc2nhZMNeU87igCyYQqfAdig5QeGfl9rAscJlSlkkAJYpvgLbIo5aWFe2sOQ4ys0lgIlBBjeWETtumUVt7WY6+bi3qasoxfhoa3WD2bqAkK4p2UoE0uE5UzkAUh0RUMRx4wBKaAFkY26tfeAjMWmGl+qSGie04aLDGK6yHHmBTQtWqZzPoXsg1dmpHEDyydlslPafFAobVRK13ld1pWmzXUt3rBG4sb+2mBrprdTiUiMElxy5b0g1Zoxq0jEDVzd+SGJTS8AKoIHNO30Z3L16uYTFUp1wIK6e6aJSFGxuYxphZ0mcBLYmmzmj6zBTM/6by4ZW+gmRbQaRGXElewhxZC3BzjcKtwM6Ax0tZUBffCyKWDWYevL+9vPjgTGtp9tWgTS0kn42nH1yQcJVDUd6KW401jQphrHAAiB64UWDQskbReETtyM97yEUVRnlFncKJyJUPPN8hGHLYo507tDVbCtfYhEGsMV2UDBK0zR222IQwOJRbSQG2EGT0qwsg5gDjvLFlaTxK9IHYPLQRtIBNr2c3b4cypG3iW99hNm3mPDTd5/ugr/Vt507fGrT9Qf8Ax5nkGE4YIECukR/A+yt0sZpjkh70+0cfXDe33eYXdNSa2FF67Ejc18lsdoXHo/538aA/+Ajvr4WN0Af30bHgo0YdQ9gmR51t0vty3nAO9qXRsT5023Z7aV9AdbFwnuDoEELEg4P+9/gAsUMyMHC66L5SHP12ZLbbDw7Oz7bluHJiAqbjc2fskFbZFG1z92OZNw76cwfuMMWPfr55q7/YloB1i/nW2t2OgR58TXAsHMP/aSQl27cmhCdUyOzdzhveTz3AKCc2xlc4bnCaDL4/jAefjhJ4H0JcZ9+887MXzq7G4GZvd2yHAR3GdueDfXO9GOYLKdEmfFjWUSvk5cco3HTDleHq0nCzsTgEf9DwG7XEGGiE944gPDgCHbYEDnuAm2K/jy6tGMxI0U7GHnQeGIwbeew3tx+3S12UvbzxEd1Z+syuNnkLmG9epy80uCEqnzjEP+GxOqjQNHq+W2K/ce3Hzgbkdr9LBEqdzjexyGNRpEdJHze+UaC16iwxfDb+3YJBja+ksNdU2Y0NT2OccytHrgHZMQEQYeESAJvbhVeBBvoKcSXwSv5Fa7L7YcG+5RR6Fa9ZYcrjfiildmFlyZu+PHfi48eHw4GrujTbAy8+DDC3MRmeHtuvnczymfI643Dqh/T0v3XYHz9MhWvmb6ifUNo=" + } + }, + "metadata": { + "collection_url": "http://www.comsis.org/archive.php" + }, + "output": { + "abstracts": [ + { + "lang": "en", + "tag": "abstract", + "value_tex": "Successful management of a software project, besides a well-defined project development process, requires an early estimate of project complexity. 
In a prevailing practice, software development costs usually have been determined a posteriori i.e. after software project implementation. It is essential, however, to know this estimate a priori, i.e., before commencement of works. This paper presents an attempt to construct a methodology that would enable an early estimate of software development cost and its refinements during subsequent development phases. The methodology assumes an object-oriented approach based on the Unified Modeling Language (UML) and Unified Software Development Process (USDP). It outlines an Use Case Driven, Architecture-Centric, Iterative and Incremental estimate process that could significantly improve and simplify early cost estimates. The presented methodology is illustrated on example of the POST software development project." + } + ], + "bibitems": [], + "contributors": [ + { + "role": "author", + "string_name": "Radoslav M. Raković" + } + ], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [ + { + "location": "http://www.comsis.org/pdf.php?id=nnn-1207", + "rel": "article-pdf" + }, + { + "location": "http://www.comsis.org/archive.php?show=pprnnn-1207", + "rel": "source" + } + ], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "CSIS_TEMPPID_65_a7", + "streams": [ + { + "location": "http://www.comsis.org/pdf.php?id=nnn-1207", + "mimetype": "application/pdf", + "rel": "full-text", + "text": "Full Text" + } + ], + "title_tex": "Towards a Methodology to Estimate Cost of Object-Oriented Software Development Projects", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1207" + } + } + } + } + } + }, + "parse_collection_content": { + "CSIS": { + "CSIS": { + "input": { + "http://www.comsis.org/archive.php": 
"eJy1W91u47gVvh9g34Grm3aB2KJsx3amidtsktnNYpIJ1ului6JY0DJtc0YWtRQVj4ve9zXyLEEfrIekbMuOaVOKitxYEn/Oz3cOD/mF599ef7p6/PvDDZrJeYQe/vr9x9sr5DV8/9f2le9fP16jv/34ePcRBU2MhlKwUPr+zb2HvJmUyXvfXywWzUW7ycXUf/zZ/6pGCVS3/Gcj1X2aYzn2Bu/O9SRf51GcXuwZIDg7OzP9oC00pmQMfeZUEqQaN+jvGXu68K54LGksG4/LhHooNE8XnqRfpa86/wmFMyJSKi8yOWn0PeTDKJLJiA6u+Hx4O0T/RvAjySQVL8/DkNE4pC/PJB6/PN/GEy7mRDIew6dlKuk8PfdN53fnEYu/IEGjCy+Vy4imM0qlhyTIkU8fpqmHZoJOLjxBU1+3Cpr6rRLCNyq9Ox/x8RKe01CwRBYH+EyeiHnroVSEZpjFv36TnEeSJc3PqTc4900L9wHSiI3pbt8xe0JsfOGRyJh79axkpEJ7KxiUsxN0ACVhoAE6/7bRQGYo1GgUx58TFm9NmIJwI6JnTAaXI56BQjOKPvNMxCQ69xP4kEXa/INzkpuXxWP6tZnMEm/wI59TlJApPfcJKAittpsqhBDAoG58ZR4Q2yhg6UbYPDV9LuEXArVRGvLENosWiMVT0+U2f3KYJ+ERA9vmc92MmeSCkQit3tu1SrmQLJuvFNPQ3ry29BtxIsb5ZD8ZEyPzztJhTmIyXSt2lz+ZPoUuvnZRMvjABSKZnHGRWl23NsnKWOsXCH7pkBZslEk9xl6h8uTBP4Mm80kzi9MmCZsi9de48EM+TxlA/oEkgME0G81ZmtrdsGmQOx2MGEaA980HgKXg2XT28vzpp6HVLclSsOlsDbf8EUlB4nQCkihdLZ3HfBFHnIxN3+v8CaU03MFP0dhCB5nN2ESEM/ZE1Yh/Tmd8cRGlMqYLT3WVM7CR8iUxutqM/WqMJx61AtwB9TIhIPkisE9mC4xCb2VV/WBV5YnRhV2ZEl7/BQbSMWvGLPg/3TP5PV0cmjPN16kJCemI8y9NmMU3AZeHkO9BymPz6SbnTkbNJJ56iESwNn343vMHKG/7hxR9yAc6mLYmSY4hyNCpDow0oaFKDdrcNm/FXLIJCwvxBdqZLqj47ZUZCok7T8k7mTtfalWiVo2+v/nh9t40+Qb+1HKxdi/8Pp+1B7/wKIPk3ArQH1u41fkOPrQHYG8QNoxIqqoANh7TeLVo+haoBaBENh9B9AQn6CcSZ0QskRpRKzES/uAfaG2AA4MoKKguUAFs8u94oluChqbRKgp1w38qWFSQuLWWuHWCLhPBIiOvkvWczgfD3JW32i+QW64pTdBHSkSsIvKRhrOY/Q5efnlmMboFw0cRm6pYU79FTCXiE/Q4g8apXpPR6Q+qqplnce5fdE/lgosvgBOYr4yhWi6GatVkqPbaUG1wbRbT0n5tF8StKkVnLUXnBA1pAvWM9l1JUTqvRCkEAdZB0K4YBNgaBG13CbFLEOCaggBbgqBtD4I7AuMABFZx0BiRlEK5eQ3t9Ao8zJIEFi6Ul5xIxQZ/ROmqAi2FdOyCdFwT0vF+pJdyXttF3HZN4tpDopTMHReZO3tl3sROcKZjp1UtdoIza+y0nFUxgxxRxTR6u/lhnK3YWaGllLjHwW0a1SLuFri30FJK5uMIN40Oo6Wv0RJUREvfipbAXZW+C1r6NaGlb8m0gT3T3syp0Pu4IRVPLMzrDLXvvqdfZeMHGlNhyohf6ej9y/OPGewD0R2lMkVqawRVpB5vXZiEtFz2NUIft1A9AO3vz76lHOqAzX492RfGsWffUjIfz76m0eF46ul4whXjqWeNJ+yuSs8lnno1xVNvf/YtJa4DuHs1gbu3De5PoeQ5VkpJ7IDv3vHc21VYCc4qYqVrw0pw5q5K1wUr3Zqw0t2LlXLiOmClWxNWujasgMSrxeJnGqqN5uX4iUBaN0V28bz3QXB4neq
TI9h4DmmYCSaXJfN/1wVw3eOAO9WA61cE3KkVcH13VU63AFfNMaf7cVRKilYNUtjgUUqQ11vygsM62mG9ig7rWB3Wc5ewU4PDOvsdVkqKtzusY6+8QZRVRFtC2Rwz8YhPlyewyWapOXanYx3Xd3xMI3QtYM4YDavssI10x2K8czzG2xoy3YqQaVsh03VXpV0DZNr7IVNKirdDpr0NmXv+tEZM99WGga02DI+CxmMFoJfnIZ/IBRF0Ax9NU5VCRvtwjmhph59WdHjL6vBTdwlbNTi8td/hpaR4u8Nb2w6/zKZZKo0c1v3hFY8iMuJqD/hEEW3kR8uyvKdbbz+ghTG29kNFyB7Q4XUds0Vco00dc4IeiCBRBPlO09vFVGgocGhUWu9Dp8GBpkSCipRIYKVEAvfz6iCoAeHBfoSXksKh/K2JbwgCWygcYGbu+IhFtBgRxZWT0Zx/WfMyqxOVkngJXNbKoKYDhh3Go1DnHTBEsYrI64ETUyWoEHp5/kjiaUam1PyzxKViq8ob4WDQaAolqEihBHspFKWz+4F24MKgBDUxKIGFQQkOMChrH6mN2mvkQi6vVsm5cCVBTVxJsJ8rKeemty862BojLvZn8VYB/fJ8E09ZTKnQy00NoeJCrQTHqRXNrAQVmRVsZVYC92N/7MKs4JqYFbyfWSkn7vFQwDUxK/gAswIyO8Fw8z98a9Rt0nZl/GEXsgbbyJryZtiKxWuo7ByssFMAaopjxRybJTsvBsPyyh8PPtPoYPBpoiqoSFRhK1EVuHME2IWowjURVXiHqLojubT7zik+KNdEyIrZJIny/3Qp6zwX5gnXxDzh/cxTOQ85BFpNzBPuWxe9jZeulvCq8TBbpuCAaP1/RuYwOD8bKO0Sh3g6zlRpoiqoSFThHaLqAx2JVUDhvRBdZVZ0GbN8T6lscM2mTIJhbtSOQBIWzyELlTSIC9+Fa+K7cM9SZ+7XuhCRP9OIqD1ycTf01uh0oc5wTdQZ7r2KTqN3CWkdgtNGm5WX1roKlhLZIdZ6R2NNE324ItGHd4k+bXjszprh7tsPL/AOd1ewZzlJXh/Rbcyk6SlckZ7Cu/SUMZM7F4NroKbwqd1M/c2iALkQNpvi/5QV9pFbGyNrSglXpJTwLqVkjOxO5OAa6CTcsRt5P4/zIPhUkLn+B/51Si5p1X0M1MaqmnXBFVkXvMu6GKu6cx24BsYFt+1WLSXJITNprgJX5CrwLldhzOTOEOAaeArcspuplCSHzKQPvHHFA28cWGsz7H7WjGs48cY7J94FLqKcJK9s9c07c68C3dxfI32vonglI798kV/JKHxQ9+l2LmpMOIfK8zfJE281yO5HL3+TKx7RifQGH0iYRXKpbhbk1/0gmz+KKbrmKZP0M0GfRoKM+RP773+IqpdaAcZYmYChIRmr4wkxYuRkozwIF0n+3tzK+cv6wo432H2jLVeQdEuuh2wUsXQGS8poifKLblfri24oi8dUmBxJzPVMWGVonNKVI/N7Q6Gg+ggU5p5Db33nNG+Z+qNlIw4b8djvNLHvDa7ytvpOBTRGl9IwQrB2Ne55rF5ToY4b4OmaCvakm6cIuudUgF7n1A7WTAECDvQVIX0j6NX4H1ci60uj6p6eAK0aCzaWs/c4v8y5uozEmnt12Vai3//aDvQlJH9gMW8YUdirARhenldfC8gySHmNOBJFOTzNPVZ1sVXOo8G7/wErYK5C" + }, + "metadata": { + "collection_url": "http://www.comsis.org/archive.php" + }, + "output": [ + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_0", + "url": "http://www.comsis.org/archive.php?show=vol2101", + "volume": "", + "year": "" + }, + { + "articles": [], 
+ "number": "", + "pid": "CSIS_TEMPPID_1", + "url": "http://www.comsis.org/archive.php?show=vol2102", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_2", + "url": "http://www.comsis.org/archive.php?show=vol2103", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_3", + "url": "http://www.comsis.org/archive.php?show=vol2104", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_4", + "url": "http://www.comsis.org/archive.php?show=vol2001", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_5", + "url": "http://www.comsis.org/archive.php?show=vol2002", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_6", + "url": "http://www.comsis.org/archive.php?show=vol2003", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_7", + "url": "http://www.comsis.org/archive.php?show=vol2004", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_8", + "url": "http://www.comsis.org/archive.php?show=vol1901", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_9", + "url": "http://www.comsis.org/archive.php?show=vol1902", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_10", + "url": "http://www.comsis.org/archive.php?show=vol1903", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_11", + "url": "http://www.comsis.org/archive.php?show=vol1801", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_12", + "url": "http://www.comsis.org/archive.php?show=vol1802", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_13", + "url": 
"http://www.comsis.org/archive.php?show=vol1803", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_14", + "url": "http://www.comsis.org/archive.php?show=vol1804", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_15", + "url": "http://www.comsis.org/archive.php?show=vol1701", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_16", + "url": "http://www.comsis.org/archive.php?show=vol1702", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_17", + "url": "http://www.comsis.org/archive.php?show=vol1703", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_18", + "url": "http://www.comsis.org/archive.php?show=vol1601", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_19", + "url": "http://www.comsis.org/archive.php?show=vol1602", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_20", + "url": "http://www.comsis.org/archive.php?show=vol1603", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_21", + "url": "http://www.comsis.org/archive.php?show=vol1501", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_22", + "url": "http://www.comsis.org/archive.php?show=vol1502", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_23", + "url": "http://www.comsis.org/archive.php?show=vol1503", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_24", + "url": "http://www.comsis.org/archive.php?show=vol1401", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_25", + "url": "http://www.comsis.org/archive.php?show=vol1402", + "volume": "", + "year": "" + }, + { + 
"articles": [], + "number": "", + "pid": "CSIS_TEMPPID_26", + "url": "http://www.comsis.org/archive.php?show=vol1403", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_27", + "url": "http://www.comsis.org/archive.php?show=vol1301", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_28", + "url": "http://www.comsis.org/archive.php?show=vol1302", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_29", + "url": "http://www.comsis.org/archive.php?show=vol1303", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_30", + "url": "http://www.comsis.org/archive.php?show=vol1201", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_31", + "url": "http://www.comsis.org/archive.php?show=vol1202", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_32", + "url": "http://www.comsis.org/archive.php?show=vol1203", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_33", + "url": "http://www.comsis.org/archive.php?show=vol1204", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_34", + "url": "http://www.comsis.org/archive.php?show=vol1101", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_35", + "url": "http://www.comsis.org/archive.php?show=vol1102", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_36", + "url": "http://www.comsis.org/archive.php?show=vol1103", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_37", + "url": "http://www.comsis.org/archive.php?show=vol1104", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_38", + "url": 
"http://www.comsis.org/archive.php?show=vol1001", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_39", + "url": "http://www.comsis.org/archive.php?show=vol1002", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_40", + "url": "http://www.comsis.org/archive.php?show=vol1003", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_41", + "url": "http://www.comsis.org/archive.php?show=vol1004", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_42", + "url": "http://www.comsis.org/archive.php?show=vol0901", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_43", + "url": "http://www.comsis.org/archive.php?show=vol0902", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_44", + "url": "http://www.comsis.org/archive.php?show=vol0903", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_45", + "url": "http://www.comsis.org/archive.php?show=vol0904", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_46", + "url": "http://www.comsis.org/archive.php?show=vol0801", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_47", + "url": "http://www.comsis.org/archive.php?show=vol0802", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_48", + "url": "http://www.comsis.org/archive.php?show=vol0803", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_49", + "url": "http://www.comsis.org/archive.php?show=vol0804", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_50", + "url": "http://www.comsis.org/archive.php?show=vol0701", + "volume": "", + "year": "" + }, + { + 
"articles": [], + "number": "", + "pid": "CSIS_TEMPPID_51", + "url": "http://www.comsis.org/archive.php?show=vol0702", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_52", + "url": "http://www.comsis.org/archive.php?show=vol0703", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_53", + "url": "http://www.comsis.org/archive.php?show=vol0704", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_54", + "url": "http://www.comsis.org/archive.php?show=vol0601", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_55", + "url": "http://www.comsis.org/archive.php?show=vol0602", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_56", + "url": "http://www.comsis.org/archive.php?show=vol0501", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_57", + "url": "http://www.comsis.org/archive.php?show=vol0502", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_58", + "url": "http://www.comsis.org/archive.php?show=vol0401", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_59", + "url": "http://www.comsis.org/archive.php?show=vol0402", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_60", + "url": "http://www.comsis.org/archive.php?show=vol0301", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_61", + "url": "http://www.comsis.org/archive.php?show=vol0302", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_62", + "url": "http://www.comsis.org/archive.php?show=vol0201", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_63", + "url": 
"http://www.comsis.org/archive.php?show=vol0202", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_64", + "url": "http://www.comsis.org/archive.php?show=vol0101", + "volume": "", + "year": "" + }, + { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_65", + "url": "http://www.comsis.org/archive.php?show=vol0102", + "volume": "", + "year": "" + } + ] + } + } + }, + "parse_issue_content": { + "CSIS": { + "CSIS": { + "CSIS_TEMPPID_61": { + "input": { + "issue": { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_61", + "url": "http://www.comsis.org/archive.php?show=vol0302", + "volume": "", + "year": "" + }, + "urls": { + "http://www.comsis.org/archive.php?show=vol0302": "eJy1WN1y27YSvvdM32HLm9MzI5L6SeufyjzHtpzYru14IrVpe+ZMByIhETYJsAAo2Zlz36u8g/MAfQknfa8uAEqi7chjxzqTGUcAdxf77X5YYNH9uvd6b/DL2T6kOs/g7Mfd48M98PwwfNvZC8PeoAc/HwxOjqEVNKGvJYt1GO6feuClWhdbYTidToNpJxByHA7ehJfGSsuoVT99ZXWCRCdetNa1i1zmGVfbnzHQ2tzcdHooi8KUJKiTU03ACPv095JNtr09wTXl2h9cFdSD2I22PU0vdWiUv4c4JVJRvV3qkb/hQYhWNNMZjfZE3j/sw/8AfxSlpvLmuh8zymN6c014cnN9yEdC5kQzwfHTldI0V93QKa91M8YvQNJs21P6KqMqpVR7oNGPavlYKQ9SSUfbnqQqtFKtwM4aJ0IHaa07FMkVjlUsWaHrBs7JhLhZD5SMnZnpu9+0EJlmRXCuvKgbOonHG1AZS+hd3YRNgCXbHslcuGdj4yOVNlut6GlxQgUEiYYi6H7t++BMge/X7eeE8VsLKnRuSOyKRbQzFCUCSimci1JyknXDAj+UmQ1/1CVVeBlP6GVQpIUXHYicQkHGtBsSBIhSt0UNQwhy0ArvuQGwBYAlaoTlyuns4C9A2KBiUSxbxTrE+NipHFajR6xTiIxhbKu19hOmhWQkg9n8clRKSM3KfAbMUnsxvURvKIhMqsWOXIjBzS1RyAkn4zmwk2rkdGoqoU1REb0UEkipUyHV0tTNQzIL1nwC8Jfd0pINS21tfNapqniIc0SSj4KSq4DEgVThnBdhLHLFkPJnpEAOqnKYM6WWp2EhUCUdgxhnyPfFB6SlFOU4vbl+fdRfmpbiSrJxOqdbNQQtCVcj9MRgXaKciCnPBEmcbq8agaLxHf7Ugy3tJlsWbCLjlE2osfgvlYrpdqY0p1PPqOoUY2RySRzWZcG+Z2Misnar+QLhlVJi8QWMT7lsY9S0TVTtYCmUCaPT5WCekPWf0JDds85mLf/qM4uf0ulDa6rqnBqRmA6FuAhwldBtuGoLhR6WPJaPFzV3NAwKPvaAZHg2vdz1wggq2X8oeFkZerBsjYqKQ1ihld0YqqCxKQ023MuyxYVmIxbX9heicypQ/3YvDLXCXZXkO5W7OmpNoTZCu/uvDk+dyFf4zxwXP4msxGLcacChXa8N3/RoTPMh8r7dbH73T3tGdNN2dHONP9sRxn7ueHiPZxidZqfZrlVF53RhTRzyCdM0AbvBVc1cnBFlLh
gsSSj3llsvJFfab79ornvRjwpdPJNixDKzI0y0zSH0lg67QxlGXZpHJywl8Ep+ek9kA3olPydwkuHe4+zTe0u1EyIvBEqIIc04u+iGqGQdNhb+Aw8Brbti+GrU8JqyOCSSkZXGNCwEZ+ViIVwhjzEuXmT+2k//dRBUQXhNwpLAc9PRDNZJ0FiAQpQ4nOMxX7dgeaCC+a0KqsuCjUq9ulc3hQYgUQJDk1MRQLsBLb+9GcA3FUWsR/P/ii/KagcLFPJmQrLSLS3QFeMfkUyZ4Qj61CjDPsfzjCoEZvAYLMf9MziheIQl89wfiXcCc36eiwmLLawDwlTKYJewp+fZOfeIPDvBFeZ5BqEBR5hadL4BB5jU58bpGanvtPxvv3sg97ir39BxmWE1euZGb603O3gC3UFngOySsb3TGJ/tWAilzYRDV8eM5YzZu8BAUgzGnvEAS6rxa8aVH9C2/vgnx/JA4lRMPn7AncOkUBmZwFmp3mnSgJd4HdDsrw/0AlfHzffxw4Vd/oyaIO7ShH/8IJ/OLIfxEcxygitk1gLrD8isGcwTyzKHD0GbkYOGIsi7/1synkHIb9f99fZqi1HzRWsT73MEG9kTxo33AxqnnP2OpzjiVXiIoUOvJL2qvIJBSvG8o+7YHzA8U/vY/Vocxsyca8eshF3GG/BrStDsQYkb2sz12YhWMeyZD78QMS7F0/nkPH8En5zgCvlkUR0jYSymX/GHw+QmHR5zQG3BCuP6DNqsd/yNFdMGTXVsY5fjunjAwk5RZNXtDXY09rEXYB5BXHPap3Epmb6q9sdO/yw43R/MmbIrzjHYR1j8P71nn/5w1KBmrs9ynPiC20rzkbXGCa6QG3MUWE4sMXL725DhedF6Rv43Ov7mQ+fYl5SNdXP3PRAZnid460A8UmCVBS0QX2EuYVrEAtsBXsFBIG+ZpNjJKTileirkhfFdEqVlGetS0jkd3pAEez97k0sIN/FcBSWcx48pF1ZwhZSo4WjAmzukWGkIn8GRzXW/1eosJ8lXa66zgv3THtjOqt6UVe1X1ZTVPpgXtTut2kgIdPA3LQpvZuTuR6+aqQKa0RF2dtiYlhnuCnPtc+AQxUCOoScUZgNbntdDidQxcSYGGV5Qm03EN2HQJwmGncohw4O/9nzEMi22XF/+73nLjtm9M+Nau4Wnt/w6K4eYwhQL+vAKqqeuvflTF5TY/0tHEuIeaHHnU66od/vlIJYUMzShsa0Syr46V5IqHF75PPZ5Er4ImtjP71WyVUlRpobYVynMsH8quJmm0vTjOOpRySZWXAGqIxswwtyyAfv1Y7cEOhjZRwL7JnDP/vHMZftsbF7qJKLypyzR6Vazes6dPUew4LNYboPY2LjstOwzRBgtCW+c4e3ekME05O5rjVmOKfcZR7Ksoqd7yTZP2zrPorW/AQf0LCc=" + } + }, + "metadata": { + "collection_url": "http://www.comsis.org/archive.php", + "issue_url": "http://www.comsis.org/archive.php?show=vol0302" + }, + "output": [ + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a1", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-2407" + }, + { + "abstracts": [], 
+ "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a2", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-3210" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a3", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-1703" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a4", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-0419" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a5", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-2003" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a6", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnst-0702" + } + ] + }, + "CSIS_TEMPPID_65": { + "input": { + "issue": { + "articles": [], + "number": "", + "pid": "CSIS_TEMPPID_65", + "url": 
"http://www.comsis.org/archive.php?show=vol0102", + "volume": "", + "year": "" + }, + "urls": { + "http://www.comsis.org/archive.php?show=vol0102": "eJy1WW1v4zYS/h6g/2GqL+0BlmQ7ySa75+huE2ezCfKG2Lu9w6EoaIm2mFCkjqTseNHv/RvpXwl6/+uGlPySFwVp7CJAIlEz5DwzD2eGTOf77sVB/9+Xh5CajMPll/3T4wPw/DD8afMgDLv9Lvzrc//sFFpBE3pGsdiE4eG5B15qTP4hDCeTSTDZDKQahf2r8NbO0rJq1aOvnU6QmMSLNjpukduMC733zASt9+/fl3ooi8KUJKiTUUPACvv0vwUb73kHUhgqjN+f5tSDuHzb8wy9NaFV/jvEKVGamr3CDP1dD0KcxTDDaXQgs95xD34FfMgLQ9X9XS9mVMT0/o6I5P7uWAylyohhUuCnqTY0052wVN7ocCZuQFG+52kz5VSnlBoPDNpRLR9r7UGq6HDPU1SHTqoVuFFrRFhC2ugMZDLFdx0rlpvlCa7JmJSjHmgVl9NMvv1ipOSG5cG19qJOWEq8fgLNWUIf6yZsDCzZ8wgv3T17tzZS5aLViv6cn1ABQeJEEXS+930opwLfX54/I0w8WFCjcQPiVsyjjwNZIKCUwrUslCC8E+b4oeDO/VGHVO5lIqG3QZ7mXvRZZhRyMqKdkCBAlHooahlCkINO+KB8AbYAUKNGWKZLnY/4BAgbdCzzulWcQUyMSpXj6u0V6+SSM/RttdZhwoxUjHCYjdej0lIZVmQzYI7ai+EavYEkKqkWOyldDOVYjUJGBBnNgZ1Vb6XOkkroQpRHn6QCUphUKl0burlLZs6aDwA+uS2t2KAwbo5njaqSh7xGJNkwKIQOSBwoHc55EcYy0wwpf0ly5KAuBhnTuj4MC4Eq6OjEmCPfFx+QlkoWo/T+7uKkVxuWfKrYKJ3TrXoFo4jQQ7TEYq1RTuREcEmSUrdbvYGm8SP+LDtbuU1W52yi4pSNqZ3xHzqVkz2ujaATz6qaFH1kY0lKrHXOfjLHWPJ2q7mF8AqlMPkC+qeo2xhL2tar7qUWypjRST2YPxH1rziR27PlnEvx188sfk4nL62pqzo1JDEdSHkT4CphueGqLRR6mPJYNlrk3OEgyMXIA8KxNn3a98IIKtkfNHyqJnoxbQ3zikOYobXbGDqnsU0Nzt110RLSsCGLl/YXoitVYPnbEzcsJe4qJT/K3FWptYnaCu0fHh2flyLf4Y8tF18lLzAZtxpw7NZrw4/nckyzAfK+3Wxu/c3ViE7aju7v8LEdoe/nhodPeIbeabaa7aWsWBqduymOxZgZmoDb4HppupgTbRsMliRUePWz50oI4bfazZYX9bHiHBTcFIqCHOKcBqsecmWgwqhDs+iEZFTDRYCdQ86xEnZCHHTWWIn/wEsoltexZLRq2IMsKkAydNLo44XgLBcshCtYMYL2Ivvbffq5NFHnRCxJuAh75XBUmdyAkwARfIAarMG8K4Kq2LsNtJydq0rfAAx0YMN8LgNoN6Dlt98F8GMVYrfo/E/+1qi0LfU1dVZx6ta39vRYxjhRzEyxuIKV8PfxVwJXlGgpMJ/Ng/YZU67fZdRC2i/UTepK1hsC135t4NprDdzM5gZ8xrC9zRkrRLW9429vvxBW3HBXdFTg+mvYg5tYHOCMYuOQOAMJ9LHndWnviuQMt7mSRmKnawsWsvaUqBH1ezHhFGOrmaBaP4dqToYjqRJsXuCMcanJmF9jYYjdUvtYmm8kXFLF/vc7++O3N1Fk87UU2VwrRR6gacBR0FjgaMA+0uYv8eoKrNre8Xfba08WW64DzqW2QPqu15pZoi0w+5Fx3BgneECyqEeKYFc/YSbFocE0pnOiXEl7RIFTKka40xqYLhlVaopTFDmZlkAP8VSLvCkENfJttWDrtXzZWitf5qi
ukCkVIoSILzM0DTi0yWY1Z67Aj91NRP1S2nkbQbbxJEDxWAHn1CAI4c6YPYqnG+x94Sc6AB+OCux7uCX9zFpFtFFFbEvlnB9dRUbo4CPMs+O3Zovt10Z/e63Rn9ncgC7GeDV/rNIuYGVpbbfWHuN32CkOh/a0NKYucyFXLawvGo08xuZV2Q4eusp+r6yDLtVsJFDGkv3L2ek8zj1Dx+jKMywb/BqbJ6lpniJW8u0bVdKB/coJ1hT8hILyzWR491oyvFtv6aiA9XDzz1DZHrGxhAdDh0RZ3amrkGUbE8LO+ivGDjb9cmIvPrAqliVScjmagpFwqA1Ds2y7pY0FfDG4Rg/4Fwptt2eOnhwa1EXYdEy5zDN7CsZEaKUWLccVSaTmZAxnAZbbm7czZOe1DNlZK0NmNttqgRjw5PDXOGwVduwgO95v1bPju43ytAqH511wp9Xlg251pK0OuksfXAvw8Pg7lBIt/MXI3JtN8vijV41U/uR0iKdlPOzjYWtqvVKhQxh9NYIuFlhDrwk6SyFTrKcJbGLf3Wo2mwhwzKBHsPnvUTVgpLGIOxrHjfxQ3nX8c34NgsF9NFIelxeWPrDrshhwplMMzsA2N+768GB+fQiFSKgqOULKS2/OYio09R7exsSKEpsbcO0Mtd1NfiWpw8HUF7EvknAraIbYolWydjkrDB9NedOHIfbPpbDDVNk7DnzrYh87duIaUL3MNcLRgXA4LZdAAyN38eLuWZ7Mfzoz2V3F29tPhaj8CUtM+qFZXZHPrnhY8CyWhyB2d283W+5qJ4xq3BtzikxHMthLjvLrErNKpjxlHOG8omf53wH77wKT8Wjj/wy/m0U=" + } + }, + "metadata": { + "collection_url": "http://www.comsis.org/archive.php", + "issue_url": "http://www.comsis.org/archive.php?show=vol0102" + }, + "output": [ + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a1", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1201" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a2", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1202" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a3", + "streams": [], + "title_tex": 
"", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1203" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a4", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1204" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a5", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1205" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a6", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1206" + }, + { + "abstracts": [], + "bibitems": [], + "contributors": [], + "date_published_iso_8601_date_str": null, + "doi": null, + "ext_links": [], + "ids": [], + "kwds": [], + "lang": "und", + "pid": "a7", + "streams": [], + "title_tex": "", + "trans_title_html": "", + "trans_title_tex": "", + "translations": [], + "url": "http://www.comsis.org/archive.php?show=pprnnn-1207" + } + ] + } + } + } + } +} \ No newline at end of file diff --git a/src/crawler/tests/data_generation/generate_test_data.py b/src/crawler/tests/data_generation/generate_test_data.py index 21f68dd..7c037ca 100644 --- a/src/crawler/tests/data_generation/generate_test_data.py +++ b/src/crawler/tests/data_generation/generate_test_data.py @@ -224,6 +224,8 @@ 
class TestDataGenerator: def generate_parse_issue_content(self, crawler: BaseCollectionCrawler, issue: IssueData): """Generates test data for an issue""" + # We save the pid because parse_issue_content can change it + issue_pid = issue.pid original_download_file = crawler.download_file def patched_download_file(url: str, *args, **kwargs): @@ -231,7 +233,7 @@ class TestDataGenerator: if not isinstance(crawler, DmlplCrawler): content = html.unescape(content) self.JSON_DATA["parse_issue_content"][crawler.source_domain][crawler.collection_id][ - issue.pid + issue_pid ]["input"]["urls"][url] = base64.b64encode(zlib.compress(content.encode())).decode( "ascii" ) @@ -244,11 +246,11 @@ class TestDataGenerator: content = crawler.download_file(issue.url) self.JSON_DATA["parse_issue_content"][crawler.source_domain][crawler.collection_id][ - issue.pid + issue_pid ]["input"]["issue"] = { "year": issue.year, "number": issue.number, - "pid": issue.pid, + "pid": issue_pid, "volume": issue.volume, "url": issue.url, "articles": [self.article_to_test_data(a) for a in issue.articles], @@ -259,7 +261,7 @@ class TestDataGenerator: output: list[dict] = [self.article_to_test_data(article) for article in issue.articles] self.JSON_DATA["parse_issue_content"][crawler.source_domain][crawler.collection_id][ - issue.pid + issue_pid ]["output"] = output crawler.download_file = original_download_file @@ -506,6 +508,9 @@ else: # AMI generator.include_article_in_tests("AMI", "AMI_2024_60", "a0") + # CSIS + generator.include_article_in_tests("CSIS", "CSIS_2024_21_1", "a0") + generator.include_col_in_tests("AUM") generator.include_col_in_tests("IJOPCM") generator.include_col_in_tests("FORUM") diff --git a/src/crawler/tests/fixtures.py b/src/crawler/tests/fixtures.py index b2573a3..c15ed0c 100644 --- a/src/crawler/tests/fixtures.py +++ b/src/crawler/tests/fixtures.py @@ -11,6 +11,7 @@ from crawler.by_source.ami_crawler import AmiCrawler from crawler.by_source.amp_crawler import AmpCrawler from 
crawler.by_source.arsia_crawler import ArsiaCrawler from crawler.by_source.bdim_crawler import BdimCrawler +from crawler.by_source.csis_crawler import CsisCrawler from crawler.by_source.dmlbul_crawler import DmlbulCrawler from crawler.by_source.dmlcz_crawler import DmlczCrawler from crawler.by_source.dmlpl_crawler import DmlplCrawler @@ -41,6 +42,7 @@ crawlers_to_test: list[type[BaseCollectionCrawler]] = [ AmpCrawler, ArsiaCrawler, BdimCrawler, + CsisCrawler, DmlbulCrawler, DmlczCrawler, DmlplCrawler, -- GitLab From 39fff4446f6236fe5f187b82b50524a74e6df8ea Mon Sep 17 00:00:00 2001 From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr> Date: Wed, 22 Jan 2025 08:35:04 +0000 Subject: [PATCH 2/8] fix: csis : add exception for incorrect doi in source --- src/crawler/by_source/csis_crawler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/crawler/by_source/csis_crawler.py b/src/crawler/by_source/csis_crawler.py index d357177..538af83 100644 --- a/src/crawler/by_source/csis_crawler.py +++ b/src/crawler/by_source/csis_crawler.py @@ -184,6 +184,9 @@ class CsisCrawler(BaseCollectionCrawler): doi_url = doi_url.removeprefix("https://doi.org/") xarticle.doi = doi_url + if xarticle.pid == "CSIS_2023_20_4_a2": + xarticle.doi = None + # Keywords if "Key words" in categories: keywords = categories["Key words"].text.split(", ") -- GitLab From f86e4da8b3183217a738f48a9a01fe247023e957 Mon Sep 17 00:00:00 2001 From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr> Date: Wed, 22 Jan 2025 09:09:13 +0000 Subject: [PATCH 3/8] chore : patch incorrect doi from source --- src/crawler/by_source/csis_crawler.py | 8 ++++++-- src/crawler/tests/data_generation/generate_test_data.py | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/crawler/by_source/csis_crawler.py b/src/crawler/by_source/csis_crawler.py index 538af83..9c32228 100644 --- a/src/crawler/by_source/csis_crawler.py +++ b/src/crawler/by_source/csis_crawler.py @@ -16,7 +16,7 @@ from 
crawler.utils import add_pdf_link_to_xarticle, cleanup_str class CsisCrawler(BaseCollectionCrawler): - source_name = "Computer Science and Information Systems" + source_name = "Computer Science and Information Systems website" source_domain = "CSIS" source_website = "http://www.comsis.org/" @@ -185,7 +185,11 @@ class CsisCrawler(BaseCollectionCrawler): xarticle.doi = doi_url if xarticle.pid == "CSIS_2023_20_4_a2": - xarticle.doi = None + xarticle.doi = "10.2298/CSIS230400viiL" + if xarticle.pid == "CSIS_2023_20_1_a0": + xarticle.doi = "10.2298/CSIS230100iI" + if xarticle.pid == "CSIS_2021_18_1_a4": + xarticle.doi = "10.2298/CSIS200330035A" # Keywords if "Key words" in categories: diff --git a/src/crawler/tests/data_generation/generate_test_data.py b/src/crawler/tests/data_generation/generate_test_data.py index 7c037ca..5d84f66 100644 --- a/src/crawler/tests/data_generation/generate_test_data.py +++ b/src/crawler/tests/data_generation/generate_test_data.py @@ -508,9 +508,6 @@ else: # AMI generator.include_article_in_tests("AMI", "AMI_2024_60", "a0") - # CSIS - generator.include_article_in_tests("CSIS", "CSIS_2024_21_1", "a0") - generator.include_col_in_tests("AUM") generator.include_col_in_tests("IJOPCM") generator.include_col_in_tests("FORUM") -- GitLab From 773101f09e5da5bc606194beda79b20bfecc9ee1 Mon Sep 17 00:00:00 2001 From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr> Date: Wed, 22 Jan 2025 09:21:37 +0000 Subject: [PATCH 4/8] chore: csis : temporarily disable doi detection --- src/crawler/by_source/csis_crawler.py | 45 ++++++++++++++++----------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/src/crawler/by_source/csis_crawler.py b/src/crawler/by_source/csis_crawler.py index 9c32228..1fe5031 100644 --- a/src/crawler/by_source/csis_crawler.py +++ b/src/crawler/by_source/csis_crawler.py @@ -172,24 +172,33 @@ class CsisCrawler(BaseCollectionCrawler): add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url)) # DOI - if 
"Digital Object Identifier (DOI)" in categories: - doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a") - if not doi_tag: - raise ValueError("Couldn't find doi url") - doi_url = doi_tag.get("href") - if not isinstance(doi_url, str): - raise ValueError("Couldn't parse doi url") - if not doi_url.startswith("https://doi.org/"): - raise ValueError("Malformed DOI url") - doi_url = doi_url.removeprefix("https://doi.org/") - xarticle.doi = doi_url - - if xarticle.pid == "CSIS_2023_20_4_a2": - xarticle.doi = "10.2298/CSIS230400viiL" - if xarticle.pid == "CSIS_2023_20_1_a0": - xarticle.doi = "10.2298/CSIS230100iI" - if xarticle.pid == "CSIS_2021_18_1_a4": - xarticle.doi = "10.2298/CSIS200330035A" + # TODO : contact CSIS to make them fix their DOIs + # if "Digital Object Identifier (DOI)" in categories: + # doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a") + # if not doi_tag: + # raise ValueError("Couldn't find doi url") + # doi_url = doi_tag.get("href") + # if not isinstance(doi_url, str): + # raise ValueError("Couldn't parse doi url") + # if not doi_url.startswith("https://doi.org/"): + # raise ValueError("Malformed DOI url") + # doi_url = doi_url.removeprefix("https://doi.org/") + # xarticle.doi = doi_url + + # if xarticle.pid == "CSIS_2023_20_4_a2": + # xarticle.doi = "10.2298/CSIS230400viiL" + # if xarticle.pid == "CSIS_2023_20_1_a0": + # xarticle.doi = "10.2298/CSIS230100iI" + # if xarticle.pid == "CSIS_2021_18_1_a4": + # xarticle.doi = "10.2298/CSIS200330035A" + # if xarticle.pid == "CSIS_2020_17_1_a14": + # xarticle.doi = "10.2298/CSIS180717038L" + # if xarticle.pid == "CSIS_2020_17_1_a15": + # xarticle.doi = "10.2298/CSIS190430041C" + # if xarticle.pid == "CSIS_2020_17_1_a16": + # xarticle.doi = "10.2298/CSIS190501042A" + # if xarticle.pid == "CSIS_2020_17_1_a17": + # xarticle.doi = "10.2298/CSIS190511043L" # Keywords if "Key words" in categories: -- GitLab From b1b722e3f6b93c50c7b54ed71a86038f7f942b7c Mon Sep 17 00:00:00 
2001 From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr> Date: Wed, 22 Jan 2025 09:56:37 +0000 Subject: [PATCH 5/8] fix : csis article parsing --- src/crawler/by_source/csis_crawler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/crawler/by_source/csis_crawler.py b/src/crawler/by_source/csis_crawler.py index 1fe5031..221eb4b 100644 --- a/src/crawler/by_source/csis_crawler.py +++ b/src/crawler/by_source/csis_crawler.py @@ -67,15 +67,18 @@ class CsisCrawler(BaseCollectionCrawler): ) for index, article_tag in enumerate(content.select("p")): + if len(article_tag.contents) == 1: + continue + if article_tag.text == "Editorial": continue article_title = article_tag.select_one("a.hidden") if not article_title: - raise ValueError("Couldn't parse issue link") + raise ValueError("Couldn't parse article link") article_href = article_title.get("href") if not isinstance(article_href, str): - raise ValueError("Couldn't parse issue href") + raise ValueError("Couldn't parse article href") xarticle = create_articledata() xarticle.url = urljoin(self.source_website, article_href) -- GitLab From 3c79ab602e909fcf3e6c4ae5c7c4b38a84152166 Mon Sep 17 00:00:00 2001 From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr> Date: Wed, 22 Jan 2025 10:01:31 +0000 Subject: [PATCH 6/8] chore: add comment --- src/crawler/by_source/csis_crawler.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/crawler/by_source/csis_crawler.py b/src/crawler/by_source/csis_crawler.py index 221eb4b..de01a3f 100644 --- a/src/crawler/by_source/csis_crawler.py +++ b/src/crawler/by_source/csis_crawler.py @@ -1,3 +1,8 @@ +""" +This source has invalid DOIs in some articles. +For now, those are ignored in order to be able to crawl the collection. 
+""" + from urllib.parse import urljoin import regex -- GitLab From e03607c174a21f634b26d97e2098d3220cbf14e6 Mon Sep 17 00:00:00 2001 From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr> Date: Wed, 22 Jan 2025 10:33:45 +0000 Subject: [PATCH 7/8] fix : patch source title for one specific article --- src/crawler/by_source/csis_crawler.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/crawler/by_source/csis_crawler.py b/src/crawler/by_source/csis_crawler.py index de01a3f..364f52c 100644 --- a/src/crawler/by_source/csis_crawler.py +++ b/src/crawler/by_source/csis_crawler.py @@ -102,11 +102,14 @@ class CsisCrawler(BaseCollectionCrawler): id_tag.decompose() # Title - title_tag = content.select_one(".title") - if not title_tag: - raise ValueError("Couldn't find title") - xarticle.title_tex = title_tag.text - title_tag.decompose() + if xarticle.pid == "CSIS_2012_9_3_a13": + xarticle.title_tex = "Modeling a Holonic Agent based Solution" + else: + title_tag = content.select_one(".title") + if not title_tag: + raise ValueError("Couldn't find title") + xarticle.title_tex = title_tag.text + title_tag.decompose() # Authors authors_tag = content.select_one(".authors") -- GitLab From 7fdab11c1e99428b25ee6f337d6bcbab77a5d78d Mon Sep 17 00:00:00 2001 From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr> Date: Wed, 22 Jan 2025 10:46:18 +0000 Subject: [PATCH 8/8] Chore : handle articles that don't have pdfs --- src/crawler/by_source/csis_crawler.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/crawler/by_source/csis_crawler.py b/src/crawler/by_source/csis_crawler.py index 364f52c..af2ee02 100644 --- a/src/crawler/by_source/csis_crawler.py +++ b/src/crawler/by_source/csis_crawler.py @@ -174,13 +174,16 @@ class CsisCrawler(BaseCollectionCrawler): xarticle.abstracts.append(xabstract) # PDF - pdf_tag = categories["Full text"].select_one("a.download") - if not pdf_tag: - raise 
ValueError("Couldn't find pdf url") - pdf_url = pdf_tag.get("href") - if not isinstance(pdf_url, str): - raise ValueError("Couldn't parse pdf url") - add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url)) + if "Full text" in categories: + pdf_tag = categories["Full text"].select_one("a.download") + if not pdf_tag: + raise ValueError("Couldn't find pdf url") + pdf_url = pdf_tag.get("href") + if not isinstance(pdf_url, str): + raise ValueError("Couldn't parse pdf url") + add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url)) + else: + print(f"No PDF Found for article {xarticle.pid}. Skipping pdf") # DOI # TODO : contact CSIS to make them fix their DOIs -- GitLab