From 77bdbb69fbf5d22860d8447aa36100bdd4c3f549 Mon Sep 17 00:00:00 2001 From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr> Date: Tue, 25 Feb 2025 13:53:17 +0000 Subject: [PATCH 1/2] feat : add threads and progress bar --- requirements.txt | 1 + src/ptf/management/commands/reindex_solr.py | 225 +++++++++++--------- 2 files changed, 124 insertions(+), 102 deletions(-) diff --git a/requirements.txt b/requirements.txt index ff606d19..1f98e6ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +alive-progress<3.2 beautifulsoup4==4.12.2 bleach[css]==6.0.0 Django<4.3 diff --git a/src/ptf/management/commands/reindex_solr.py b/src/ptf/management/commands/reindex_solr.py index 62815a85..ebf95d5d 100644 --- a/src/ptf/management/commands/reindex_solr.py +++ b/src/ptf/management/commands/reindex_solr.py @@ -1,16 +1,14 @@ +import concurrent.futures + +from alive_progress import alive_bar from django.conf import settings from django.core.management.base import BaseCommand from django.db.models import Q -from ptf.cmds.solr_cmds import addArticleSolrCmd -from ptf.cmds.solr_cmds import addBookPartSolrCmd -from ptf.cmds.solr_cmds import addContainerSolrCmd +from ptf.cmds.solr_cmds import addArticleSolrCmd, addBookPartSolrCmd, addContainerSolrCmd from ptf.cmds.xml import xml_utils -from ptf.model_data_converter import db_to_article_data -from ptf.model_data_converter import db_to_book_data -from ptf.model_data_converter import db_to_bookpart_data -from ptf.models import Article -from ptf.models import Container +from ptf.model_data_converter import db_to_article_data, db_to_book_data, db_to_bookpart_data +from ptf.models import Article, Container class Command(BaseCommand): @@ -42,6 +40,15 @@ class Command(BaseCommand): Using this option will index the resources using the site where the command has been executed.""", ) + parser.add_argument( + "--threads", + action="store", + dest="threads", + default=4, + type=str, + help="Changes the amount of threads to reindex the articles. Default is 4.", + ) + def get_body(self, xobj): # Get body_xml from PostgreSQL # regenerate body (XML -> TXT) @@ -56,103 +63,117 @@ class Command(BaseCommand): if options["pid"]: qs = qs.filter(my_container__my_collection__pid=options["pid"]) - start = options["start_id"] is None - - for article in qs.order_by("pid"): - if options["start_id"] and article.pid == options["start_id"]: - start = True - - if start: - site_id = None - if options["use_settings_site"]: - site_id = settings.SITE_ID - # The Solr search is by default filtered on the site, which is by default equivalent to the article's collection - # Since the reindex_solr is run from any given site, force the 'sites' param - else: - site_id = settings.SITE_REGISTER[article.get_top_collection().pid.lower()][ - "site_id" - ] - params = { - "id": article.id, - "pid": article.pid, - "db_obj": article, - "sites": [site_id], - } - - if ( - article.my_container.ctype.startswith("book") - or article.my_container.ctype == "lecture-notes" - ): - xobj = db_to_bookpart_data(article) - params["xobj"] = xobj - cmd = addBookPartSolrCmd(params) - else: - xobj = db_to_article_data(article) - params["xobj"] = xobj - cmd = addArticleSolrCmd(params) - - xobj.body = self.get_body(xobj) - if xobj.body: - print(article.pid) - else: - print(article.pid, "No body") - - cmd.add_collection(article.get_collection()) - cmd.set_container(article.my_container) - cmd.do() # pre_do starts by deleting the existing Solr document - - for trans_article in article.translations.all(): - print(trans_article.pid) - xtrans_obj = db_to_article_data(trans_article) - if xtrans_obj.body_html: - xtrans_obj.body = xml_utils.get_text_from_xml_with_mathml( - xtrans_obj.body_html - ) - xtrans_obj.trans_title_tex = xobj.title_tex - xtrans_obj.trans_title_html = xobj.title_html - - if article.trans_lang == trans_article.lang: - if article.trans_title_tex: - xtrans_obj.title_tex = article.trans_title_tex - xtrans_obj.title_html = article.trans_title_html - for abstract in xobj.abstracts: - if ( - abstract["tag"] == "abstract" - and abstract["lang"] == trans_article.lang - ): - xtrans_obj.abstracts = [abstract] - - params = { - "id": trans_article.id, - "pid": trans_article.pid, - "db_obj": trans_article, - "sites": [site_id], - "xobj": xtrans_obj, - } - cmd = addArticleSolrCmd(params) - cmd.add_collection(article.get_collection()) - cmd.set_container(article.my_container) - cmd.do() # pre_do starts by deleting the existing Solr document + if options["start_id"] is not None: + qs = qs.filter(pid__gte=options["start_id"]) + + with concurrent.futures.ThreadPoolExecutor(max_workers=options["threads"]) as executor: + + def follow_article_progress(article: Article, options): + self.reindex_article(article, options) + return article.pid + + tasks = ( + executor.submit(follow_article_progress, article, options) + for article in qs.order_by("pid") + ) + + with alive_bar( + qs.count(), + dual_line=True, + title="SolR : reindex articles", + stats="({rate})", + force_tty=True, + ) as progress_bar: + for future in concurrent.futures.as_completed(tasks): + progress_bar.text = future.result() + progress_bar() + + concurrent.futures.wait(tasks) qs = Container.objects.filter(Q(ctype__startswith="book") | Q(ctype="lecture-notes")) if options["pid"]: qs = qs.filter(my_collection__pid=options["pid"]) + if options["start_id"]: + qs = qs.filter(pid__gte=options["start_id"]) + for book in qs.order_by("pid"): - if options["start_id"] and article.pid == options["start_id"]: - start = True - - if start: - xobj = db_to_book_data(book) - xobj.body = self.get_body(xobj) - - if xobj.body: - print(book.pid) - else: - print(book.pid, "No body") - - cmd = addContainerSolrCmd( - {"xobj": xobj, "id": book.id, "pid": book.pid, "db_obj": book} - ) - cmd.add_collection(book.get_collection()) - cmd.do() + self.reindex_book(book) + + def reindex_article(self, article: Article, options: dict): + site_id = None + if options["use_settings_site"]: + site_id = settings.SITE_ID + # The Solr search is by default filtered on the site, which is by default equivalent to the article's collection + # Since the reindex_solr is run from any given site, force the 'sites' param + else: + site_id = settings.SITE_REGISTER[article.get_top_collection().pid.lower()]["site_id"] + params = { + "id": article.id, + "pid": article.pid, + "db_obj": article, + "sites": [site_id], + } + + if ( + article.my_container.ctype.startswith("book") + or article.my_container.ctype == "lecture-notes" + ): + xobj = db_to_bookpart_data(article) + params["xobj"] = xobj + cmd = addBookPartSolrCmd(params) + else: + xobj = db_to_article_data(article) + params["xobj"] = xobj + cmd = addArticleSolrCmd(params) + + xobj.body = self.get_body(xobj) + # if xobj.body: + # print(article.pid) + # else: + # print(article.pid, "No body") + + cmd.add_collection(article.get_collection()) + cmd.set_container(article.my_container) + cmd.do() # pre_do starts by deleting the existing Solr document + + for trans_article in article.translations.all(): + print(trans_article.pid) + xtrans_obj = db_to_article_data(trans_article) + if xtrans_obj.body_html: + xtrans_obj.body = xml_utils.get_text_from_xml_with_mathml(xtrans_obj.body_html) + xtrans_obj.trans_title_tex = xobj.title_tex + xtrans_obj.trans_title_html = xobj.title_html + + if article.trans_lang == trans_article.lang: + if article.trans_title_tex: + xtrans_obj.title_tex = article.trans_title_tex + xtrans_obj.title_html = article.trans_title_html + for abstract in xobj.abstracts: + if abstract["tag"] == "abstract" and abstract["lang"] == trans_article.lang: + xtrans_obj.abstracts = [abstract] + + params = { + "id": trans_article.id, + "pid": trans_article.pid, + "db_obj": trans_article, + "sites": [site_id], + "xobj": xtrans_obj, + } + cmd = addArticleSolrCmd(params) + cmd.add_collection(article.get_collection()) + cmd.set_container(article.my_container) + cmd.do() # pre_do starts by deleting the existing Solr document + + def reindex_book(self, book: Container): + xobj = db_to_book_data(book) + xobj.body = self.get_body(xobj) + + if xobj.body: + print(book.pid) + else: + print(book.pid, "No body") + + cmd = addContainerSolrCmd({"xobj": xobj, "id": book.id, "pid": book.pid, "db_obj": book}) + cmd.add_collection(book.get_collection()) + cmd.do() -- GitLab From 3f8a2a8e65e5719863fed8786c365b269758d926 Mon Sep 17 00:00:00 2001 From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr> Date: Tue, 25 Feb 2025 13:56:42 +0000 Subject: [PATCH 2/2] fix : solR pdf urls for Geodesic --- src/ptf/cmds/solr_cmds.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ptf/cmds/solr_cmds.py b/src/ptf/cmds/solr_cmds.py index 06e1928e..0350c014 100644 --- a/src/ptf/cmds/solr_cmds.py +++ b/src/ptf/cmds/solr_cmds.py @@ -1,10 +1,10 @@ import pysolr - from django.conf import settings -from ptf.cmds.base_cmds import baseCmd -from ptf.cmds.base_cmds import make_int +from ptf.cmds.base_cmds import baseCmd, make_int from ptf.display import resolver +from ptf.model_data import ResourceData +from ptf.models import Resource from ptf.site_register import SITE_REGISTER from ptf.solr import search_helpers from ptf.utils import get_display_name @@ -150,7 +150,7 @@ class solrDeleteCmd(solrCmd): class solrAddCmd(solrCmd): def __init__(self, params={}): self.commit = True - self.db_obj = None + self.db_obj: Resource | None = None self.id = None self.pid = None self.data = {} @@ -194,7 +194,7 @@ class solrAddCmd(solrCmd): ###################################################################### class addResourceSolrCmd(solrAddCmd): def __init__(self, params={}): - self.xobj = None # model_data object + self.xobj: ResourceData | None = None # model_data object # fields of the xobj to pass to SolR self.fields = [ @@ -300,9 +300,12 @@ class addResourceSolrCmd(solrAddCmd): for stream in self.xobj.streams: mimetype = stream["mimetype"] if mimetype in solr_fields: - href = self.db_obj.get_binary_file_href_full_path( - "self", mimetype, stream["location"] - ) + if stream["location"].startswith("/"): + href = self.db_obj.get_binary_file_href_full_path( + "self", mimetype, stream["location"] + ) + else: + href = stream["location"] self.data[solr_fields[mimetype]] = href if self.db_obj is not None: -- GitLab