From 77bdbb69fbf5d22860d8447aa36100bdd4c3f549 Mon Sep 17 00:00:00 2001
From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr>
Date: Tue, 25 Feb 2025 13:53:17 +0000
Subject: [PATCH 1/2] feat : add threads and progress bar

---
 requirements.txt                            |   1 +
 src/ptf/management/commands/reindex_solr.py | 225 +++++++++++---------
 2 files changed, 124 insertions(+), 102 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ff606d19..1f98e6ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+alive-progress<3.2
 beautifulsoup4==4.12.2
 bleach[css]==6.0.0
 Django<4.3
diff --git a/src/ptf/management/commands/reindex_solr.py b/src/ptf/management/commands/reindex_solr.py
index 62815a85..ebf95d5d 100644
--- a/src/ptf/management/commands/reindex_solr.py
+++ b/src/ptf/management/commands/reindex_solr.py
@@ -1,16 +1,14 @@
+import concurrent.futures
+
+from alive_progress import alive_bar
 from django.conf import settings
 from django.core.management.base import BaseCommand
 from django.db.models import Q
 
-from ptf.cmds.solr_cmds import addArticleSolrCmd
-from ptf.cmds.solr_cmds import addBookPartSolrCmd
-from ptf.cmds.solr_cmds import addContainerSolrCmd
+from ptf.cmds.solr_cmds import addArticleSolrCmd, addBookPartSolrCmd, addContainerSolrCmd
 from ptf.cmds.xml import xml_utils
-from ptf.model_data_converter import db_to_article_data
-from ptf.model_data_converter import db_to_book_data
-from ptf.model_data_converter import db_to_bookpart_data
-from ptf.models import Article
-from ptf.models import Container
+from ptf.model_data_converter import db_to_article_data, db_to_book_data, db_to_bookpart_data
+from ptf.models import Article, Container
 
 
 class Command(BaseCommand):
@@ -42,6 +40,15 @@ class Command(BaseCommand):
             Using this option will index the resources using the site where the command has been executed.""",
         )
 
+        parser.add_argument(
+            "--threads",
+            action="store",
+            dest="threads",
+            default=4,
+            type=int,  # must be an int: it is passed to ThreadPoolExecutor(max_workers=...)
+            help="Changes the amount of threads to reindex the articles. Default is 4.",
+        )
+
     def get_body(self, xobj):
         # Get body_xml from PostgreSQL
         # regenerate body (XML -> TXT)
@@ -56,103 +63,117 @@ class Command(BaseCommand):
         if options["pid"]:
             qs = qs.filter(my_container__my_collection__pid=options["pid"])
 
-        start = options["start_id"] is None
-
-        for article in qs.order_by("pid"):
-            if options["start_id"] and article.pid == options["start_id"]:
-                start = True
-
-            if start:
-                site_id = None
-                if options["use_settings_site"]:
-                    site_id = settings.SITE_ID
-                # The Solr search is by default filtered on the site, which is by default equivalent to the article's collection
-                # Since the reindex_solr is run from any given site, force the 'sites' param
-                else:
-                    site_id = settings.SITE_REGISTER[article.get_top_collection().pid.lower()][
-                        "site_id"
-                    ]
-                params = {
-                    "id": article.id,
-                    "pid": article.pid,
-                    "db_obj": article,
-                    "sites": [site_id],
-                }
-
-                if (
-                    article.my_container.ctype.startswith("book")
-                    or article.my_container.ctype == "lecture-notes"
-                ):
-                    xobj = db_to_bookpart_data(article)
-                    params["xobj"] = xobj
-                    cmd = addBookPartSolrCmd(params)
-                else:
-                    xobj = db_to_article_data(article)
-                    params["xobj"] = xobj
-                    cmd = addArticleSolrCmd(params)
-
-                xobj.body = self.get_body(xobj)
-                if xobj.body:
-                    print(article.pid)
-                else:
-                    print(article.pid, "No body")
-
-                cmd.add_collection(article.get_collection())
-                cmd.set_container(article.my_container)
-                cmd.do()  # pre_do starts by deleting the existing Solr document
-
-                for trans_article in article.translations.all():
-                    print(trans_article.pid)
-                    xtrans_obj = db_to_article_data(trans_article)
-                    if xtrans_obj.body_html:
-                        xtrans_obj.body = xml_utils.get_text_from_xml_with_mathml(
-                            xtrans_obj.body_html
-                        )
-                    xtrans_obj.trans_title_tex = xobj.title_tex
-                    xtrans_obj.trans_title_html = xobj.title_html
-
-                    if article.trans_lang == trans_article.lang:
-                        if article.trans_title_tex:
-                            xtrans_obj.title_tex = article.trans_title_tex
-                            xtrans_obj.title_html = article.trans_title_html
-                        for abstract in xobj.abstracts:
-                            if (
-                                abstract["tag"] == "abstract"
-                                and abstract["lang"] == trans_article.lang
-                            ):
-                                xtrans_obj.abstracts = [abstract]
-
-                    params = {
-                        "id": trans_article.id,
-                        "pid": trans_article.pid,
-                        "db_obj": trans_article,
-                        "sites": [site_id],
-                        "xobj": xtrans_obj,
-                    }
-                    cmd = addArticleSolrCmd(params)
-                    cmd.add_collection(article.get_collection())
-                    cmd.set_container(article.my_container)
-                    cmd.do()  # pre_do starts by deleting the existing Solr document
+        if options["start_id"] is not None:
+            qs = qs.filter(pid__gte=options["start_id"])
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=options["threads"]) as executor:
+
+            def follow_article_progress(article: Article, options):
+                self.reindex_article(article, options)
+                return article.pid
+
+            tasks = [
+                executor.submit(follow_article_progress, article, options)
+                for article in qs.order_by("pid")
+            ]
+            # Every article is submitted at this point, so len(tasks) is the exact total.
+            with alive_bar(
+                len(tasks),
+                dual_line=True,
+                title="SolR : reindex articles",
+                stats="({rate})",
+                force_tty=True,
+            ) as progress_bar:
+                for future in concurrent.futures.as_completed(tasks):
+                    progress_bar.text = future.result()
+                    progress_bar()
+            # No explicit wait() is needed: as_completed() has already yielded every
+            # future, and leaving the executor context joins the worker threads.
 
         qs = Container.objects.filter(Q(ctype__startswith="book") | Q(ctype="lecture-notes"))
         if options["pid"]:
             qs = qs.filter(my_collection__pid=options["pid"])
 
+        if options["start_id"]:
+            qs = qs.filter(pid__gte=options["start_id"])
+
         for book in qs.order_by("pid"):
-            if options["start_id"] and article.pid == options["start_id"]:
-                start = True
-
-            if start:
-                xobj = db_to_book_data(book)
-                xobj.body = self.get_body(xobj)
-
-                if xobj.body:
-                    print(book.pid)
-                else:
-                    print(book.pid, "No body")
-
-                cmd = addContainerSolrCmd(
-                    {"xobj": xobj, "id": book.id, "pid": book.pid, "db_obj": book}
-                )
-                cmd.add_collection(book.get_collection())
-                cmd.do()
+            self.reindex_book(book)
+
+    def reindex_article(self, article: Article, options: dict):
+        site_id = None
+        if options["use_settings_site"]:
+            site_id = settings.SITE_ID
+        # The Solr search is by default filtered on the site, which is by default equivalent to the article's collection
+        # Since the reindex_solr is run from any given site, force the 'sites' param
+        else:
+            site_id = settings.SITE_REGISTER[article.get_top_collection().pid.lower()]["site_id"]
+        params = {
+            "id": article.id,
+            "pid": article.pid,
+            "db_obj": article,
+            "sites": [site_id],
+        }
+
+        if (
+            article.my_container.ctype.startswith("book")
+            or article.my_container.ctype == "lecture-notes"
+        ):
+            xobj = db_to_bookpart_data(article)
+            params["xobj"] = xobj
+            cmd = addBookPartSolrCmd(params)
+        else:
+            xobj = db_to_article_data(article)
+            params["xobj"] = xobj
+            cmd = addArticleSolrCmd(params)
+
+        xobj.body = self.get_body(xobj)
+        # if xobj.body:
+        #     print(article.pid)
+        # else:
+        #     print(article.pid, "No body")
+
+        cmd.add_collection(article.get_collection())
+        cmd.set_container(article.my_container)
+        cmd.do()  # pre_do starts by deleting the existing Solr document
+
+        for trans_article in article.translations.all():
+            print(trans_article.pid)
+            xtrans_obj = db_to_article_data(trans_article)
+            if xtrans_obj.body_html:
+                xtrans_obj.body = xml_utils.get_text_from_xml_with_mathml(xtrans_obj.body_html)
+            xtrans_obj.trans_title_tex = xobj.title_tex
+            xtrans_obj.trans_title_html = xobj.title_html
+
+            if article.trans_lang == trans_article.lang:
+                if article.trans_title_tex:
+                    xtrans_obj.title_tex = article.trans_title_tex
+                    xtrans_obj.title_html = article.trans_title_html
+                for abstract in xobj.abstracts:
+                    if abstract["tag"] == "abstract" and abstract["lang"] == trans_article.lang:
+                        xtrans_obj.abstracts = [abstract]
+
+            params = {
+                "id": trans_article.id,
+                "pid": trans_article.pid,
+                "db_obj": trans_article,
+                "sites": [site_id],
+                "xobj": xtrans_obj,
+            }
+            cmd = addArticleSolrCmd(params)
+            cmd.add_collection(article.get_collection())
+            cmd.set_container(article.my_container)
+            cmd.do()  # pre_do starts by deleting the existing Solr document
+
+    def reindex_book(self, book: Container):
+        xobj = db_to_book_data(book)
+        xobj.body = self.get_body(xobj)
+
+        if xobj.body:
+            print(book.pid)
+        else:
+            print(book.pid, "No body")
+
+        cmd = addContainerSolrCmd({"xobj": xobj, "id": book.id, "pid": book.pid, "db_obj": book})
+        cmd.add_collection(book.get_collection())
+        cmd.do()
-- 
GitLab


From 3f8a2a8e65e5719863fed8786c365b269758d926 Mon Sep 17 00:00:00 2001
From: Nathan Tien You <nathan.tien-you@univ-grenoble-alpes.fr>
Date: Tue, 25 Feb 2025 13:56:42 +0000
Subject: [PATCH 2/2] fix : solR pdf urls for Geodesic

---
 src/ptf/cmds/solr_cmds.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/ptf/cmds/solr_cmds.py b/src/ptf/cmds/solr_cmds.py
index 06e1928e..0350c014 100644
--- a/src/ptf/cmds/solr_cmds.py
+++ b/src/ptf/cmds/solr_cmds.py
@@ -1,10 +1,10 @@
 import pysolr
-
 from django.conf import settings
 
-from ptf.cmds.base_cmds import baseCmd
-from ptf.cmds.base_cmds import make_int
+from ptf.cmds.base_cmds import baseCmd, make_int
 from ptf.display import resolver
+from ptf.model_data import ResourceData
+from ptf.models import Resource
 from ptf.site_register import SITE_REGISTER
 from ptf.solr import search_helpers
 from ptf.utils import get_display_name
@@ -150,7 +150,7 @@ class solrDeleteCmd(solrCmd):
 class solrAddCmd(solrCmd):
     def __init__(self, params={}):
         self.commit = True
-        self.db_obj = None
+        self.db_obj: Resource | None = None
         self.id = None
         self.pid = None
         self.data = {}
@@ -194,7 +194,7 @@ class solrAddCmd(solrCmd):
 ######################################################################
 class addResourceSolrCmd(solrAddCmd):
     def __init__(self, params={}):
-        self.xobj = None  # model_data object
+        self.xobj: ResourceData | None = None  # model_data object
 
         # fields of the xobj to pass to SolR
         self.fields = [
@@ -300,9 +300,12 @@ class addResourceSolrCmd(solrAddCmd):
             for stream in self.xobj.streams:
                 mimetype = stream["mimetype"]
                 if mimetype in solr_fields:
-                    href = self.db_obj.get_binary_file_href_full_path(
-                        "self", mimetype, stream["location"]
-                    )
+                    if stream["location"].startswith("/"):
+                        href = self.db_obj.get_binary_file_href_full_path(
+                            "self", mimetype, stream["location"]
+                        )
+                    else:
+                        href = stream["location"]
                     self.data[solr_fields[mimetype]] = href
 
         if self.db_obj is not None:
-- 
GitLab