From 90e3fb8c2728119def71457d7004838e65be51ef Mon Sep 17 00:00:00 2001 From: rdimaio Date: Wed, 24 Jan 2024 19:15:46 +0100 Subject: [PATCH 1/5] Daemons: Create base Daemon class and refactor Undertaker to inherit from base class; #6478 Initial commit where base class Daemon is created. As an initial test, Undertaker inherits from Daemon. If tests pass, other daemons can also be refactored to inherit from Daemon. --- bin/rucio-undertaker | 11 +- lib/rucio/daemons/common.py | 302 ++++++++++++++++----- lib/rucio/daemons/undertaker/undertaker.py | 165 +++++------ tests/test_abacus_collection_replica.py | 5 +- tests/test_bb8.py | 8 +- tests/test_daemons.py | 22 +- tests/test_undertaker.py | 19 +- 7 files changed, 339 insertions(+), 193 deletions(-) diff --git a/bin/rucio-undertaker b/bin/rucio-undertaker index 15003ff85e..3141074a77 100755 --- a/bin/rucio-undertaker +++ b/bin/rucio-undertaker @@ -21,7 +21,7 @@ Undertaker is a daemon to manage expired did. import argparse import signal -from rucio.daemons.undertaker.undertaker import run, stop +from rucio.daemons.undertaker.undertaker import Undertaker def get_parser(): @@ -66,12 +66,11 @@ Check if the DID exists:: if __name__ == "__main__": - - signal.signal(signal.SIGTERM, stop) parser = get_parser() args = parser.parse_args() + undertaker = Undertaker(total_workers=args.total_workers, chunk_size=args.chunk_size, once=args.run_once, sleep_time=args.sleep_time) + signal.signal(signal.SIGTERM, undertaker.stop) try: - run(total_workers=args.total_workers, chunk_size=args.chunk_size, once=args.run_once, - sleep_time=args.sleep_time) + undertaker.run() except KeyboardInterrupt: - stop() + undertaker.stop() diff --git a/lib/rucio/daemons/common.py b/lib/rucio/daemons/common.py index 60883d6a25..424cfa7992 100644 --- a/lib/rucio/daemons/common.py +++ b/lib/rucio/daemons/common.py @@ -19,26 +19,128 @@ import os import queue import socket +import signal import threading import time from collections.abc import Callable, Generator, Iterator, Sequence from typing import Any, Generic, Optional, TypeVar, Union +from types import TracebackType, FrameType +from abc import ABC, abstractmethod -from rucio.common.logging import formatted_logger +import rucio.db.sqla.util +from rucio.common.logging import formatted_logger, setup_logging from rucio.common.utils import PriorityQueue from rucio.core import heartbeat as heartbeat_core from rucio.core.monitor import MetricManager +from rucio.common.exception import DatabaseException -T = TypeVar('T') +T = TypeVar("T") METRICS = MetricManager(module=__name__) +class Daemon(ABC): + """ + Base daemon class + """ + + def __init__( + self, + once: bool = False, + total_workers: int = 1, + sleep_time: int = 60, + partition_wait_time: int = 1, + daemon_name: str = "undefined_daemon", + ) -> None: + """ + :param once: Only execute one iteration. + :param total_workers: Number of total workers. + :param sleep_time: Sleep time between daemon iterations. + :param partition_wait_time: Time to wait for database partition rebalancing before starting the actual daemon loop. + :param daemon_name: Name of daemon that is constructed. 
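+
+        A minimal usage sketch (``MyDaemon`` is a hypothetical subclass
+        implementing ``_run_once``; this constructor already binds
+        SIGTERM to ``stop()``)::
+
+            daemon = MyDaemon(total_workers=4, sleep_time=30)
+            try:
+                daemon.run()
+            except KeyboardInterrupt:
+                daemon.stop()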
+ """ + self.once = once + self.total_workers = total_workers + self.sleep_time = sleep_time + self.partition_wait_time = partition_wait_time + self.daemon_name = daemon_name + self.graceful_stop = threading.Event() + signal.signal(signal.SIGTERM, self.stop) + setup_logging(process_name=daemon_name) + + @staticmethod + def _pre_run_checks() -> None: + """ + Checks to run before daemon execution + """ + if rucio.db.sqla.util.is_old_db(): + raise DatabaseException("Database was not updated, daemon won't start") + + @abstractmethod + def _run_once(self, heartbeat_handler: "HeartbeatHandler", **_kwargs) -> None: + """ + Daemon-specific logic (to be defined in child classes) for a single iteration + """ + pass + + def _call_daemon(self, activities: Optional[list[str]] = None) -> None: + """ + Run the daemon loop and call the function _run_once at each iteration + """ + + run_once_fnc = functools.partial(self._run_once) + + daemon = db_workqueue( + once=self.once, + graceful_stop=self.graceful_stop, + executable=self.daemon_name, + partition_wait_time=self.partition_wait_time, + sleep_time=self.sleep_time, + activities=activities, + )(run_once_fnc) + + for _ in daemon(): + pass + + def run(self) -> None: + """ + Run the daemon. + """ + self._pre_run_checks() + + if self.once: + self._call_daemon() + else: + logging.info("main: starting threads") + threads = [ + threading.Thread(target=self._call_daemon) + for _ in range(0, self.total_workers) + ] + [t.start() for t in threads] + logging.info("main: waiting for interrupts") + + # Interruptible joins require a timeout. + while threads[0].is_alive(): + [t.join(timeout=3.14) for t in threads] + + def stop( + self, signum: "Optional[int]" = None, frame: "Optional[FrameType]" = None + ) -> None: + """ + Graceful exit the daemon. Used as handler for SIGTERM. + The unused parameters are needed for the method to be accepted as handler for signal.signal(). + + :param signum: signal number + :param frame: stack frame + """ + self.graceful_stop.set() + + class HeartbeatHandler: """ Simple contextmanager which sets a heartbeat and associated logger on entry and cleans up the heartbeat on exit. """ - def __init__(self, executable: str, renewal_interval: int): + def __init__(self, executable: str, renewal_interval: int) -> None: """ :param executable: the executable name which will be set in heartbeats :param renewal_interval: the interval at which the heartbeat will be renewed in the database. 
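+
+        A usage sketch (the executable name is illustrative)::
+
+            with HeartbeatHandler(executable='mydaemon', renewal_interval=60) as handler:
+                worker_number, total_workers, logger = handler.live()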
@@ -47,7 +149,9 @@ def __init__(self, executable: str, renewal_interval: int): self.executable = executable self._hash_executable = None self.renewal_interval = renewal_interval - self.older_than = renewal_interval * 10 if renewal_interval and renewal_interval > 0 else None # 10 was chosen without any particular reason + self.older_than = ( + renewal_interval * 10 if renewal_interval and renewal_interval > 0 else None + ) # 10 was chosen without any particular reason self.hostname = socket.getfqdn() self.pid = os.getpid() @@ -58,59 +162,90 @@ def __init__(self, executable: str, renewal_interval: int): self.last_time = None self.last_payload = None - def __enter__(self): + def __enter__(self) -> "HeartbeatHandler": heartbeat_core.sanity_check(executable=self.executable, hostname=self.hostname) self.live() return self - def __exit__(self, exc_type, exc_val, exc_tb): + def __exit__( + self, + exc_type: type[BaseException], + exc_val: BaseException, + exc_tb: TracebackType, + ) -> None: if self.last_heart_beat: heartbeat_core.die(self.executable, self.hostname, self.pid, self.hb_thread) if self.logger: - self.logger(logging.INFO, 'Heartbeat cleaned up') + self.logger(logging.INFO, "Heartbeat cleaned up") @property - def hash_executable(self): + def hash_executable(self) -> str: if not self._hash_executable: self._hash_executable = heartbeat_core.calc_hash(self.executable) return self._hash_executable @property - def short_executable(self): + def short_executable(self) -> str: return min(self.executable, self.hash_executable, key=len) - def live(self, force_renew: bool = False, payload: Optional[str] = None): + def live( + self, force_renew: bool = False, payload: Optional[str] = None + ) -> tuple[int, int, Callable]: """ :return: a tuple: , , """ - if force_renew \ - or not self.last_time \ - or not self.last_heart_beat \ - or self.last_time < datetime.datetime.now() - datetime.timedelta(seconds=self.renewal_interval) \ - or self.last_payload != payload: + if ( + force_renew + or not self.last_time + or not self.last_heart_beat + or self.last_time + < datetime.datetime.now() + - datetime.timedelta(seconds=self.renewal_interval) + or self.last_payload != payload + ): if self.older_than: - self.last_heart_beat = heartbeat_core.live(self.executable, self.hostname, self.pid, self.hb_thread, payload=payload, older_than=self.older_than) + self.last_heart_beat = heartbeat_core.live( + self.executable, + self.hostname, + self.pid, + self.hb_thread, + payload=payload, + older_than=self.older_than, + ) else: - self.last_heart_beat = heartbeat_core.live(self.executable, self.hostname, self.pid, self.hb_thread, payload=payload) - - prefix = '[%i/%i]: ' % (self.last_heart_beat['assign_thread'], self.last_heart_beat['nr_threads']) - self.logger = formatted_logger(logging.log, prefix + '%s') + self.last_heart_beat = heartbeat_core.live( + self.executable, + self.hostname, + self.pid, + self.hb_thread, + payload=payload, + ) + + prefix = "[%i/%i]: " % ( + self.last_heart_beat["assign_thread"], + self.last_heart_beat["nr_threads"], + ) + self.logger = formatted_logger(logging.log, prefix + "%s") if not self.last_time: - self.logger(logging.DEBUG, 'First heartbeat set') + self.logger(logging.DEBUG, "First heartbeat set") else: - self.logger(logging.DEBUG, 'Heartbeat renewed') + self.logger(logging.DEBUG, "Heartbeat renewed") self.last_time = datetime.datetime.now() self.last_payload = payload - return self.last_heart_beat['assign_thread'], self.last_heart_beat['nr_threads'], self.logger + return ( + 
self.last_heart_beat["assign_thread"], + self.last_heart_beat["nr_threads"], + self.logger, + ) def _activity_looper( - once: bool, - sleep_time: int, - activities: Optional[Sequence[str]], - heartbeat_handler: HeartbeatHandler, + once: bool, + sleep_time: int, + activities: Optional[Sequence[str]], + heartbeat_handler: HeartbeatHandler, ) -> Generator[tuple[str, float], tuple[float, bool], None]: """ Generator which loops (either once, or indefinitely) over all activities while ensuring that `sleep_time` @@ -143,14 +278,19 @@ def _activity_looper( logger = heartbeat_handler.logger if time_to_sleep > 0: if activity: - logger(logging.DEBUG, 'Switching to activity %s and sleeping %s seconds', activity, time_to_sleep) + logger( + logging.DEBUG, + "Switching to activity %s and sleeping %s seconds", + activity, + time_to_sleep, + ) else: - logger(logging.DEBUG, 'Sleeping %s seconds', time_to_sleep) + logger(logging.DEBUG, "Sleeping %s seconds", time_to_sleep) else: if activity: - logger(logging.DEBUG, 'Switching to activity %s', activity) + logger(logging.DEBUG, "Switching to activity %s", activity) else: - logger(logging.DEBUG, 'Starting next iteration') + logger(logging.DEBUG, "Starting next iteration") # The calling context notifies us when the activity actually got handled. And if sleeping is desired. actual_exe_time, must_sleep = yield activity, time_to_sleep @@ -165,12 +305,12 @@ def _activity_looper( def db_workqueue( - once: bool, - graceful_stop: threading.Event, - executable: str, - partition_wait_time: int, - sleep_time: int, - activities: Optional[Sequence[str]] = None, + once: bool, + graceful_stop: threading.Event, + executable: str, + partition_wait_time: int, + sleep_time: int, + activities: Optional[Sequence[str]] = None, ): """ Used to wrap a function for interacting with the database as a work queue: i.e. to select @@ -186,20 +326,29 @@ def db_workqueue( :param activities: optional list of activities on which to work. The run_once_fnc will be called on activities one by one. 
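+
+    A decoration sketch (``my_run_once`` is hypothetical; it must accept the
+    ``heartbeat_handler`` and ``activity`` keyword arguments)::
+
+        daemon_loop = db_workqueue(
+            once=True,
+            graceful_stop=threading.Event(),
+            executable='mydaemon',
+            partition_wait_time=1,
+            sleep_time=60,
+        )(my_run_once)
+        for result in daemon_loop():
+            ...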
""" - def _decorate(run_once_fnc: Callable[..., Optional[Union[bool, tuple[bool, T]]]]) -> Callable[[], Iterator[Optional[T]]]: + def _decorate( + run_once_fnc: Callable[..., Optional[Union[bool, tuple[bool, T]]]] + ) -> Callable[[], Iterator[Optional[T]]]: @functools.wraps(run_once_fnc) def _generator(): - with HeartbeatHandler(executable=executable, renewal_interval=sleep_time - 1) as heartbeat_handler: + with HeartbeatHandler( + executable=executable, renewal_interval=sleep_time - 1 + ) as heartbeat_handler: logger = heartbeat_handler.logger - logger(logging.INFO, 'started') + logger(logging.INFO, "started") if partition_wait_time: graceful_stop.wait(partition_wait_time) _, _, logger = heartbeat_handler.live(force_renew=True) - activity_loop = _activity_looper(once=once, sleep_time=sleep_time, activities=activities, heartbeat_handler=heartbeat_handler) + activity_loop = _activity_looper( + once=once, + sleep_time=sleep_time, + activities=activities, + heartbeat_handler=heartbeat_handler, + ) activity, time_to_sleep = next(activity_loop, (None, None)) while time_to_sleep is not None: if graceful_stop.is_set(): @@ -213,7 +362,9 @@ def _generator(): must_sleep = True start_time = time.time() try: - result = run_once_fnc(heartbeat_handler=heartbeat_handler, activity=activity) + result = run_once_fnc( + heartbeat_handler=heartbeat_handler, activity=activity + ) # Handle return values already existing in the code # TODO: update all existing daemons to always explicitly return (must_sleep, ret_value) @@ -229,18 +380,22 @@ def _generator(): if ret_value is not None: yield ret_value except Exception as e: - METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() + METRICS.counter("exceptions.{exception}").labels( + exception=e.__class__.__name__ + ).inc() logger(logging.CRITICAL, "Exception", exc_info=True) if once: raise try: - activity, time_to_sleep = activity_loop.send((start_time, must_sleep)) + activity, time_to_sleep = activity_loop.send( + (start_time, must_sleep) + ) except StopIteration: break if not once: - logger(logging.INFO, 'Graceful stop requested') + logger(logging.INFO, "Graceful stop requested") return _generator @@ -248,13 +403,14 @@ def _generator(): def run_daemon( - once: bool, - graceful_stop: threading.Event, - executable: str, - partition_wait_time: int, - sleep_time: int, - run_once_fnc: Callable[..., Optional[Union[bool, tuple[bool, Any]]]], - activities: Optional[list[str]] = None): + once: bool, + graceful_stop: threading.Event, + executable: str, + partition_wait_time: int, + sleep_time: int, + run_once_fnc: Callable[..., Optional[Union[bool, tuple[bool, Any]]]], + activities: Optional[list[str]] = None, +) -> None: """ Run the daemon loop and call the function run_once_fnc at each iteration """ @@ -288,11 +444,7 @@ def __init__(self, producers, consumers, graceful_stop, logger=logging.log): self.producers_done_event = threading.Event() self.logger = logger - def _produce( - self, - it: Callable[[], Iterator[T]], - wait_for_consumers: bool = False - ): + def _produce(self, it: Callable[[], Iterator[T]], wait_for_consumers: bool = False): """ Iterate over the generator function and put the extracted elements into the queue. 
@@ -314,7 +466,9 @@ def _produce( except StopIteration: break except Exception as e: - METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() + METRICS.counter("exceptions.{exception}").labels( + exception=e.__class__.__name__ + ).inc() self.logger(logging.CRITICAL, "Exception", exc_info=True) finally: with self.lock: @@ -325,10 +479,7 @@ def _produce( if wait_for_consumers: self.queue.join() - def _consume( - self, - fnc: Callable[[T], Any] - ): + def _consume(self, fnc: Callable[[T], Any]): """ Wait for elements to arrive via the queue and call the given function on each element. @@ -344,7 +495,9 @@ def _consume( try: fnc(product) except Exception as e: - METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() + METRICS.counter("exceptions.{exception}").labels( + exception=e.__class__.__name__ + ).inc() self.logger(logging.CRITICAL, "Exception", exc_info=True) finally: self.queue.task_done() @@ -355,11 +508,8 @@ def run(self): for i, producer in enumerate(self.producers): thread = threading.Thread( target=self._produce, - name=f'producer-{i}-{producer.__name__}', - kwargs={ - 'it': producer, - 'wait_for_consumers': True - } + name=f"producer-{i}-{producer.__name__}", + kwargs={"it": producer, "wait_for_consumers": True}, ) thread.start() producer_threads.append(thread) @@ -368,24 +518,28 @@ def run(self): for i, consumer in enumerate(self.consumers): thread = threading.Thread( target=self._consume, - name=f'consumer-{i}-{consumer.__name__}', + name=f"consumer-{i}-{consumer.__name__}", kwargs={ - 'fnc': consumer, - } + "fnc": consumer, + }, ) thread.start() consumer_threads.append(thread) - logging.info('waiting for interrupts') + logging.info("waiting for interrupts") while producer_threads: for thread in producer_threads: thread.join(timeout=3.14) - producer_threads = [thread for thread in producer_threads if thread.is_alive()] + producer_threads = [ + thread for thread in producer_threads if thread.is_alive() + ] self.producers_done_event.set() while consumer_threads: for thread in consumer_threads: thread.join(timeout=3.14) - consumer_threads = [thread for thread in consumer_threads if thread.is_alive()] + consumer_threads = [ + thread for thread in consumer_threads if thread.is_alive() + ] diff --git a/lib/rucio/daemons/undertaker/undertaker.py b/lib/rucio/daemons/undertaker/undertaker.py index 1db57f3201..b8b061a56d 100644 --- a/lib/rucio/daemons/undertaker/undertaker.py +++ b/lib/rucio/daemons/undertaker/undertaker.py @@ -17,7 +17,6 @@ Undertaker is a daemon to manage expired did. 
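+
+Typical invocation (a sketch mirroring bin/rucio-undertaker)::
+
+    undertaker = Undertaker(total_workers=1, chunk_size=10, once=True)
+    undertaker.run()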
''' -import functools import logging import threading import traceback @@ -25,114 +24,84 @@ from datetime import datetime, timedelta from random import randint from re import match -from typing import TYPE_CHECKING from sqlalchemy.exc import DatabaseError from rucio.db.sqla.constants import MYSQL_LOCK_NOWAIT_REGEX, ORACLE_RESOURCE_BUSY_REGEX, PSQL_LOCK_NOT_AVAILABLE_REGEX -import rucio.db.sqla.util from rucio.common.exception import DatabaseException, UnsupportedOperation, RuleNotFound -from rucio.common.logging import setup_logging from rucio.common.types import InternalAccount from rucio.common.utils import chunks from rucio.core.did import list_expired_dids, delete_dids from rucio.core.monitor import MetricManager -from rucio.daemons.common import run_daemon, HeartbeatHandler - -if TYPE_CHECKING: - from types import FrameType - from typing import Optional +from rucio.daemons.common import Daemon, HeartbeatHandler logging.getLogger("requests").setLevel(logging.CRITICAL) METRICS = MetricManager(module=__name__) graceful_stop = threading.Event() -DAEMON_NAME = 'undertaker' - - -def undertaker(once: bool = False, sleep_time: int = 60, chunk_size: int = 10): - """ - Main loop to select and delete dids. - """ - paused_dids = {} # {(scope, name): datetime} - run_daemon( - once=once, - graceful_stop=graceful_stop, - executable=DAEMON_NAME, - partition_wait_time=1, - sleep_time=sleep_time, - run_once_fnc=functools.partial( - run_once, - paused_dids=paused_dids, - chunk_size=chunk_size, - ) - ) - - -def run_once(paused_dids: dict[tuple, datetime], chunk_size: int, heartbeat_handler: HeartbeatHandler, **_kwargs): - worker_number, total_workers, logger = heartbeat_handler.live() - - try: - # Refresh paused dids - iter_paused_dids = deepcopy(paused_dids) - for key in iter_paused_dids: - if datetime.utcnow() > paused_dids[key]: - del paused_dids[key] - - dids = list_expired_dids(worker_number=worker_number, total_workers=total_workers, limit=10000) - - dids = [did for did in dids if (did['scope'], did['name']) not in paused_dids] - - if not dids: - logger(logging.INFO, 'did not get any work') - return - - for chunk in chunks(dids, chunk_size): - _, _, logger = heartbeat_handler.live() - try: - logger(logging.INFO, 'Receive %s dids to delete', len(chunk)) - delete_dids(dids=chunk, account=InternalAccount('root', vo='def'), expire_rules=True) - logger(logging.INFO, 'Delete %s dids', len(chunk)) - METRICS.counter(name='undertaker.delete_dids').inc(len(chunk)) - except RuleNotFound as error: - logger(logging.ERROR, error) - except (DatabaseException, DatabaseError, UnsupportedOperation) as e: - if match(ORACLE_RESOURCE_BUSY_REGEX, str(e.args[0])) or match(PSQL_LOCK_NOT_AVAILABLE_REGEX, str(e.args[0])) or match(MYSQL_LOCK_NOWAIT_REGEX, str(e.args[0])): - for did in chunk: - paused_dids[(did['scope'], did['name'])] = datetime.utcnow() + timedelta(seconds=randint(600, 2400)) - METRICS.counter('delete_dids.exceptions.{exception}').labels(exception='LocksDetected').inc() - logger(logging.WARNING, 'Locks detected for chunk') - else: - logger(logging.ERROR, 'Got database error %s.', str(e)) - except: - logging.critical(traceback.format_exc()) - - -def stop(signum: "Optional[int]" = None, frame: "Optional[FrameType]" = None) -> None: - """ - Graceful exit. - """ - graceful_stop.set() - - -def run(once: bool = False, total_workers: int = 1, chunk_size: int = 10, sleep_time: int = 60): - """ - Starts up the undertaker threads. 
- """ - setup_logging(process_name=DAEMON_NAME) - - if rucio.db.sqla.util.is_old_db(): - raise DatabaseException("Database was not updated, daemon won't start") - - if once: - undertaker(once) - else: - logging.info('main: starting threads') - threads = [threading.Thread(target=undertaker, kwargs={'once': once, 'chunk_size': chunk_size, - 'sleep_time': sleep_time}) for i in range(0, total_workers)] - [t.start() for t in threads] - logging.info('main: waiting for interrupts') - - # Interruptible joins require a timeout. - while threads[0].is_alive(): - [t.join(timeout=3.14) for t in threads] +DAEMON_NAME = "undertaker" + + +class Undertaker(Daemon): + def __init__(self, chunk_size: int = 10, **_kwargs) -> None: + super().__init__(daemon_name="undertaker", **_kwargs) + self.chunk_size = chunk_size + self.paused_dids = {} + + def _run_once(self, heartbeat_handler: HeartbeatHandler, **_kwargs) -> None: + worker_number, total_workers, logger = heartbeat_handler.live() + + try: + # Refresh paused dids + iter_paused_dids = deepcopy(self.paused_dids) + for key in iter_paused_dids: + if datetime.utcnow() > self.paused_dids[key]: + del self.paused_dids[key] + + dids = list_expired_dids( + worker_number=worker_number, total_workers=total_workers, limit=10000 + ) + + dids = [ + did + for did in dids + if (did["scope"], did["name"]) not in self.paused_dids + ] + + if not dids: + logger(logging.INFO, "did not get any work") + return + + for chunk in chunks(dids, self.chunk_size): + _, _, logger = heartbeat_handler.live() + try: + logger(logging.INFO, "Receive %s dids to delete", len(chunk)) + delete_dids( + dids=chunk, + account=InternalAccount("root", vo="def"), + expire_rules=True, + ) + logger(logging.INFO, "Delete %s dids", len(chunk)) + METRICS.counter(name="undertaker.delete_dids").inc(len(chunk)) + except RuleNotFound as error: + logger(logging.ERROR, error) + except (DatabaseException, DatabaseError, UnsupportedOperation) as e: + if ( + match(ORACLE_RESOURCE_BUSY_REGEX, str(e.args[0])) + or match(PSQL_LOCK_NOT_AVAILABLE_REGEX, str(e.args[0])) + or match(MYSQL_LOCK_NOWAIT_REGEX, str(e.args[0])) + ): + for did in chunk: + self.paused_dids[ + (did["scope"], did["name"]) + ] = datetime.utcnow() + timedelta( + seconds=randint(600, 2400) + ) + METRICS.counter("delete_dids.exceptions.{exception}").labels( + exception="LocksDetected" + ).inc() + logger(logging.WARNING, "Locks detected for chunk") + else: + logger(logging.ERROR, "Got database error %s.", str(e)) + except: + logging.critical(traceback.format_exc()) diff --git a/tests/test_abacus_collection_replica.py b/tests/test_abacus_collection_replica.py index b9f57c9fe8..10f30f87dc 100644 --- a/tests/test_abacus_collection_replica.py +++ b/tests/test_abacus_collection_replica.py @@ -23,7 +23,7 @@ from rucio.daemons.abacus import collection_replica from rucio.daemons.judge import cleaner from rucio.daemons.reaper import reaper -from rucio.daemons.undertaker import undertaker +from rucio.daemons.undertaker.undertaker import Undertaker from rucio.db.sqla import models, session from rucio.db.sqla.constants import DIDType, ReplicaState from rucio.tests.common import did_name_generator @@ -55,7 +55,8 @@ def test_abacus_collection_replica_cleanup(self, vo, mock_scope, rse_factory, di db_session.commit() assert len(get_cleaned_updated_collection_replicas(1, 1)) == 3 did_client.set_metadata(mock_scope.external, dataset, 'lifetime', -1) - undertaker.run(once=True) + undertaker = Undertaker(once=True) + undertaker.run() cleaner.run(once=True) if vo: 
reaper.run(once=True, include_rses='vo=%s&(%s|%s)' % (str(vo), rse1, rse2), greedy=True) diff --git a/tests/test_bb8.py b/tests/test_bb8.py index fd2ee62823..4251165394 100644 --- a/tests/test_bb8.py +++ b/tests/test_bb8.py @@ -30,7 +30,7 @@ from rucio.daemons.bb8.bb8 import run as bb8_run from rucio.daemons.judge.cleaner import rule_cleaner from rucio.daemons.judge.evaluator import re_evaluator -from rucio.daemons.undertaker import undertaker +from rucio.daemons.undertaker.undertaker import Undertaker from rucio.db.sqla.constants import RuleState from .test_rule import create_files, tag_generator @@ -92,7 +92,8 @@ def test_bb8_rebalance_rule(vo, root_account, jdoe_account, rse_factory, mock_sc rule_cleaner(once=True) assert (get_rule(child_rule)['state'] == RuleState.OK) set_metadata(mock_scope, dataset['name'], 'lifetime', -86400) - undertaker.run(once=True) + undertaker = Undertaker(once=True) + undertaker.run() @pytest.mark.noparallel(reason='uses daemons') @@ -267,4 +268,5 @@ def test_bb8_full_workflow(vo, root_account, jdoe_account, rse_factory, mock_sco for dataset in dsn: set_metadata(mock_scope, dataset, 'lifetime', -86400) - undertaker.run(once=True) + undertaker = Undertaker(once=True) + undertaker.run() diff --git a/tests/test_daemons.py b/tests/test_daemons.py index 51dfd4fe3a..a67fdab915 100644 --- a/tests/test_daemons.py +++ b/tests/test_daemons.py @@ -34,7 +34,7 @@ from rucio.daemons.replicarecoverer import suspicious_replica_recoverer from rucio.daemons.tracer import kronos from rucio.daemons.transmogrifier import transmogrifier -from rucio.daemons.undertaker import undertaker +from rucio.daemons.common import Daemon DAEMONS = [ account, @@ -66,15 +66,15 @@ suspicious_replica_recoverer, kronos, transmogrifier, - undertaker, ] ids = [mod.__name__ for mod in DAEMONS] +# This test is to be deleted once https://github.com/rucio/rucio/issues/6478 is completed @pytest.mark.parametrize('daemon', argvalues=DAEMONS, ids=ids) @mock.patch('rucio.db.sqla.util.is_old_db') -def test_fail_on_old_database(mock_is_old_db, daemon): +def test_fail_on_old_database_parametrized(mock_is_old_db, daemon): """ DAEMON: Test daemon failure on old database """ mock_is_old_db.return_value = True assert rucio.db.sqla.util.is_old_db() is True @@ -83,3 +83,19 @@ def test_fail_on_old_database(mock_is_old_db, daemon): daemon.run() assert mock_is_old_db.call_count > 1 + + +class TestDaemon(Daemon): + def _run_once(self, heartbeat_handler, **_kwargs): + pass + +@mock.patch('rucio.db.sqla.util.is_old_db') +def test_fail_on_old_database(mock_is_old_db): + """ DAEMON: Test daemon failure on old database """ + mock_is_old_db.return_value = True + assert rucio.db.sqla.util.is_old_db() is True + + with pytest.raises(exception.DatabaseException, match='Database was not updated, daemon won\'t start'): + TestDaemon().run() + + assert mock_is_old_db.call_count > 1 diff --git a/tests/test_undertaker.py b/tests/test_undertaker.py index 58f39fe7bf..bf141f6b4d 100644 --- a/tests/test_undertaker.py +++ b/tests/test_undertaker.py @@ -26,13 +26,12 @@ from rucio.core.rse import add_rse from rucio.core.rule import add_rules, list_rules from rucio.daemons.judge.cleaner import rule_cleaner -from rucio.daemons.undertaker.undertaker import undertaker +from rucio.daemons.undertaker.undertaker import Undertaker from rucio.db.sqla.util import json_implemented from rucio.tests.common import rse_name_generator, did_name_generator LOG = getLogger(__name__) - @pytest.mark.dirty @pytest.mark.noparallel(reason='uses pre-defined rses; runs 
undertaker, which impacts other tests') class TestUndertaker: @@ -75,8 +74,9 @@ def test_undertaker(self, vo, rse_factory, mock_scope, root_account, jdoe_accoun add_rules(dids=dsns1, rules=[{'account': jdoe_account, 'copies': 1, 'rse_expression': rse, 'grouping': 'DATASET'}]) - undertaker(once=True) - undertaker(once=True) + undertaker = Undertaker(once=True) + undertaker._call_daemon() + undertaker._call_daemon() for replica in replicas: assert get_replica(scope=replica['scope'], name=replica['name'], rse_id=rse_id)['tombstone'] is not None @@ -132,7 +132,9 @@ def test_atlas_archival_policy(self, vo, mock_scope, root_account, jdoe_account) attach_dids(scope=mock_scope, name=dsn['name'], rse_id=rse_id, dids=files, account=root_account) replicas += files - undertaker(once=True) + undertaker = Undertaker(once=True) + undertaker._call_daemon() + undertaker._call_daemon() for replica in replicas: assert (get_replica(scope=replica['scope'], name=replica['name'], rse_id=rse_id)['tombstone'] is None) @@ -183,8 +185,11 @@ def test_removal_all_replicas2(rse_factory, root_account, mock_scope, core_confi rule_cleaner(once=True) for replica in replicas: assert get_replica(scope=replica['scope'], name=replica['name'], rse_id=rse2_id)['tombstone'] is not None - undertaker(once=True) - undertaker(once=True) + + + undertaker = Undertaker(once=True) + undertaker._call_daemon() + undertaker._call_daemon() for replica in replicas: assert get_replica(scope=replica['scope'], name=replica['name'], rse_id=rse1_id)['tombstone'] == datetime(year=1970, month=1, day=1) From e40463586505452e5c8e972d40fb7169214e0c67 Mon Sep 17 00:00:00 2001 From: rdimaio Date: Mon, 29 Jan 2024 18:34:49 +0100 Subject: [PATCH 2/5] Daemons: Refactor Reaper to inherit from base Daemon class; #6478 --- bin/rucio-reaper | 35 +- lib/rucio/common/exception.py | 10 + lib/rucio/core/rse_expression_parser.py | 2 +- lib/rucio/daemons/common.py | 39 +- lib/rucio/daemons/reaper/reaper.py | 1538 ++++++++++------- lib/rucio/daemons/undertaker/undertaker.py | 12 +- tests/test_abacus_account.py | 8 +- tests/test_abacus_collection_replica.py | 18 +- tests/test_abacus_rse.py | 8 +- tests/test_api_external_representation.py | 8 +- tests/test_conveyor.py | 13 +- tests/test_conveyor_submitter.py | 9 +- tests/test_daemons.py | 8 +- tests/test_reaper.py | 49 +- tools/count_missing_type_annotations_utils.sh | 4 +- 15 files changed, 1057 insertions(+), 704 deletions(-) diff --git a/bin/rucio-reaper b/bin/rucio-reaper index 711fa836e2..bc6ef8c270 100755 --- a/bin/rucio-reaper +++ b/bin/rucio-reaper @@ -21,7 +21,7 @@ import argparse import signal from rucio.common.utils import StoreAndDeprecateWarningAction -from rucio.daemons.reaper.reaper import run, stop +from rucio.daemons.reaper.reaper import Reaper def get_parser(): @@ -61,24 +61,23 @@ def get_parser(): if __name__ == "__main__": - # Bind our callback to the SIGTERM signal and run the daemon: - signal.signal(signal.SIGTERM, stop) - parser = get_parser() args = parser.parse_args() + reaper = Reaper(threads=args.threads, + chunk_size=args.chunk_size, + include_rses=args.include_rses, + exclude_rses=args.exclude_rses, + rses=args.rses, + vos=args.vos, + once=args.run_once, + greedy=args.greedy, + scheme=args.scheme, + delay_seconds=args.delay_seconds, + sleep_time=args.sleep_time, + auto_exclude_threshold=args.auto_exclude_threshold, + auto_exclude_timeout=args.auto_exclude_timeout) + signal.signal(signal.SIGTERM, reaper.stop) try: - run(threads=args.threads, - chunk_size=args.chunk_size, - 
include_rses=args.include_rses, - exclude_rses=args.exclude_rses, - rses=args.rses, - vos=args.vos, - once=args.run_once, - greedy=args.greedy, - scheme=args.scheme, - delay_seconds=args.delay_seconds, - sleep_time=args.sleep_time, - auto_exclude_threshold=args.auto_exclude_threshold, - auto_exclude_timeout=args.auto_exclude_timeout) + reaper.run() except KeyboardInterrupt: - stop() + reaper.stop() diff --git a/lib/rucio/common/exception.py b/lib/rucio/common/exception.py index d975fb3b7e..2993a0d33e 100644 --- a/lib/rucio/common/exception.py +++ b/lib/rucio/common/exception.py @@ -1090,3 +1090,13 @@ def __init__(self, *args, **kwargs): super(DeprecationError, self).__init__(*args, **kwargs) self._message = 'Command or function has been deprecated.' self.error_code = 105 + + +class ReaperNoRSEsToProcess(RucioException): + """ + Reaper found no RSEs to process. + """ + def __init__(self, *args, **kwargs): + super(ReaperNoRSEsToProcess, self).__init__(*args, **kwargs) + self._message = 'Reaper: No RSEs to process found.' + self.error_code = 106 diff --git a/lib/rucio/core/rse_expression_parser.py b/lib/rucio/core/rse_expression_parser.py index 5a87d937fd..a739d4e678 100644 --- a/lib/rucio/core/rse_expression_parser.py +++ b/lib/rucio/core/rse_expression_parser.py @@ -42,7 +42,7 @@ @transactional_session -def parse_expression(expression, filter_=None, *, session: "Session"): +def parse_expression(expression: str, filter_=None, *, session: "Session"): """ Parse a RSE expression and return the list of RSE dictionaries. diff --git a/lib/rucio/daemons/common.py b/lib/rucio/daemons/common.py index 424cfa7992..d4478c8810 100644 --- a/lib/rucio/daemons/common.py +++ b/lib/rucio/daemons/common.py @@ -76,9 +76,17 @@ def _pre_run_checks() -> None: raise DatabaseException("Database was not updated, daemon won't start") @abstractmethod - def _run_once(self, heartbeat_handler: "HeartbeatHandler", **_kwargs) -> None: + def _run_once( + self, heartbeat_handler: "HeartbeatHandler", **_kwargs + ) -> tuple[bool, Any]: """ Daemon-specific logic (to be defined in child classes) for a single iteration + + :param heartbeat_handler: Handler to set and manage the heartbeat for this execution. + + :returns: Tuple of (must_sleep, ret_value). + must_sleep: set to True to signal to the calling context that it must sleep before next execution, False otherwise + ret_value: Daemon-specific return value """ pass @@ -107,20 +115,23 @@ def run(self) -> None: """ self._pre_run_checks() - if self.once: - self._call_daemon() - else: - logging.info("main: starting threads") - threads = [ - threading.Thread(target=self._call_daemon) - for _ in range(0, self.total_workers) - ] - [t.start() for t in threads] - logging.info("main: waiting for interrupts") + logging.info(f"{self.daemon_name}: starting threads") + thread_list = [ + threading.Thread(target=self._call_daemon) + for _ in range(0, self.total_workers) + ] + [t.start() for t in thread_list] - # Interruptible joins require a timeout. - while threads[0].is_alive(): - [t.join(timeout=3.14) for t in threads] + if not self.once: + logging.info(f"{self.daemon_name}: waiting for interrupts") + + # Interruptible joins require a timeout. 
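+            # (The 3.14-second timeout is arbitrary: unlike a bare join(), a
+            # timed join returns control periodically so SIGTERM can be handled.)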
+            while thread_list:
+                for thread in thread_list:
+                    thread.join(timeout=3.14)
+                thread_list = [
+                    thread for thread in thread_list if thread.is_alive()
+                ]
 
     def stop(
         self, signum: "Optional[int]" = None, frame: "Optional[FrameType]" = None
diff --git a/lib/rucio/daemons/reaper/reaper.py b/lib/rucio/daemons/reaper/reaper.py
index 5528354724..3b39941107 100644
--- a/lib/rucio/daemons/reaper/reaper.py
+++ b/lib/rucio/daemons/reaper/reaper.py
@@ -17,7 +17,6 @@
 Reaper is a daemon to manage file deletion.
 '''
 
-import functools
 import logging
 import random
 import threading
@@ -25,22 +24,21 @@
 import traceback
 from configparser import NoOptionError, NoSectionError
 from datetime import datetime, timedelta
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional, Any
 
 from dogpile.cache.api import NoValue
 from math import log2
 from sqlalchemy.exc import DatabaseError, IntegrityError
 
-import rucio.db.sqla.util
 from rucio.common.cache import make_region_memcached
 from rucio.common.config import config_get_bool, config_get_int
 from rucio.common.exception import (DatabaseException, RSENotFound,
                                     ReplicaUnAvailable, ReplicaNotFound, ServiceUnavailable,
                                     RSEAccessDenied, ResourceTemporaryUnavailable, SourceNotFound,
-                                    VONotFound, RSEProtocolNotSupported)
-from rucio.common.logging import setup_logging
+                                    VONotFound, RSEProtocolNotSupported, ReaperNoRSEsToProcess)
 from rucio.common.stopwatch import Stopwatch
 from rucio.common.utils import chunks
+from rucio.common.types import RSESettingsDict
 from rucio.core.credential import get_signed_url
 from rucio.core.heartbeat import list_payload_counts
 from rucio.core.message import add_message
@@ -52,643 +50,947 @@
 from rucio.core.rse_expression_parser import parse_expression
 from rucio.core.rule import get_evaluation_backlog
 from rucio.core.vo import list_vos
-from rucio.daemons.common import run_daemon
+from rucio.daemons.common import Daemon, HeartbeatHandler
 from rucio.rse import rsemanager as rsemgr
 
 if TYPE_CHECKING:
     from collections.abc import Callable
-    from types import FrameType
-    from typing import Any, Optional
-
-    from rucio.daemons.common import HeartbeatHandler
 
 GRACEFUL_STOP = threading.Event()
 METRICS = MetricManager(module=__name__)
 REGION = make_region_memcached(expiration_time=600)
-DAEMON_NAME = 'reaper'
-
-EXCLUDED_RSE_GAUGE = METRICS.gauge('excluded_rses.{rse}', documentation='Temporarly excluded RSEs')
-
-
-def get_rses_to_process(rses, include_rses, exclude_rses, vos):
-    """
-    Return the list of RSEs to process based on rses, include_rses and exclude_rses
-
-    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
-    :param exclude_rses: RSE expression to exclude RSEs from the Reaper.
-    :param include_rses: RSE expression to include RSEs.
-    :param vos: VOs on which to look for RSEs. Only used in multi-VO mode.
- If None, we either use all VOs if run from "def" - - :returns: A list of RSEs to process - """ - multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False) - if not multi_vo: - if vos: - logging.log(logging.WARNING, 'Ignoring argument vos, this is only applicable in a multi-VO setup.') - vos = ['def'] - else: - if vos: - invalid = set(vos) - set([v['vo'] for v in list_vos()]) - if invalid: - msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid])) - raise VONotFound(msg) - else: - vos = [v['vo'] for v in list_vos()] - logging.log(logging.INFO, 'Reaper: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos]))) - cache_key = 'rses_to_process_1%s2%s3%s' % (str(rses), str(include_rses), str(exclude_rses)) - if multi_vo: - cache_key += '@%s' % '-'.join(vo for vo in vos) - result = REGION.get(cache_key) - if not isinstance(result, NoValue): - return result +class Reaper(Daemon): + def __init__( + self, + rses: Optional[list[str]] = None, + scheme: Optional[str] = None, + exclude_rses: Optional[str] = None, + include_rses: Optional[str] = None, + chunk_size: int = 100, + greedy: bool = False, + vos: Optional[list[str]] = None, + delay_seconds: int = 0, + auto_exclude_threshold: int = 100, + auto_exclude_timeout: int = 600, + **_kwargs, + ) -> None: + """ + :param rses: List of RSEs the reaper should work against. + If empty, it considers all RSEs. + :param scheme: Force the reaper to use a particular protocol/scheme, e.g., mock. + :param exclude_rses: RSE expression to exclude RSEs from the Reaper. + :param include_rses: RSE expression to include RSEs. + :param chunk_size: The size of chunk for deletion. + :param threads_per_worker: Total number of threads created by each worker. + :param greedy: If True, delete right away replicas with tombstone. + :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. + If None, we either use all VOs if run from "def", + or the current VO otherwise. + :param delay_seconds: The delay to query replicas in BEING_DELETED state. + :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded. + :param auto_exclude_timeout: Timeout for temporarily excluded RSEs. 
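+
+        A minimal usage sketch (the RSE expression is illustrative)::
+
+            reaper = Reaper(include_rses='test=True', greedy=True, once=True)
+            reaper.run()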
+ """ + super().__init__(daemon_name="reaper", **_kwargs) + self.rses = rses + self.scheme = scheme + self.exclude_rses = exclude_rses + self.include_rses = include_rses + self.chunk_size = chunk_size + self.greedy = greedy + self.vos = vos + self.delay_seconds = delay_seconds + self.auto_exclude_threshold = auto_exclude_threshold + self.auto_exclude_timeout = auto_exclude_timeout + + def _pre_run_checks(self): + super()._pre_run_checks() + rses_to_process = self._get_rses_to_process() + if not rses_to_process: + logging.log(logging.ERROR, "Reaper: No RSEs to process found.") + raise ReaperNoRSEsToProcess("Reaper: No RSEs to process found.") + logging.log( + logging.INFO, + "Reaper: This instance will work on RSEs: %s", + ", ".join([rse["rse"] for rse in rses_to_process]), + ) + + def _run_once( + self, heartbeat_handler: "HeartbeatHandler", **_kwargs + ) -> tuple[bool, Any]: + + must_sleep = True - all_rses = [] - for vo in vos: - all_rses.extend(list_rses(filters={'vo': vo})) - - if rses: - invalid = set(rses) - set([rse['rse'] for rse in all_rses]) - if invalid: - msg = 'RSE{} {} cannot be found'.format('s' if len(invalid) > 1 else '', - ', '.join([repr(rse) for rse in invalid])) - raise RSENotFound(msg) - rses = [rse for rse in all_rses if rse['rse'] in rses] - else: - rses = all_rses - - if include_rses: - included_rses = parse_expression(include_rses) - rses = [rse for rse in rses if rse in included_rses] - - if exclude_rses: - excluded_rses = parse_expression(exclude_rses) - rses = [rse for rse in rses if rse not in excluded_rses] - - REGION.set(cache_key, rses) - logging.log(logging.INFO, 'Reaper: This instance will work on RSEs: %s', ', '.join([rse['rse'] for rse in rses])) - return rses - - -def delete_from_storage(heartbeat_handler, hb_payload, replicas, prot, rse_info, is_staging, auto_exclude_threshold, logger=logging.log): - deleted_files = [] - rse_name = rse_info['rse'] - rse_id = rse_info['id'] - noaccess_attempts = 0 - pfns_to_bulk_delete = [] - try: - prot.connect() - for replica in replicas: - # Physical deletion - _, _, logger = heartbeat_handler.live(payload=hb_payload) - stopwatch = Stopwatch() - deletion_dict = {'scope': replica['scope'].external, - 'name': replica['name'], - 'rse': rse_name, - 'file-size': replica['bytes'], - 'bytes': replica['bytes'], - 'url': replica['pfn'], - 'protocol': prot.attributes['scheme'], - 'datatype': replica['datatype']} + _, total_workers, logger = heartbeat_handler.live() + logger(logging.INFO, "Reaper started") + + # Try to get auto exclude parameters from the config table. Otherwise use CLI parameters. + # It's best to access these at every iteration, instead of in the constructor, + # as the config table can be changed at any moment by Rucio administrators. 
+ auto_exclude_threshold = config_get_int( + "reaper", + "auto_exclude_threshold", + default=self.auto_exclude_threshold, + raise_exception=False, + ) + auto_exclude_timeout = config_get_int( + "reaper", + "auto_exclude_timeout", + default=self.auto_exclude_timeout, + raise_exception=False, + ) + # Check if there is a Judge Evaluator backlog + max_evaluator_backlog_count = config_get_int( + "reaper", "max_evaluator_backlog_count", default=None, raise_exception=False + ) + max_evaluator_backlog_duration = config_get_int( + "reaper", + "max_evaluator_backlog_duration", + default=None, + raise_exception=False, + ) + if max_evaluator_backlog_count or max_evaluator_backlog_duration: + backlog = get_evaluation_backlog() + count_is_hit = ( + max_evaluator_backlog_count + and backlog[0] + and backlog[0] > max_evaluator_backlog_count + ) + duration_is_hit = ( + max_evaluator_backlog_duration + and backlog[1] + and backlog[1] + < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration) + ) + if count_is_hit and duration_is_hit: + logger( + logging.ERROR, + "Reaper: Judge evaluator backlog count and duration hit, stopping operation", + ) + return must_sleep, None + elif count_is_hit: + logger( + logging.ERROR, + "Reaper: Judge evaluator backlog count hit, stopping operation", + ) + return must_sleep, None + elif duration_is_hit: + logger( + logging.ERROR, + "Reaper: Judge evaluator backlog duration hit, stopping operation", + ) + return must_sleep, None + + rses_to_process = self._get_rses_to_process() + rses_to_process = [ + RseData(id_=rse["id"], name=rse["rse"], columns=rse) + for rse in rses_to_process + ] + if not rses_to_process: + logger(logging.ERROR, "Reaper: No RSEs found. Will sleep for 30 seconds") + return must_sleep, None + + # On big deletion campaigns, we desire to re-iterate fast on RSEs which have a lot of data to delete. + # The called function will return the RSEs which have more work remaining. + # Call the deletion routine again on this returned subset of RSEs. + # Scale the number of allowed iterations with the number of total reaper workers + iteration = 0 + max_fast_reiterations = int(log2(total_workers)) + while rses_to_process and iteration <= max_fast_reiterations: + rses_to_process = self._deletion_routine( + rses_to_process=rses_to_process, + auto_exclude_threshold=auto_exclude_threshold, + auto_exclude_timeout=auto_exclude_timeout, + heartbeat_handler=heartbeat_handler, + ) + if rses_to_process and iteration < max_fast_reiterations: + logger( + logging.INFO, + "Will perform fast-reiteration %d/%d with rses: %s", + iteration + 1, + max_fast_reiterations, + [str(rse) for rse in rses_to_process], + ) + iteration += 1 + + if rses_to_process: + # There is still more work to be performed. 
+ # Inform the calling context that it must call reaper again (on the full list of rses) + must_sleep = False + + return must_sleep, None + + def _deletion_routine( + self, + rses_to_process: list[RseData], + auto_exclude_threshold: int, + auto_exclude_timeout: int, + heartbeat_handler: "HeartbeatHandler", + ) -> Optional[list[RseData]]: + + dict_rses = {} + _, total_workers, logger = heartbeat_handler.live() + tot_needed_free_space = 0 + for rse in rses_to_process: + # Check if RSE is blocklisted + if not rse.columns["availability_delete"]: + logger(logging.DEBUG, "RSE %s is blocklisted for delete", rse.name) + continue + rse.ensure_loaded(load_attributes=True) + enable_greedy = rse.attributes.get("greedyDeletion", False) or self.greedy + needed_free_space, only_delete_obsolete = Reaper.__check_rse_usage_cached( + rse, greedy=enable_greedy, logger=logger + ) + if needed_free_space: + dict_rses[rse] = [ + needed_free_space, + only_delete_obsolete, + enable_greedy, + ] + tot_needed_free_space += needed_free_space + elif only_delete_obsolete: + dict_rses[rse] = [ + needed_free_space, + only_delete_obsolete, + enable_greedy, + ] + else: + logger(logging.DEBUG, "Nothing to delete on %s", rse.name) + + rses_with_params = [ + (rse, needed_free_space, only_delete_obsolete, enable_greedy) + for rse, ( + needed_free_space, + only_delete_obsolete, + enable_greedy, + ) in dict_rses.items() + ] + + # Ordering the RSEs based on the needed free space + sorted_rses = sorted(rses_with_params, key=lambda x: x[1], reverse=True) + log_msg_str = ", ".join( + f"{rse}:{needed_free_space}:{only_delete_obsolete}:{enable_greedy}" + for rse, needed_free_space, only_delete_obsolete, enable_greedy in sorted_rses + ) + logger( + logging.DEBUG, + "List of RSEs to process ordered by needed space desc: %s", + log_msg_str, + ) + + random.shuffle(rses_with_params) + + work_remaining_by_rse = {} + paused_rses = [] + for ( + rse, + needed_free_space, + only_delete_obsolete, + enable_greedy, + ) in rses_with_params: + result = REGION.get("pause_deletion_%s" % rse.id, expiration_time=120) + if not isinstance(result, NoValue): + paused_rses.append(rse.name) + logger( + logging.DEBUG, + "Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while", + rse.name, + ) + continue + + result = REGION.get( + "temporary_exclude_%s" % rse.id, expiration_time=auto_exclude_timeout + ) + if not isinstance(result, NoValue): + logger( + logging.WARNING, + "Too many failed attempts for %s in last cycle. RSE is temporarily excluded.", + rse.name, + ) + EXCLUDED_RSE_GAUGE.labels(rse=rse.name).set(1) + continue + EXCLUDED_RSE_GAUGE.labels(rse=rse.name).set(0) + + percent = 0 + if tot_needed_free_space: + percent = needed_free_space / tot_needed_free_space * 100 + logger( + logging.DEBUG, + "Working on %s. 
Percentage of the total space needed %.2f", + rse.name, + percent, + ) + + rse_hostname = self._rse_deletion_hostname(rse) + if not rse_hostname: + if self.scheme: + logger( + logging.WARNING, + "Protocol %s not supported on %s", + self.scheme, + rse.name, + ) + else: + logger( + logging.WARNING, "No default delete protocol for %s", rse.name + ) + REGION.set("pause_deletion_%s" % rse.id, True) + continue + + hb_payload = Reaper.__try_reserve_worker_slot( + heartbeat_handler=heartbeat_handler, rse=rse, hostname=rse_hostname + ) + if not hb_payload: + # Might need to reschedule a try on this RSE later in the same cycle + continue + + # List and mark BEING_DELETED the files to delete + del_start_time = time.time() try: - if replica['scope'].vo != 'def': - deletion_dict['vo'] = replica['scope'].vo - logger(logging.DEBUG, 'Deletion ATTEMPT of %s:%s as %s on %s', replica['scope'], replica['name'], replica['pfn'], rse_name) - # For STAGING RSEs, no physical deletion - if is_staging: - logger(logging.WARNING, 'Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion', replica['scope'], replica['name'], replica['pfn'], rse_name) - deleted_files.append({'scope': replica['scope'], 'name': replica['name']}) - continue - - if replica['pfn']: - pfn = replica['pfn'] - # sign the URL if necessary - if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None: - pfn = get_signed_url(rse_id, rse_info['sign_url'], 'delete', pfn) - if prot.attributes['scheme'] == 'globus': - pfns_to_bulk_delete.append(replica['pfn']) - else: - prot.delete(pfn) + with METRICS.timer("list_unlocked_replicas"): + if only_delete_obsolete: + logger( + logging.DEBUG, + "Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas", + rse.name, + ) + replicas = list_and_mark_unlocked_replicas( + limit=self.chunk_size, + bytes_=needed_free_space, + rse_id=rse.id, + delay_seconds=self.delay_seconds, + only_delete_obsolete=only_delete_obsolete, + session=None, + ) + logger( + logging.DEBUG, + "list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas", + rse.name, + needed_free_space, + time.time() - del_start_time, + len(replicas), + ) + if (len(replicas) == 0 and enable_greedy) or ( + len(replicas) < self.chunk_size and not enable_greedy + ): + logger( + logging.DEBUG, + "Not enough replicas to delete on %s (%s requested vs %s returned). 
Will skip any new attempts on this RSE until next cycle", + rse.name, + self.chunk_size, + len(replicas), + ) + REGION.set("pause_deletion_%s" % rse.id, True) + work_remaining_by_rse[rse] = False else: - logger(logging.WARNING, 'Deletion UNAVAILABLE of %s:%s as %s on %s', replica['scope'], replica['name'], replica['pfn'], rse_name) - - duration = stopwatch.elapsed - METRICS.timer('delete.{scheme}.{rse}').labels(scheme=prot.attributes['scheme'], rse=rse_name).observe(duration) - - deleted_files.append({'scope': replica['scope'], 'name': replica['name']}) - - deletion_dict['duration'] = duration - add_message('deletion-done', deletion_dict) - logger(logging.INFO, 'Deletion SUCCESS of %s:%s as %s on %s in %.2f seconds', replica['scope'], replica['name'], replica['pfn'], rse_name, duration) - - except SourceNotFound: - duration = stopwatch.elapsed - err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s in %.2f seconds' % (replica['scope'], replica['name'], replica['pfn'], rse_name, duration) - logger(logging.WARNING, '%s', err_msg) - deletion_dict['reason'] = 'File Not Found' - deletion_dict['duration'] = duration - add_message('deletion-not-found', deletion_dict) - deleted_files.append({'scope': replica['scope'], 'name': replica['name']}) - - except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: - duration = stopwatch.elapsed - logger(logging.WARNING, 'Deletion NOACCESS of %s:%s as %s on %s: %s in %.2f', replica['scope'], replica['name'], replica['pfn'], rse_name, str(error), duration) - deletion_dict['reason'] = str(error) - deletion_dict['duration'] = duration - add_message('deletion-failed', deletion_dict) - noaccess_attempts += 1 - if noaccess_attempts >= auto_exclude_threshold: - logger(logging.INFO, 'Too many (%d) NOACCESS attempts for %s. RSE will be temporarly excluded.', noaccess_attempts, rse_name) - REGION.set('temporary_exclude_%s' % rse_id, True) - METRICS.gauge('excluded_rses.{rse}').labels(rse=rse_name).set(1) - - EXCLUDED_RSE_GAUGE.labels(rse=rse_name).set(1) - break - - except Exception as error: - duration = stopwatch.elapsed - logger(logging.CRITICAL, 'Deletion CRITICAL of %s:%s as %s on %s in %.2f seconds : %s', replica['scope'], replica['name'], replica['pfn'], rse_name, duration, str(traceback.format_exc())) - deletion_dict['reason'] = str(error) - deletion_dict['duration'] = duration - add_message('deletion-failed', deletion_dict) - - if pfns_to_bulk_delete and prot.attributes['scheme'] == 'globus': - logger(logging.DEBUG, 'Attempting bulk delete on RSE %s for scheme %s', rse_name, prot.attributes['scheme']) - prot.bulk_delete(pfns_to_bulk_delete) - - except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: - for replica in replicas: - logger(logging.WARNING, 'Deletion NOACCESS of %s:%s as %s on %s: %s', replica['scope'], replica['name'], replica['pfn'], rse_name, str(error)) - payload = {'scope': replica['scope'].external, - 'name': replica['name'], - 'rse': rse_name, - 'file-size': replica['bytes'], - 'bytes': replica['bytes'], - 'url': replica['pfn'], - 'reason': str(error), - 'protocol': prot.attributes['scheme']} - if replica['scope'].vo != 'def': - payload['vo'] = replica['scope'].vo - add_message('deletion-failed', payload) - logger(logging.INFO, 'Cannot connect to %s. 
RSE will be temporarly excluded.', rse_name) - REGION.set('temporary_exclude_%s' % rse_id, True) - EXCLUDED_RSE_GAUGE.labels(rse=rse_name).set(1) - finally: - prot.close() - return deleted_files - - -def _rse_deletion_hostname(rse: RseData, scheme: "Optional[str]") -> "Optional[str]": - """ - Retrieves the hostname of the default deletion protocol - """ - rse.ensure_loaded(load_info=True) - for prot in rse.info['protocols']: - if scheme: - if prot['scheme'] == scheme and prot['domains']['wan']['delete'] != 0: - return prot['hostname'] + work_remaining_by_rse[rse] = True + + except (DatabaseException, IntegrityError, DatabaseError) as error: + logger(logging.ERROR, "%s", str(error)) + continue + except Exception: + logger(logging.CRITICAL, "Exception", exc_info=True) + continue + # Physical deletion will take place there + try: + rse.ensure_loaded(load_info=True, load_attributes=True) + prot = rsemgr.create_protocol( + rse.info, "delete", scheme=self.scheme, logger=logger + ) + if ( + rse.attributes.get("oidc_support") is True + and prot.attributes["scheme"] == "davs" + ): + audience = config_get( + "reaper", "oidc_audience", False + ) or determine_audience_for_rse(rse.id) + # FIXME: At the time of writing, StoRM requires `storage.read` + # in order to perform a stat operation. + scope = determine_scope_for_rse( + rse.id, scopes=["storage.modify", "storage.read"] + ) + auth_token = request_token(audience, scope) + if auth_token: + logger( + logging.INFO, "Using a token to delete on RSE %s", rse.name + ) + prot = rsemgr.create_protocol( + rse.info, + "delete", + scheme=self.scheme, + auth_token=auth_token, + logger=logger, + ) + else: + logger( + logging.WARNING, + "Failed to procure a token to delete on RSE %s", + rse.name, + ) + for file_replicas in chunks(replicas, self.chunk_size): + # Refresh heartbeat + _, total_workers, logger = heartbeat_handler.live( + payload=hb_payload + ) + del_start_time = time.time() + for replica in file_replicas: + try: + replica["pfn"] = str( + list( + rsemgr.lfns2pfns( + rse_settings=rse.info, + lfns=[ + { + "scope": replica["scope"].external, + "name": replica["name"], + "path": replica["path"], + } + ], + operation="delete", + scheme=self.scheme, + ).values() + )[0] + ) + except (ReplicaUnAvailable, ReplicaNotFound) as error: + logger( + logging.WARNING, + "Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s", + replica["scope"], + replica["name"], + rse.name, + str(error), + ) + replica["pfn"] = None + + except Exception: + logger(logging.CRITICAL, "Exception", exc_info=True) + + is_staging = rse.columns["staging_area"] + deleted_files = Reaper._delete_from_storage( + heartbeat_handler=heartbeat_handler, + hb_payload=hb_payload, + replicas=file_replicas, + prot=prot, + rse_info=rse.info, + is_staging=is_staging, + auto_exclude_threshold=auto_exclude_threshold, + ) + logger( + logging.INFO, + "%i files processed in %s seconds", + len(file_replicas), + time.time() - del_start_time, + ) + + # Then finally delete the replicas + del_start = time.time() + delete_replicas(rse_id=rse.id, files=deleted_files) + logger( + logging.DEBUG, + "delete_replicas successed on %s : %s replicas in %s seconds", + rse.name, + len(deleted_files), + time.time() - del_start, + ) + METRICS.counter("deletion.done").inc(len(deleted_files)) + except RSEProtocolNotSupported: + logger( + logging.WARNING, + "Protocol %s not supported on %s", + self.scheme, + rse.name, + ) + except Exception: + logger(logging.CRITICAL, "Exception", exc_info=True) + + if paused_rses: + logger( + 
logging.INFO, + "Deletion paused for a while for following RSEs: %s", + ", ".join(paused_rses), + ) + + rses_with_more_work = [ + rse for rse, has_more_work in work_remaining_by_rse.items() if has_more_work + ] + return rses_with_more_work + + def _get_rses_to_process(self): + """ + Return the list of RSEs to process based on rses, include_rses and exclude_rses + + :returns: A list of RSEs to process + """ + multi_vo = config_get_bool( + "common", "multi_vo", raise_exception=False, default=False + ) + if not multi_vo: + if self.vos: + logging.log( + logging.WARNING, + "Ignoring argument VOs, this is only applicable in a multi-VO setup.", + ) + self.vos = ["def"] + else: + if self.vos: + invalid = set(self.vos) - set([v["vo"] for v in list_vos()]) + if invalid: + msg = "VO{} {} cannot be found".format( + "s" if len(invalid) > 1 else "", + ", ".join([repr(v) for v in invalid]), + ) + raise VONotFound(msg) + else: + self.vos = [v["vo"] for v in list_vos()] + logging.log( + logging.INFO, + "Reaper: This instance will work on VO%s: %s" + % ("s" if len(self.vos) > 1 else "", ", ".join([v for v in self.vos])), + ) + + cache_key = "rses_to_process_1%s2%s3%s" % ( + str(self.rses), + str(self.include_rses), + str(self.exclude_rses), + ) + if multi_vo: + cache_key += "@%s" % "-".join(vo for vo in self.vos) + + result = REGION.get(cache_key) + if not isinstance(result, NoValue): + return result + + all_rses = [] + for vo in self.vos: + all_rses.extend(list_rses(filters={"vo": vo})) + + if self.rses: + invalid = set(self.rses) - set([rse["rse"] for rse in all_rses]) + if invalid: + msg = "RSE{} {} cannot be found".format( + "s" if len(invalid) > 1 else "", + ", ".join([repr(rse) for rse in invalid]), + ) + raise RSENotFound(msg) + rses = [rse for rse in all_rses if rse["rse"] in self.rses] else: - if prot['domains']['wan']['delete'] == 1: - return prot['hostname'] - return None + rses = all_rses + if self.include_rses: + included_rses = parse_expression(self.include_rses) + rses = [rse for rse in rses if rse in included_rses] -def get_max_deletion_threads_by_hostname(hostname: str) -> int: - """ - Internal method to check RSE usage and limits. + if self.exclude_rses: + excluded_rses = parse_expression(self.exclude_rses) + rses = [rse for rse in rses if rse not in excluded_rses] - :param hostname: the hostname of the SE + REGION.set(cache_key, rses) + logging.log( + logging.INFO, + "Reaper: This instance will work on RSEs: %s", + ", ".join([rse["rse"] for rse in rses]), + ) + return rses + + def _rse_deletion_hostname(self, rse: RseData) -> "Optional[str]": + """ + Retrieves the hostname of the default deletion protocol + """ + rse.ensure_loaded(load_info=True) + for prot in rse.info["protocols"]: + if self.scheme: + if ( + prot["scheme"] == self.scheme + and prot["domains"]["wan"]["delete"] != 0 + ): + return prot["hostname"] + else: + if prot["domains"]["wan"]["delete"] == 1: + return prot["hostname"] + return None - :returns: The maximum deletion thread for the SE. 
- """ - result = REGION.get('max_deletion_threads_%s' % hostname) - if isinstance(result, NoValue): + @staticmethod + def _delete_from_storage( + heartbeat_handler: "HeartbeatHandler", + hb_payload: Optional[str], + replicas: list[dict], + prot: Any, # TODO: define type for protocol, currently does not exist + rse_info: RSESettingsDict, + is_staging: bool, + auto_exclude_threshold: int, + ) -> list[dict]: + deleted_files = [] + rse_name = rse_info["rse"] + rse_id = rse_info["id"] + noaccess_attempts = 0 + pfns_to_bulk_delete = [] try: - max_deletion_thread = config_get_int('reaper', 'max_deletion_threads_%s' % hostname) - except (NoOptionError, NoSectionError, RuntimeError): + prot.connect() + for replica in replicas: + # Physical deletion + _, _, logger = heartbeat_handler.live(payload=hb_payload) + stopwatch = Stopwatch() + deletion_dict = { + "scope": replica["scope"].external, + "name": replica["name"], + "rse": rse_name, + "file-size": replica["bytes"], + "bytes": replica["bytes"], + "url": replica["pfn"], + "protocol": prot.attributes["scheme"], + "datatype": replica["datatype"], + } + try: + if replica["scope"].vo != "def": + deletion_dict["vo"] = replica["scope"].vo + logger( + logging.DEBUG, + "Deletion ATTEMPT of %s:%s as %s on %s", + replica["scope"], + replica["name"], + replica["pfn"], + rse_name, + ) + # For STAGING RSEs, no physical deletion + if is_staging: + logger( + logging.WARNING, + "Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion", + replica["scope"], + replica["name"], + replica["pfn"], + rse_name, + ) + deleted_files.append( + {"scope": replica["scope"], "name": replica["name"]} + ) + continue + + if replica["pfn"]: + pfn = replica["pfn"] + # sign the URL if necessary + if ( + prot.attributes["scheme"] == "https" + and rse_info["sign_url"] is not None + ): + pfn = get_signed_url( + rse_id, rse_info["sign_url"], "delete", pfn + ) + if prot.attributes["scheme"] == "globus": + pfns_to_bulk_delete.append(replica["pfn"]) + else: + prot.delete(pfn) + else: + logger( + logging.WARNING, + "Deletion UNAVAILABLE of %s:%s as %s on %s", + replica["scope"], + replica["name"], + replica["pfn"], + rse_name, + ) + + duration = stopwatch.elapsed + METRICS.timer("delete.{scheme}.{rse}").labels( + scheme=prot.attributes["scheme"], rse=rse_name + ).observe(duration) + + deleted_files.append( + {"scope": replica["scope"], "name": replica["name"]} + ) + + deletion_dict["duration"] = duration + add_message("deletion-done", deletion_dict) + logger( + logging.INFO, + "Deletion SUCCESS of %s:%s as %s on %s in %.2f seconds", + replica["scope"], + replica["name"], + replica["pfn"], + rse_name, + duration, + ) + + except SourceNotFound: + duration = stopwatch.elapsed + err_msg = ( + "Deletion NOTFOUND of %s:%s as %s on %s in %.2f seconds" + % ( + replica["scope"], + replica["name"], + replica["pfn"], + rse_name, + duration, + ) + ) + logger(logging.WARNING, "%s", err_msg) + deletion_dict["reason"] = "File Not Found" + deletion_dict["duration"] = duration + add_message("deletion-not-found", deletion_dict) + deleted_files.append( + {"scope": replica["scope"], "name": replica["name"]} + ) + + except ( + ServiceUnavailable, + RSEAccessDenied, + ResourceTemporaryUnavailable, + ) as error: + duration = stopwatch.elapsed + logger( + logging.WARNING, + "Deletion NOACCESS of %s:%s as %s on %s: %s in %.2f", + replica["scope"], + replica["name"], + replica["pfn"], + rse_name, + str(error), + duration, + ) + deletion_dict["reason"] = str(error) + 
deletion_dict["duration"] = duration + add_message("deletion-failed", deletion_dict) + noaccess_attempts += 1 + if noaccess_attempts >= auto_exclude_threshold: + logger( + logging.INFO, + "Too many (%d) NOACCESS attempts for %s. RSE will be temporarily excluded.", + noaccess_attempts, + rse_name, + ) + REGION.set("temporary_exclude_%s" % rse_id, True) + METRICS.gauge("excluded_rses.{rse}").labels(rse=rse_name).set(1) + + EXCLUDED_RSE_GAUGE.labels(rse=rse_name).set(1) + break + + except Exception as error: + duration = stopwatch.elapsed + logger( + logging.CRITICAL, + "Deletion CRITICAL of %s:%s as %s on %s in %.2f seconds : %s", + replica["scope"], + replica["name"], + replica["pfn"], + rse_name, + duration, + str(traceback.format_exc()), + ) + deletion_dict["reason"] = str(error) + deletion_dict["duration"] = duration + add_message("deletion-failed", deletion_dict) + + if pfns_to_bulk_delete and prot.attributes["scheme"] == "globus": + logger( + logging.DEBUG, + "Attempting bulk delete on RSE %s for scheme %s", + rse_name, + prot.attributes["scheme"], + ) + prot.bulk_delete(pfns_to_bulk_delete) + + except ( + ServiceUnavailable, + RSEAccessDenied, + ResourceTemporaryUnavailable, + ) as error: + for replica in replicas: + logger( + logging.WARNING, + "Deletion NOACCESS of %s:%s as %s on %s: %s", + replica["scope"], + replica["name"], + replica["pfn"], + rse_name, + str(error), + ) + payload = { + "scope": replica["scope"].external, + "name": replica["name"], + "rse": rse_name, + "file-size": replica["bytes"], + "bytes": replica["bytes"], + "url": replica["pfn"], + "reason": str(error), + "protocol": prot.attributes["scheme"], + } + if replica["scope"].vo != "def": + payload["vo"] = replica["scope"].vo + add_message("deletion-failed", payload) + logger( + logging.INFO, + "Cannot connect to %s. RSE will be temporarily excluded.", + rse_name, + ) + REGION.set("temporary_exclude_%s" % rse_id, True) + EXCLUDED_RSE_GAUGE.labels(rse=rse_name).set(1) + finally: + prot.close() + return deleted_files + + @staticmethod + def _get_max_deletion_threads_by_hostname(hostname: str) -> int: + """ + Internal method to check RSE usage and limits. + + :param hostname: the hostname of the SE + + :returns: The maximum deletion thread for the SE. + """ + result = REGION.get("max_deletion_threads_%s" % hostname) + if isinstance(result, NoValue): try: - max_deletion_thread = config_get_int('reaper', 'nb_workers_by_hostname') + max_deletion_thread = config_get_int( + "reaper", "max_deletion_threads_%s" % hostname + ) except (NoOptionError, NoSectionError, RuntimeError): - max_deletion_thread = 5 - REGION.set('max_deletion_threads_%s' % hostname, max_deletion_thread) - result = max_deletion_thread - return result - - -def __try_reserve_worker_slot(heartbeat_handler: "HeartbeatHandler", rse: RseData, hostname: str, logger: "Callable[..., Any]") -> "Optional[str]": - """ - The maximum number of concurrent workers is limited per hostname and per RSE due to storage performance reasons. - This function tries to reserve a slot to run the deletion worker for the given RSE and hostname. - - The function doesn't guarantee strong consistency: the number of total workers may end being slightly - higher than the configured limit. - - The reservation is done using the "payload" field of the rucio heart-beats. - if reservation successful, returns the heartbeat payload used for the reservation. 
Otherwise, returns None - """ - - rse_hostname_key = '%s,%s' % (rse.id, hostname) - payload_cnt = list_payload_counts(heartbeat_handler.executable, older_than=heartbeat_handler.older_than) - tot_threads_for_hostname = 0 - tot_threads_for_rse = 0 - for key in payload_cnt: - if key and key.find(',') > -1: - if key.split(',')[1] == hostname: - tot_threads_for_hostname += payload_cnt[key] - if key.split(',')[0] == str(rse.id): - tot_threads_for_rse += payload_cnt[key] - max_deletion_thread = get_max_deletion_threads_by_hostname(hostname) - if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread: - logger(logging.DEBUG, 'Too many deletion threads for %s on RSE %s. Back off', hostname, rse.name) - return None - logger(logging.INFO, 'Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s', hostname, tot_threads_for_hostname, max_deletion_thread, rse.name) - _, total_workers, logger = heartbeat_handler.live(payload=rse_hostname_key) - logger(logging.DEBUG, 'Total deletion workers for %s : %i', hostname, tot_threads_for_hostname + 1) - return rse_hostname_key - - -def __check_rse_usage_cached(rse: RseData, greedy: bool = False, logger: "Callable[..., Any]" = logging.log) -> tuple[int, bool]: - """ - Wrapper around __check_rse_usage which manages the cache entry. - """ - cache_key = 'rse_usage_%s' % rse.id - result = REGION.get(cache_key) - if isinstance(result, NoValue): - result = __check_rse_usage(rse=rse, greedy=greedy, logger=logger) - REGION.set(cache_key, result) - return result - - -def __check_rse_usage(rse: RseData, greedy: bool = False, logger: "Callable[..., Any]" = logging.log) -> tuple[int, bool]: - """ - Internal method to check RSE usage and limits. - - :param rse_name: The RSE name. - :param rse_id: The RSE id. - :param greedy: If True, needed_free_space will be set to 1TB regardless of actual rse usage. - - :returns: needed_free_space, only_delete_obsolete. - """ - - needed_free_space = 0 - # First of all check if greedy mode is enabled for this RSE - if greedy: - return 1000000000000, False - - rse.ensure_loaded(load_limits=True, load_usage=True, load_attributes=True) - available_sources = {} - available_sources['total'] = {key['source']: key['total'] for key in rse.usage} - available_sources['used'] = {key['source']: key['used'] for key in rse.usage} - - # Get RSE limits - min_free_space = rse.limits.get('MinFreeSpace', 0) - - # Check from which sources to get used and total spaces (default storage) - # If specified sources do not exist, only delete obsolete - source_for_total_space = rse.attributes.get('source_for_total_space', 'storage') - if source_for_total_space not in available_sources['total']: - logger(logging.WARNING, 'RSE: %s, \'%s\' requested for source_for_total_space but cannot be found. Will only delete obsolete', - rse.name, source_for_total_space) - return 0, True - source_for_used_space = rse.attributes.get('source_for_used_space', 'storage') - if source_for_used_space not in available_sources['used']: - logger(logging.WARNING, 'RSE: %s, \'%s\' requested for source_for_used_space but cannot be found. 
Will only delete obsolete', - rse.name, source_for_used_space) - return 0, True - - logger(logging.DEBUG, 'RSE: %s, source_for_total_space: %s, source_for_used_space: %s', - rse.name, source_for_total_space, source_for_used_space) - - # Get total and used space - total = available_sources['total'][source_for_total_space] - used = available_sources['used'][source_for_used_space] - - free = total - used - if min_free_space: - needed_free_space = min_free_space - free - - # If needed_free_space negative, nothing to delete except if some Epoch tombstoned replicas - if needed_free_space > 0: - return needed_free_space, False - - return 0, True - - -def reaper(rses, include_rses, exclude_rses, vos=None, chunk_size=100, once=False, greedy=False, - scheme=None, delay_seconds=0, sleep_time=60, auto_exclude_threshold=100, auto_exclude_timeout=600): - """ - Main loop to select and delete files. - - :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. - :param include_rses: RSE expression to include RSEs. - :param exclude_rses: RSE expression to exclude RSEs from the Reaper. - :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. - If None, we either use all VOs if run from "def", or the current VO otherwise. - :param chunk_size: The size of chunk for deletion. - :param once: If True, only runs one iteration of the main loop. - :param greedy: If True, delete right away replicas with tombstone. - :param scheme: Force the reaper to use a particular protocol, e.g., mock. - :param delay_seconds: The delay to query replicas in BEING_DELETED state. - :param sleep_time: Time between two cycles. - :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded. - :param auto_exclude_timeout: Timeout for temporarily excluded RSEs. - """ - run_daemon( - once=once, - graceful_stop=GRACEFUL_STOP, - executable=DAEMON_NAME, - partition_wait_time=0 if once else 10, - sleep_time=sleep_time, - run_once_fnc=functools.partial( - run_once, - rses=rses, - include_rses=include_rses, - exclude_rses=exclude_rses, - vos=vos, - chunk_size=chunk_size, - greedy=greedy, - scheme=scheme, - delay_seconds=delay_seconds, - auto_exclude_threshold=auto_exclude_threshold, - auto_exclude_timeout=auto_exclude_timeout, + try: + max_deletion_thread = config_get_int( + "reaper", "nb_workers_by_hostname" + ) + except (NoOptionError, NoSectionError, RuntimeError): + max_deletion_thread = 5 + REGION.set("max_deletion_threads_%s" % hostname, max_deletion_thread) + result = max_deletion_thread + return result + + @staticmethod + def __try_reserve_worker_slot( + heartbeat_handler: "HeartbeatHandler", rse: RseData, hostname: str + ) -> "Optional[str]": + """ + The maximum number of concurrent workers is limited per hostname and per RSE due to storage performance reasons. + This function tries to reserve a slot to run the deletion worker for the given RSE and hostname. + + The function doesn't guarantee strong consistency: the number of total workers may end being slightly + higher than the configured limit. + + The reservation is done using the "payload" field of the rucio heart-beats. + if reservation successful, returns the heartbeat payload used for the reservation. 
Otherwise, returns None + """ + + rse_hostname_key = "%s,%s" % (rse.id, hostname) + _, total_workers, logger = heartbeat_handler.live(payload=rse_hostname_key) + payload_cnt = list_payload_counts( + heartbeat_handler.executable, older_than=heartbeat_handler.older_than ) - ) - - -def run_once(rses, include_rses, exclude_rses, vos, chunk_size, greedy, scheme, - delay_seconds, auto_exclude_threshold, auto_exclude_timeout, - heartbeat_handler, **_kwargs): - - must_sleep = True - - _, total_workers, logger = heartbeat_handler.live() - logger(logging.INFO, 'Reaper started') - - # try to get auto exclude parameters from the config table. Otherwise use CLI parameters. - auto_exclude_threshold = config_get_int('reaper', 'auto_exclude_threshold', default=auto_exclude_threshold, raise_exception=False) - auto_exclude_timeout = config_get_int('reaper', 'auto_exclude_timeout', default=auto_exclude_timeout, raise_exception=False) - # Check if there is a Judge Evaluator backlog - max_evaluator_backlog_count = config_get_int('reaper', 'max_evaluator_backlog_count', default=None, raise_exception=False) - max_evaluator_backlog_duration = config_get_int('reaper', 'max_evaluator_backlog_duration', default=None, raise_exception=False) - if max_evaluator_backlog_count or max_evaluator_backlog_duration: - backlog = get_evaluation_backlog() - count_is_hit = max_evaluator_backlog_count and backlog[0] and backlog[0] > max_evaluator_backlog_count - duration_is_hit = max_evaluator_backlog_duration and backlog[1] and backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration) - if count_is_hit and duration_is_hit: - logger(logging.ERROR, 'Reaper: Judge evaluator backlog count and duration hit, stopping operation') - return must_sleep - elif count_is_hit: - logger(logging.ERROR, 'Reaper: Judge evaluator backlog count hit, stopping operation') - return must_sleep - elif duration_is_hit: - logger(logging.ERROR, 'Reaper: Judge evaluator backlog duration hit, stopping operation') - return must_sleep - - rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses, vos) - rses_to_process = [RseData(id_=rse['id'], name=rse['rse'], columns=rse) for rse in rses_to_process] - if not rses_to_process: - logger(logging.ERROR, 'Reaper: No RSEs found. Will sleep for 30 seconds') - return must_sleep - - # On big deletion campaigns, we desire to re-iterate fast on RSEs which have a lot of data to delete. - # The called function will return the RSEs which have more work remaining. - # Call the deletion routine again on this returned subset of RSEs. 
- # Scale the number of allowed iterations with the number of total reaper workers - iteration = 0 - max_fast_reiterations = int(log2(total_workers)) - while rses_to_process and iteration <= max_fast_reiterations: - rses_to_process = _run_once( - rses_to_process=rses_to_process, - chunk_size=chunk_size, - greedy=greedy, - scheme=scheme, - delay_seconds=delay_seconds, - auto_exclude_threshold=auto_exclude_threshold, - auto_exclude_timeout=auto_exclude_timeout, - heartbeat_handler=heartbeat_handler, + tot_threads_for_hostname = 0 + tot_threads_for_rse = 0 + for key in payload_cnt: + if key and key.find(",") > -1: + if key.split(",")[1] == hostname: + tot_threads_for_hostname += payload_cnt[key] + if key.split(",")[0] == str(rse.id): + tot_threads_for_rse += payload_cnt[key] + max_deletion_thread = Reaper._get_max_deletion_threads_by_hostname(hostname) + if ( + rse_hostname_key in payload_cnt + and tot_threads_for_hostname >= max_deletion_thread + ): + logger( + logging.DEBUG, + "Too many deletion threads for %s on RSE %s. Back off", + hostname, + rse.name, + ) + return None + logger( + logging.INFO, + "Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s", + hostname, + tot_threads_for_hostname, + max_deletion_thread, + rse.name, ) - if rses_to_process and iteration < max_fast_reiterations: - logger(logging.INFO, "Will perform fast-reiteration %d/%d with rses: %s", iteration + 1, max_fast_reiterations, [str(rse) for rse in rses_to_process]) - iteration += 1 - - if rses_to_process: - # There is still more work to be performed. - # Inform the calling context that it must call reaper again (on the full list of rses) - must_sleep = False - - return must_sleep - - -def _run_once(rses_to_process, chunk_size, greedy, scheme, - delay_seconds, auto_exclude_threshold, auto_exclude_timeout, - heartbeat_handler, **_kwargs): - - dict_rses = {} - _, total_workers, logger = heartbeat_handler.live() - tot_needed_free_space = 0 - for rse in rses_to_process: - # Check if RSE is blocklisted - if not rse.columns['availability_delete']: - logger(logging.DEBUG, 'RSE %s is blocklisted for delete', rse.name) - continue - rse.ensure_loaded(load_attributes=True) - enable_greedy = rse.attributes.get('greedyDeletion', False) or greedy - needed_free_space, only_delete_obsolete = __check_rse_usage_cached(rse, greedy=enable_greedy, logger=logger) - if needed_free_space: - dict_rses[rse] = [needed_free_space, only_delete_obsolete, enable_greedy] - tot_needed_free_space += needed_free_space - elif only_delete_obsolete: - dict_rses[rse] = [needed_free_space, only_delete_obsolete, enable_greedy] - else: - logger(logging.DEBUG, 'Nothing to delete on %s', rse.name) - - rses_with_params = [(rse, needed_free_space, only_delete_obsolete, enable_greedy) - for rse, (needed_free_space, only_delete_obsolete, enable_greedy) in dict_rses.items()] - - # Ordering the RSEs based on the needed free space - sorted_rses = sorted(rses_with_params, key=lambda x: x[1], reverse=True) - log_msg_str = ', '.join(f'{rse}:{needed_free_space}:{only_delete_obsolete}:{enable_greedy}' - for rse, needed_free_space, only_delete_obsolete, enable_greedy in sorted_rses) - logger(logging.DEBUG, 'List of RSEs to process ordered by needed space desc: %s', log_msg_str) - - random.shuffle(rses_with_params) + logger( + logging.DEBUG, + "Total deletion workers for %s : %i", + hostname, + tot_threads_for_hostname + 1, + ) + return rse_hostname_key + + @staticmethod + def __check_rse_usage_cached( + rse: RseData, + greedy: bool 
= False, + logger: "Callable[..., Any]" = logging.log, + ) -> tuple[int, bool]: + """ + Wrapper around __check_rse_usage which manages the cache entry. + """ + cache_key = "rse_usage_%s" % rse.id + result = REGION.get(cache_key) + if isinstance(result, NoValue): + result = Reaper.__check_rse_usage(rse=rse, greedy=greedy, logger=logger) + REGION.set(cache_key, result) + return result - work_remaining_by_rse = {} - paused_rses = [] - for rse, needed_free_space, only_delete_obsolete, enable_greedy in rses_with_params: - result = REGION.get('pause_deletion_%s' % rse.id, expiration_time=120) - if not isinstance(result, NoValue): - paused_rses.append(rse.name) - logger(logging.DEBUG, 'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while', rse.name) - continue + @staticmethod + def __check_rse_usage( + rse: RseData, greedy: bool = False, logger: "Callable[..., Any]" = logging.log + ) -> tuple[int, bool]: + """ + Internal method to check RSE usage and limits. + + :param rse_name: The RSE name. + :param rse_id: The RSE id. + :param greedy: If True, needed_free_space will be set to 1TB regardless of actual rse usage. + + :returns: needed_free_space, only_delete_obsolete. + """ + + needed_free_space = 0 + # First of all check if greedy mode is enabled for this RSE + if greedy: + return 1000000000000, False + + rse.ensure_loaded(load_limits=True, load_usage=True, load_attributes=True) + available_sources = {} + available_sources["total"] = {key["source"]: key["total"] for key in rse.usage} + available_sources["used"] = {key["source"]: key["used"] for key in rse.usage} + + # Get RSE limits + min_free_space = rse.limits.get("MinFreeSpace", 0) + + # Check from which sources to get used and total spaces (default storage) + # If specified sources do not exist, only delete obsolete + source_for_total_space = rse.attributes.get("source_for_total_space", "storage") + if source_for_total_space not in available_sources["total"]: + logger( + logging.WARNING, + "RSE: %s, '%s' requested for source_for_total_space but cannot be found. Will only delete obsolete", + rse.name, + source_for_total_space, + ) + return 0, True + source_for_used_space = rse.attributes.get("source_for_used_space", "storage") + if source_for_used_space not in available_sources["used"]: + logger( + logging.WARNING, + "RSE: %s, '%s' requested for source_for_used_space but cannot be found. Will only delete obsolete", + rse.name, + source_for_used_space, + ) + return 0, True + + logger( + logging.DEBUG, + "RSE: %s, source_for_total_space: %s, source_for_used_space: %s", + rse.name, + source_for_total_space, + source_for_used_space, + ) - result = REGION.get('temporary_exclude_%s' % rse.id, expiration_time=auto_exclude_timeout) - if not isinstance(result, NoValue): - logger(logging.WARNING, 'Too many failed attempts for %s in last cycle. RSE is temporarly excluded.', rse.name) - EXCLUDED_RSE_GAUGE.labels(rse=rse.name).set(1) - continue - EXCLUDED_RSE_GAUGE.labels(rse=rse.name).set(0) - - percent = 0 - if tot_needed_free_space: - percent = needed_free_space / tot_needed_free_space * 100 - logger(logging.DEBUG, 'Working on %s. 
Percentage of the total space needed %.2f', rse.name, percent) - - rse_hostname = _rse_deletion_hostname(rse, scheme) - if not rse_hostname: - if scheme: - logger(logging.WARNING, 'Protocol %s not supported on %s', scheme, rse.name) - else: - logger(logging.WARNING, 'No default delete protocol for %s', rse.name) - REGION.set('pause_deletion_%s' % rse.id, True) - continue + # Get total and used space + total = available_sources["total"][source_for_total_space] + used = available_sources["used"][source_for_used_space] - hb_payload = __try_reserve_worker_slot(heartbeat_handler=heartbeat_handler, rse=rse, hostname=rse_hostname, logger=logger) - if not hb_payload: - # Might need to reschedule a try on this RSE later in the same cycle - continue + free = total - used + if min_free_space: + needed_free_space = min_free_space - free - # List and mark BEING_DELETED the files to delete - del_start_time = time.time() - try: - with METRICS.timer('list_unlocked_replicas'): - if only_delete_obsolete: - logger(logging.DEBUG, 'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas', rse.name) - replicas = list_and_mark_unlocked_replicas(limit=chunk_size, - bytes_=needed_free_space, - rse_id=rse.id, - delay_seconds=delay_seconds, - only_delete_obsolete=only_delete_obsolete, - session=None) - logger(logging.DEBUG, 'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', rse.name, needed_free_space, time.time() - del_start_time, len(replicas)) - if (len(replicas) == 0 and enable_greedy) or (len(replicas) < chunk_size and not enable_greedy): - logger(logging.DEBUG, 'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle', rse.name, chunk_size, len(replicas)) - REGION.set('pause_deletion_%s' % rse.id, True) - work_remaining_by_rse[rse] = False - else: - work_remaining_by_rse[rse] = True - - except (DatabaseException, IntegrityError, DatabaseError) as error: - logger(logging.ERROR, '%s', str(error)) - continue - except Exception: - logger(logging.CRITICAL, 'Exception', exc_info=True) - continue - # Physical deletion will take place there - try: - rse.ensure_loaded(load_info=True, load_attributes=True) - prot = rsemgr.create_protocol(rse.info, 'delete', scheme=scheme, logger=logger) - if rse.attributes.get('oidc_support') is True and prot.attributes['scheme'] == 'davs': - audience = determine_audience_for_rse(rse.id) - # FIXME: At the time of writing, StoRM requires `storage.read` - # in order to perform a stat operation. 
- scope = determine_scope_for_rse(rse.id, scopes=['storage.modify', 'storage.read']) - auth_token = request_token(audience, scope) - if auth_token: - logger(logging.INFO, 'Using a token to delete on RSE %s', rse.name) - prot = rsemgr.create_protocol(rse.info, 'delete', scheme=scheme, auth_token=auth_token, logger=logger) - else: - logger(logging.WARNING, 'Failed to procure a token to delete on RSE %s', rse.name) - for file_replicas in chunks(replicas, chunk_size): - # Refresh heartbeat - _, total_workers, logger = heartbeat_handler.live(payload=hb_payload) - del_start_time = time.time() - for replica in file_replicas: - try: - replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse.info, - lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}], - operation='delete', scheme=scheme).values())[0]) - except (ReplicaUnAvailable, ReplicaNotFound) as error: - logger(logging.WARNING, 'Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s', replica['scope'], replica['name'], rse.name, str(error)) - replica['pfn'] = None - - except Exception: - logger(logging.CRITICAL, 'Exception', exc_info=True) - - is_staging = rse.columns['staging_area'] - deleted_files = delete_from_storage(heartbeat_handler, hb_payload, file_replicas, prot, rse.info, is_staging, auto_exclude_threshold, logger=logger) - logger(logging.INFO, '%i files processed in %s seconds', len(file_replicas), time.time() - del_start_time) - - # Then finally delete the replicas - del_start = time.time() - delete_replicas(rse_id=rse.id, files=deleted_files) - logger(logging.DEBUG, 'delete_replicas successed on %s : %s replicas in %s seconds', rse.name, len(deleted_files), time.time() - del_start) - METRICS.counter('deletion.done').inc(len(deleted_files)) - except RSEProtocolNotSupported: - logger(logging.WARNING, 'Protocol %s not supported on %s', scheme, rse.name) - except Exception: - logger(logging.CRITICAL, 'Exception', exc_info=True) - - if paused_rses: - logger(logging.INFO, 'Deletion paused for a while for following RSEs: %s', ', '.join(paused_rses)) - - rses_with_more_work = [rse for rse, has_more_work in work_remaining_by_rse.items() if has_more_work] - return rses_with_more_work - - -def stop(signum: "Optional[int]" = None, frame: "Optional[FrameType]" = None) -> None: - """ - Graceful exit. - """ - GRACEFUL_STOP.set() - - -def run(threads=1, chunk_size=100, once=False, greedy=False, rses=None, scheme=None, exclude_rses=None, include_rses=None, vos=None, delay_seconds=0, sleep_time=60, auto_exclude_threshold=100, auto_exclude_timeout=600): - """ - Starts up the reaper threads. - - :param threads: The total number of workers. - :param chunk_size: The size of chunk for deletion. - :param threads_per_worker: Total number of threads created by each worker. - :param once: If True, only runs one iteration of the main loop. - :param greedy: If True, delete right away replicas with tombstone. - :param rses: List of RSEs the reaper should work against. - If empty, it considers all RSEs. - :param scheme: Force the reaper to use a particular protocol/scheme, e.g., mock. - :param exclude_rses: RSE expression to exclude RSEs from the Reaper. - :param include_rses: RSE expression to include RSEs. - :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. - If None, we either use all VOs if run from "def", - or the current VO otherwise. - :param delay_seconds: The delay to query replicas in BEING_DELETED state. - :param sleep_time: Time between two cycles. 
- :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded. - :param auto_exclude_timeout: Timeout for temporarily excluded RSEs. - """ - setup_logging(process_name=DAEMON_NAME) - - if rucio.db.sqla.util.is_old_db(): - raise DatabaseException('Database was not updated, daemon won\'t start') - - logging.log(logging.INFO, 'main: starting processes') - rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses, vos) - if not rses_to_process: - logging.log(logging.ERROR, 'Reaper: No RSEs found. Exiting.') - return - - logging.log(logging.INFO, 'Reaper: This instance will work on RSEs: %s', ', '.join([rse['rse'] for rse in rses_to_process])) - - logging.log(logging.INFO, 'starting reaper threads') - threads_list = [threading.Thread(target=reaper, kwargs={'once': once, - 'rses': rses, - 'include_rses': include_rses, - 'exclude_rses': exclude_rses, - 'vos': vos, - 'chunk_size': chunk_size, - 'greedy': greedy, - 'sleep_time': sleep_time, - 'delay_seconds': delay_seconds, - 'scheme': scheme, - 'auto_exclude_threshold': auto_exclude_threshold, - 'auto_exclude_timeout': auto_exclude_timeout}) for _ in range(0, threads)] - - for thread in threads_list: - thread.start() - - logging.log(logging.INFO, 'waiting for interrupts') - - # Interruptible joins require a timeout. - while threads_list: - threads_list = [thread.join(timeout=3.14) for thread in threads_list if thread and thread.is_alive()] + # If needed_free_space negative, nothing to delete except if some Epoch tombstoned replicas + if needed_free_space > 0: + return needed_free_space, False \ No newline at end of file diff --git a/lib/rucio/daemons/undertaker/undertaker.py b/lib/rucio/daemons/undertaker/undertaker.py index b8b061a56d..6b47f19fb2 100644 --- a/lib/rucio/daemons/undertaker/undertaker.py +++ b/lib/rucio/daemons/undertaker/undertaker.py @@ -24,6 +24,7 @@ from datetime import datetime, timedelta from random import randint from re import match +from typing import Any from sqlalchemy.exc import DatabaseError from rucio.db.sqla.constants import MYSQL_LOCK_NOWAIT_REGEX, ORACLE_RESOURCE_BUSY_REGEX, PSQL_LOCK_NOT_AVAILABLE_REGEX @@ -39,18 +40,22 @@ METRICS = MetricManager(module=__name__) graceful_stop = threading.Event() -DAEMON_NAME = "undertaker" class Undertaker(Daemon): def __init__(self, chunk_size: int = 10, **_kwargs) -> None: + """ + :param chunk_size: Size of each chunk of DIDs to process + """ super().__init__(daemon_name="undertaker", **_kwargs) self.chunk_size = chunk_size self.paused_dids = {} - def _run_once(self, heartbeat_handler: HeartbeatHandler, **_kwargs) -> None: + def _run_once(self, heartbeat_handler: "HeartbeatHandler", **_kwargs) -> tuple[bool, Any]: worker_number, total_workers, logger = heartbeat_handler.live() + must_sleep = False + try: # Refresh paused dids iter_paused_dids = deepcopy(self.paused_dids) @@ -70,7 +75,7 @@ def _run_once(self, heartbeat_handler: HeartbeatHandler, **_kwargs) -> None: if not dids: logger(logging.INFO, "did not get any work") - return + return must_sleep, None for chunk in chunks(dids, self.chunk_size): _, _, logger = heartbeat_handler.live() @@ -105,3 +110,4 @@ def _run_once(self, heartbeat_handler: HeartbeatHandler, **_kwargs) -> None: logger(logging.ERROR, "Got database error %s.", str(e)) except: logging.critical(traceback.format_exc()) + return must_sleep, None diff --git a/tests/test_abacus_account.py b/tests/test_abacus_account.py index f9ab722e8f..5dd29d06e3 100644 --- a/tests/test_abacus_account.py +++ 
b/tests/test_abacus_account.py @@ -22,7 +22,7 @@ from rucio.core.account_limit import get_local_account_usage, set_local_account_limit from rucio.daemons.abacus.account import account_update from rucio.daemons.judge import cleaner -from rucio.daemons.reaper import reaper +from rucio.daemons.reaper.reaper import Reaper from rucio.db.sqla import models from rucio.db.sqla.session import get_session @@ -71,6 +71,8 @@ def test_abacus_account(self, vo, root_account, mock_scope, rse_factory, did_fac assert account_usages['files'] == 0 if vo: - reaper.run(once=True, include_rses='vo=%s&(%s)' % (str(vo), rse), greedy=True) + reaper = Reaper(once=True, include_rses='vo=%s&(%s)' % (str(vo), rse), greedy=True) + reaper.run() else: - reaper.run(once=True, include_rses=rse, greedy=True) + reaper = Reaper(once=True, include_rses='vo=%s&(%s)' % (str(vo), rse), greedy=True) + reaper.run() diff --git a/tests/test_abacus_collection_replica.py b/tests/test_abacus_collection_replica.py index 10f30f87dc..5083891ebd 100644 --- a/tests/test_abacus_collection_replica.py +++ b/tests/test_abacus_collection_replica.py @@ -22,7 +22,8 @@ from rucio.core.replica import delete_replicas, get_cleaned_updated_collection_replicas from rucio.daemons.abacus import collection_replica from rucio.daemons.judge import cleaner -from rucio.daemons.reaper import reaper +import rucio.daemons.reaper.reaper +from rucio.daemons.reaper.reaper import Reaper from rucio.daemons.undertaker.undertaker import Undertaker from rucio.db.sqla import models, session from rucio.db.sqla.constants import DIDType, ReplicaState @@ -59,9 +60,12 @@ def test_abacus_collection_replica_cleanup(self, vo, mock_scope, rse_factory, di undertaker.run() cleaner.run(once=True) if vo: - reaper.run(once=True, include_rses='vo=%s&(%s|%s)' % (str(vo), rse1, rse2), greedy=True) + + reaper = Reaper(once=True, include_rses='vo=%s&(%s|%s)' % (str(vo), rse1, rse2), greedy=True) + reaper.run() else: - reaper.run(once=True, include_rses='(%s|%s)' % (rse1, rse2), greedy=True) + reaper = Reaper(once=True, include_rses='(%s|%s)' % (rse1, rse2), greedy=True) + reaper.run() def test_abacus_collection_replica(self, vo, mock_scope, rse_factory, did_factory, rucio_client): """ ABACUS (COLLECTION REPLICA): Test update of collection replica. 
""" @@ -111,11 +115,13 @@ def test_abacus_collection_replica(self, vo, mock_scope, rse_factory, did_factor # Delete all files -> collection replica should be deleted # Old behaviour (doesn't delete the DID) cleaner.run(once=True) - reaper.REGION.invalidate() + rucio.daemons.reaper.reaper.REGION.invalidate() if vo: - reaper.run(once=True, include_rses='vo=%s&(%s)' % (str(vo), rse), greedy=True) + reaper = Reaper(once=True, include_rses='vo=%s&(%s)' % (str(vo), rse), greedy=True) + reaper.run() else: - reaper.run(once=True, include_rses=rse, greedy=True) + reaper = Reaper(once=True, include_rses=rse, greedy=True) + reaper.run() activity = get_schema_value('ACTIVITY')['enum'][0] rucio_client.add_replication_rule([{'scope': mock_scope.external, 'name': dataset}], 1, rse, lifetime=-1, activity=activity) collection_replica.run(once=True) diff --git a/tests/test_abacus_rse.py b/tests/test_abacus_rse.py index b1e276fccb..af2757d86c 100644 --- a/tests/test_abacus_rse.py +++ b/tests/test_abacus_rse.py @@ -19,7 +19,7 @@ from rucio.core.rse import get_rse_usage from rucio.daemons.abacus.rse import rse_update from rucio.daemons.judge import cleaner -from rucio.daemons.reaper import reaper +from rucio.daemons.reaper.reaper import Reaper from rucio.db.sqla import models from rucio.db.sqla.session import get_session @@ -57,9 +57,11 @@ def test_abacus_rse(self, vo, mock_scope, rse_factory, did_factory, rucio_client rucio_client.add_replication_rule([{'scope': mock_scope.external, 'name': dataset}], 1, rse, lifetime=-1, activity=activity) cleaner.run(once=True) if vo: - reaper.run(once=True, include_rses='vo=%s&(%s)' % (str(vo), rse), greedy=True) + reaper = Reaper(once=True, include_rses='vo=%s&(%s)' % (str(vo), rse), greedy=True) + reaper.run() else: - reaper.run(once=True, include_rses=rse, greedy=True) + reaper = Reaper(once=True, include_rses=rse, greedy=True) + reaper.run() rse_update(once=True) rse_usage = get_rse_usage(rse_id=rse_id)[0] assert rse_usage['used'] == 0 diff --git a/tests/test_api_external_representation.py b/tests/test_api_external_representation.py index 4fa6e23e3f..24f7370c2a 100644 --- a/tests/test_api_external_representation.py +++ b/tests/test_api_external_representation.py @@ -39,7 +39,7 @@ from rucio.core.vo import add_vo, vo_exists from rucio.daemons.abacus import rse as abacus_rse from rucio.daemons.judge import cleaner -from rucio.daemons.reaper import reaper +from rucio.daemons.reaper.reaper import Reaper from rucio.db.sqla import constants from rucio.tests.common import rse_name_generator, did_name_generator @@ -398,9 +398,11 @@ def test_api_rse(self, vo, rse_factory, vo2, account, account_name, scope_name): # clean up files cleaner.run(once=True) if vo2: - reaper.run(once=True, include_rses='vo=%s&(%s)' % (vo, rse_mock), greedy=True) + reaper = Reaper(once=True, include_rses='vo=%s&(%s)' % (vo, rse_mock), greedy=True) + reaper.run() else: - reaper.run(once=True, include_rses=rse_mock, greedy=True) + reaper = Reaper(once=True, include_rses=rse_mock, greedy=True) + reaper.run() abacus_rse.run(once=True) out = api_rse.parse_rse_expression(f'{rse1}|{rse2}', vo=vo) diff --git a/tests/test_conveyor.py b/tests/test_conveyor.py index 7cb1f396ba..744808ee85 100644 --- a/tests/test_conveyor.py +++ b/tests/test_conveyor.py @@ -44,7 +44,7 @@ from rucio.daemons.conveyor.stager import stager from rucio.daemons.conveyor.throttler import throttler from rucio.daemons.conveyor.receiver import receiver, GRACEFUL_STOP as receiver_graceful_stop, Receiver -from rucio.daemons.reaper.reaper 
import reaper +from rucio.daemons.reaper.reaper import Reaper from rucio.db.sqla import models from rucio.db.sqla.constants import LockState, RequestState, RequestType, ReplicaState, RSEType, RuleState from rucio.db.sqla.session import read_session, transactional_session @@ -215,7 +215,8 @@ def __fake_source_ranking(*, session=None): # The intermediate replica is protected by its state (Copying) rucio.daemons.reaper.reaper.REGION.invalidate() - reaper(once=True, rses=[], include_rses=jump_rse_name, exclude_rses=None) + reaper = Reaper(once=True, rses=[], include_rses=jump_rse_name, exclude_rses=None) + reaper._call_daemon() replica = replica_core.get_replica(rse_id=jump_rse_id, **did) assert replica['state'] == ReplicaState.COPYING @@ -237,7 +238,8 @@ def __fake_source_ranking(*, session=None): assert replica['state'] == ReplicaState.AVAILABLE rucio.daemons.reaper.reaper.REGION.invalidate() - reaper(once=True, rses=[], include_rses='test_container_xrd=True', exclude_rses=None) + reaper2 = Reaper(once=True, rses=[], include_rses='test_container_xrd=True', exclude_rses=None) + reaper2._call_daemon() with pytest.raises(ReplicaNotFound): replica_core.get_replica(rse_id=jump_rse_id, **did) @@ -1498,7 +1500,8 @@ def on_submit(file): # One of the intermediate replicas is eligible for deletion. Others are blocked by entries in source table reaper_cache_region.invalidate() - reaper(once=True, rses=[], include_rses='|'.join([rse2, rse3, rse4, rse6]), exclude_rses=None) + reaper = Reaper(once=True, rses=[], include_rses='|'.join([rse2, rse3, rse4, rse6]), exclude_rses=None) + reaper._call_daemon() with pytest.raises(ReplicaNotFound): replica_core.get_replica(rse_id=rse_id_second_to_last_submit, **did) for rse_id in [rse2_id, rse3_id, rse_id_second_to_last_queued]: @@ -1512,7 +1515,7 @@ def on_submit(file): # All intermediate replicas can be deleted reaper_cache_region.invalidate() - reaper(once=True, rses=[], include_rses='|'.join([rse2, rse3, rse4, rse6]), exclude_rses=None) + reaper._call_daemon() for rse_id in [rse2_id, rse3_id, rse4_id, rse6_id]: with pytest.raises(ReplicaNotFound): replica_core.get_replica(rse_id=rse_id, **did) diff --git a/tests/test_conveyor_submitter.py b/tests/test_conveyor_submitter.py index 03187eccdf..06e1c9584c 100644 --- a/tests/test_conveyor_submitter.py +++ b/tests/test_conveyor_submitter.py @@ -29,7 +29,7 @@ from rucio.core import rule as rule_core from rucio.core import config as core_config from rucio.daemons.conveyor.submitter import submitter -from rucio.daemons.reaper.reaper import reaper +from rucio.daemons.reaper.reaper import Reaper from rucio.db.sqla.models import Request, Source from rucio.db.sqla.constants import RequestState from rucio.db.sqla.session import read_session, transactional_session @@ -218,7 +218,8 @@ def test_source_avoid_deletion(caches_mock, rse_factory, did_factory, root_accou # Reaper will not delete a file which only has one replica if there is any pending transfer for it reaper_region.invalidate() - reaper(once=True, rses=[], include_rses=any_source, exclude_rses=None) + reaper = Reaper(once=True, rses=[], include_rses=any_source, exclude_rses=None) + reaper._call_daemon() replica = next(iter(replica_core.list_replicas(dids=[did], rse_expression=any_source))) assert len(replica['pfns']) == 1 @@ -232,7 +233,7 @@ def test_source_avoid_deletion(caches_mock, rse_factory, did_factory, root_accou # None of the replicas will be removed. 
They are protected by an entry in the sources table reaper_region.invalidate() - reaper(once=True, rses=[], include_rses=any_source, exclude_rses=None) + reaper._call_daemon() replica = next(iter(replica_core.list_replicas(dids=[did], rse_expression=any_source))) assert len(replica['pfns']) == 2 @@ -247,7 +248,7 @@ def __delete_sources(rse_id, scope, name, *, session=None): __delete_sources(src_rse1_id, **did) __delete_sources(src_rse2_id, **did) reaper_region.invalidate() - reaper(once=True, rses=[], include_rses=any_source, exclude_rses=None) + reaper._call_daemon() replica = next(iter(replica_core.list_replicas(dids=[did], rse_expression=any_source))) assert len(replica['pfns']) == 1 diff --git a/tests/test_daemons.py b/tests/test_daemons.py index a67fdab915..e3178cbe78 100644 --- a/tests/test_daemons.py +++ b/tests/test_daemons.py @@ -30,7 +30,7 @@ from rucio.daemons.hermes import hermes from rucio.daemons.judge import cleaner, evaluator, injector, repairer from rucio.daemons.oauthmanager import oauthmanager -from rucio.daemons.reaper import dark_reaper, reaper +from rucio.daemons.reaper import dark_reaper from rucio.daemons.replicarecoverer import suspicious_replica_recoverer from rucio.daemons.tracer import kronos from rucio.daemons.transmogrifier import transmogrifier @@ -62,7 +62,6 @@ repairer, oauthmanager, dark_reaper, - reaper, suspicious_replica_recoverer, kronos, transmogrifier, @@ -85,10 +84,11 @@ def test_fail_on_old_database_parametrized(mock_is_old_db, daemon): assert mock_is_old_db.call_count > 1 -class TestDaemon(Daemon): +class DaemonTest(Daemon): def _run_once(self, heartbeat_handler, **_kwargs): pass + @mock.patch('rucio.db.sqla.util.is_old_db') def test_fail_on_old_database(mock_is_old_db): """ DAEMON: Test daemon failure on old database """ @@ -96,6 +96,6 @@ def test_fail_on_old_database(mock_is_old_db): assert rucio.db.sqla.util.is_old_db() is True with pytest.raises(exception.DatabaseException, match='Database was not updated, daemon won\'t start'): - TestDaemon().run() + DaemonTest().run() assert mock_is_old_db.call_count > 1 diff --git a/tests/test_reaper.py b/tests/test_reaper.py index a3674eeb79..0d5b0ad5f6 100644 --- a/tests/test_reaper.py +++ b/tests/test_reaper.py @@ -31,9 +31,8 @@ from rucio.core import replica as replica_core from rucio.core import rse as rse_core from rucio.core import rule as rule_core -from rucio.daemons.reaper.reaper import reaper +from rucio.daemons.reaper.reaper import Reaper from rucio.daemons.reaper.dark_reaper import reaper as dark_reaper -from rucio.daemons.reaper.reaper import run as run_reaper from rucio.db.sqla.models import ConstituentAssociationHistory from rucio.db.sqla.session import read_session from rucio.tests.common import rse_name_generator, skip_rse_tests_with_accounts @@ -90,14 +89,15 @@ def test_reaper(vo, caches_mock, message_mock): # Check first if the reaper does not delete anything if no space is needed cache_region.invalidate() rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=323000000000) - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None) + reaper = Reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None) + reaper._call_daemon() assert len(list(replica_core.list_replicas(dids=dids, rse_expression=rse_name))) == nb_files # Now put it over threshold and delete cache_region.invalidate() rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=1) - reaper(once=True, rses=[], include_rses=rse_name, 
exclude_rses=None) - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None) + reaper._call_daemon() + reaper._call_daemon() assert len(list(replica_core.list_replicas(dids, rse_expression=rse_name))) == 200 msgs = message_core.retrieve_messages() @@ -128,14 +128,15 @@ def test_reaper_bulk_delete(vo, caches_mock): # Check first if the reaper does not delete anything if no space is needed cache_region.invalidate() rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=323000000000) - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, chunk_size=1000, scheme='MOCK') + reaper = Reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, chunk_size=1000, scheme='MOCK') + reaper._call_daemon() assert len(list(replica_core.list_replicas(dids=dids, rse_expression=rse_name))) == nb_files # Now put it over threshold and delete cache_region.invalidate() rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=1) - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, chunk_size=1000, scheme='MOCK') - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, chunk_size=1000, scheme='MOCK') + reaper._call_daemon() + reaper._call_daemon() assert len(list(replica_core.list_replicas(dids, rse_expression=rse_name))) == 200 @@ -167,7 +168,8 @@ def test_reaper_multi_vo_via_run(vo, second_vo, scope_factory, caches_mock): # Check we reap all VOs by default cache_region.invalidate() - run_reaper(once=True, rses=[rse_name]) + reaper = Reaper(once=True, rses=[rse_name]) + reaper.run() assert len(list(replica_api.list_replicas([{'scope': scope_name, 'name': n} for n in names], rse_expression=rse_name, vo=vo))) == 25 assert len(list(replica_api.list_replicas([{'scope': scope_name, 'name': n} for n in names], rse_expression=rse_name, vo=new_vo))) == 25 @@ -200,7 +202,8 @@ def test_reaper_affect_other_vo_via_run(vo, second_vo, scope_factory, caches_moc # Check we don't affect a second VO that isn't specified cache_region.invalidate() - run_reaper(once=True, rses=[rse_name], vos=['new']) + reaper = Reaper(once=True, rses=[rse_name], vos=['new']) + reaper.run() assert len(list(replica_api.list_replicas([{'scope': scope_name, 'name': n} for n in names], rse_expression=rse_name, vo=vo))) == nb_files assert len(list(replica_api.list_replicas([{'scope': scope_name, 'name': n} for n in names], rse_expression=rse_name, vo=new_vo))) == 25 @@ -229,8 +232,9 @@ def test_reaper_multi_vo(vo, second_vo, scope_factory, caches_mock): rse_core.set_rse_usage(rse_id=rse1_id, source='storage', used=nb_files * file_size, free=1) rse_core.set_rse_usage(rse_id=rse2_id, source='storage', used=nb_files * file_size, free=1) both_rses = '%s|%s' % (rse1_name, rse2_name) - reaper(once=True, rses=[], include_rses=both_rses, exclude_rses=None) - reaper(once=True, rses=[], include_rses=both_rses, exclude_rses=None) + reaper = Reaper(once=True, rses=[], include_rses=both_rses, exclude_rses=None) + reaper._call_daemon() + reaper._call_daemon() assert len(list(replica_core.list_replicas(dids=dids1, rse_expression=both_rses))) == 200 assert len(list(replica_core.list_replicas(dids=dids2, rse_expression=both_rses))) == 200 @@ -295,7 +299,8 @@ def __get_archive_contents_history_count(archive, *, session=None): cache_region.invalidate() rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=2 * archive_size + nb_c_outside_archive * constituent_size) rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=2 * 
archive_size + nb_c_outside_archive * constituent_size, free=1) - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None) + reaper = Reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None) + reaper._call_daemon() for did in constituents + [archive1, archive2]: assert did_core.get_did(**did) for did in [archive1, archive2, c_with_replica, c_with_replica_and_rule]: @@ -322,7 +327,7 @@ def __get_archive_contents_history_count(archive, *, session=None): cache_region.invalidate() rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=2 * archive_size + nb_c_outside_archive * constituent_size) rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=2 * archive_size + nb_c_outside_archive * constituent_size, free=1) - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None) + reaper._call_daemon() with pytest.raises(DataIdentifierNotFound): assert did_core.get_did(**archive1) with pytest.raises(DataIdentifierNotFound): @@ -343,7 +348,7 @@ def __get_archive_contents_history_count(archive, *, session=None): cache_region.invalidate() rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=archive_size + nb_c_outside_archive * constituent_size) rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=archive_size + nb_c_outside_archive * constituent_size, free=1) - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None) + reaper._call_daemon() # The archive must be removed with pytest.raises(DataIdentifierNotFound): assert did_core.get_did(**archive2) @@ -394,7 +399,8 @@ def test_archive_of_deleted_dids(vo, did_factory, root_account, core_config_mock reaper_cache_region.invalidate() rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=323000000000) - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, greedy=True) + reaper = Reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, greedy=True) + reaper._call_daemon() assert len(list(replica_core.list_replicas(dids=dids, rse_expression=rse_name))) == 0 file_clause = [] @@ -446,7 +452,8 @@ def test_run_on_non_existing_scheme(vo, caches_mock): # The reaper will set a flag pause_deletion_ cache_region.invalidate() rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=1) - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, chunk_size=1000, scheme='https') + reaper = Reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, chunk_size=1000, scheme='https') + reaper._call_daemon() assert len(list(replica_core.list_replicas(dids, rse_expression=rse_name))) == 250 assert cache_region.get('pause_deletion_%s' % rse_id) @@ -470,7 +477,8 @@ def test_reaper_without_rse_usage(vo, caches_mock): # Check first if the reaper does not delete anything if there's nothing obsolete cache_region.invalidate() - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, chunk_size=1000, scheme='MOCK') + reaper = Reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, chunk_size=1000, scheme='MOCK') + reaper._call_daemon() assert len(list(replica_core.list_replicas(dids=dids, rse_expression=rse_name))) == nb_files # Now set Epoch tombstone for a few replicas @@ -480,7 +488,7 @@ def test_reaper_without_rse_usage(vo, caches_mock): # The reaper should delete the replica with Epoch tombstone even if the rse_usage is not set cache_region.invalidate() - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, chunk_size=1000, scheme='MOCK') + 
reaper._call_daemon() assert len(list(replica_core.list_replicas(dids, rse_expression=rse_name))) == nb_files - nb_epoch_tombstone @@ -499,4 +507,5 @@ def test_deletion_with_tokens(vo, did_factory, root_account, caches_mock, file_c for rule in list(rule_core.list_associated_rules_for_file(**did)): rule_core.delete_rule(rule['id']) - reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, chunk_size=1000, greedy=True, scheme='davs') + reaper = Reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, chunk_size=1000, greedy=True, scheme='davs') + reaper._call_daemon() diff --git a/tools/count_missing_type_annotations_utils.sh b/tools/count_missing_type_annotations_utils.sh index 3dea91b802..a10e0e185e 100755 --- a/tools/count_missing_type_annotations_utils.sh +++ b/tools/count_missing_type_annotations_utils.sh @@ -15,14 +15,14 @@ # limitations under the License. # Script with all tools to count the missing python type annotations in the -# project. Installes all necessary python packages temporarly if needed. To use +# project. Installs all necessary python packages temporarily if needed. To use # it run: `scource count_missing_annotations_utils.sh`. set -e ensure_install() { - # Checks if a python package is installed via pip. It installes the package, + # Checks if a python package is installed via pip. It installs the package, # and removes it after the script run automatically. # # All debug output is redirected to the stderr stream, to not interfear with From fa57fcf53862bf8e7a6647b2a374e94fa8a40133 Mon Sep 17 00:00:00 2001 From: rdimaio Date: Thu, 1 Feb 2024 14:37:57 +0100 Subject: [PATCH 3/5] Daemons: Refactor Judge Repairer to inherit from Daemon base class; #6478 --- bin/rucio-judge-repairer | 9 +- lib/rucio/daemons/judge/repairer.py | 167 +++++++-------------- lib/rucio/daemons/reaper/reaper.py | 3 +- lib/rucio/daemons/undertaker/undertaker.py | 2 - tests/test_daemons.py | 3 +- tests/test_judge_repairer.py | 48 +++--- 6 files changed, 92 insertions(+), 140 deletions(-) diff --git a/bin/rucio-judge-repairer b/bin/rucio-judge-repairer index 8eeef90730..7066eb9f0d 100755 --- a/bin/rucio-judge-repairer +++ b/bin/rucio-judge-repairer @@ -19,9 +19,8 @@ Judge-Repairer is a daemon to repair stuck replication rules. """ import argparse -import signal -from rucio.daemons.judge.repairer import run, stop +from rucio.daemons.judge.repairer import JudgeRepairer def get_parser(): @@ -36,10 +35,10 @@ def get_parser(): if __name__ == "__main__": - signal.signal(signal.SIGTERM, stop) parser = get_parser() args = parser.parse_args() + judge_repairer = JudgeRepairer(once=args.run_once, threads=args.threads, sleep_time=args.sleep_time) try: - run(once=args.run_once, threads=args.threads, sleep_time=args.sleep_time) + judge_repairer.run() except KeyboardInterrupt: - stop() + judge_repairer.stop() diff --git a/lib/rucio/daemons/judge/repairer.py b/lib/rucio/daemons/judge/repairer.py index b797f202b0..64db8f91bb 100644 --- a/lib/rucio/daemons/judge/repairer.py +++ b/lib/rucio/daemons/judge/repairer.py @@ -16,130 +16,79 @@ """ Judge-Repairer is a daemon to repair stuck replication rules. 
""" -import functools import logging -import threading import time import traceback from copy import deepcopy from datetime import datetime, timedelta from random import randint from re import match -from typing import TYPE_CHECKING +from typing import Any from rucio.db.sqla.constants import ORACLE_CONNECTION_LOST_CONTACT_REGEX, ORACLE_RESOURCE_BUSY_REGEX from sqlalchemy.exc import DatabaseError -import rucio.db.sqla.util -from rucio.common import exception from rucio.common.exception import DatabaseException -from rucio.common.logging import setup_logging from rucio.core.monitor import MetricManager from rucio.core.rule import repair_rule, get_stuck_rules -from rucio.daemons.common import run_daemon - -if TYPE_CHECKING: - from types import FrameType - from typing import Optional +from rucio.daemons.common import Daemon, HeartbeatHandler METRICS = MetricManager(module=__name__) -graceful_stop = threading.Event() -DAEMON_NAME = 'judge-repairer' - - -def rule_repairer(once=False, sleep_time=60): - """ - Main loop to check for STUCK replication rules - """ - paused_rules = {} # {rule_id: datetime} - run_daemon( - once=once, - graceful_stop=graceful_stop, - executable=DAEMON_NAME, - partition_wait_time=1, - sleep_time=sleep_time, - run_once_fnc=functools.partial( - run_once, - paused_rules=paused_rules, - delta=-1 if once else 1800, - ) - ) - - -def run_once(paused_rules, delta, heartbeat_handler, **_kwargs): - worker_number, total_workers, logger = heartbeat_handler.live() - - start = time.time() - - # Refresh paused rules - iter_paused_rules = deepcopy(paused_rules) - for key in iter_paused_rules: - if datetime.utcnow() > paused_rules[key]: - del paused_rules[key] - - # Select a bunch of rules for this worker to repair - rules = get_stuck_rules(total_workers=total_workers, - worker_number=worker_number, - delta=delta, - limit=100, - blocked_rules=[key for key in paused_rules]) - - logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules))) - - if not rules: - logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % (str(len(paused_rules)))) - return - - for rule_id in rules: - _, _, logger = heartbeat_handler.live() - rule_id = rule_id[0] - logger(logging.INFO, 'Repairing rule %s' % (rule_id)) - if graceful_stop.is_set(): - break - try: - start = time.time() - repair_rule(rule_id=rule_id) - logger(logging.DEBUG, 'repairing of %s took %f' % (rule_id, time.time() - start)) - except (DatabaseException, DatabaseError) as e: - if match(ORACLE_RESOURCE_BUSY_REGEX, str(e.args[0])): - paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(600, 2400)) - logger(logging.WARNING, 'Locks detected for %s' % (rule_id)) - METRICS.counter('exceptions.{exception}').labels(exception='LocksDetected').inc() - elif match('.*QueuePool.*', str(e.args[0])): - logger(logging.WARNING, traceback.format_exc()) - METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() - elif match(ORACLE_CONNECTION_LOST_CONTACT_REGEX, str(e.args[0])): - logger(logging.WARNING, traceback.format_exc()) - METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() - else: - logger(logging.ERROR, traceback.format_exc()) - METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() - - -def stop(signum: "Optional[int]" = None, frame: "Optional[FrameType]" = None) -> None: - """ - Graceful exit. 
- """ - - graceful_stop.set() - - -def run(once=False, threads=1, sleep_time=60): - """ - Starts up the Judge-Repairer threads. - """ - setup_logging(process_name=DAEMON_NAME) - if rucio.db.sqla.util.is_old_db(): - raise exception.DatabaseException('Database was not updated, daemon won\'t start') - if once: - rule_repairer(once) - else: - logging.info('Repairer starting %s threads' % str(threads)) - threads = [threading.Thread(target=rule_repairer, kwargs={'once': once, - 'sleep_time': sleep_time}) for i in range(0, threads)] - [t.start() for t in threads] - # Interruptible joins require a timeout. - while threads[0].is_alive(): - [t.join(timeout=3.14) for t in threads] +class JudgeRepairer(Daemon): + def __init__(self, **_kwargs) -> None: + super().__init__(daemon_name="judge-repairer", **_kwargs) + self.delta = -1 if self.once else 1800 + self.paused_rules = {} # {rule_id: datetime} + + def _run_once(self, heartbeat_handler: "HeartbeatHandler", **_kwargs) -> tuple[bool, Any]: + worker_number, total_workers, logger = heartbeat_handler.live() + must_sleep = False + + start = time.time() + + # Refresh paused rules + iter_paused_rules = deepcopy(self.paused_rules) + for key in iter_paused_rules: + if datetime.utcnow() > self.paused_rules[key]: + del self.paused_rules[key] + + # Select a bunch of rules for this worker to repair + rules = get_stuck_rules(total_workers=total_workers, + worker_number=worker_number, + delta=self.delta, + limit=100, + blocked_rules=[key for key in self.paused_rules]) + + logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules))) + + if not rules: + logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % (str(len(self.paused_rules)))) + return must_sleep, None + + for rule_id in rules: + _, _, logger = heartbeat_handler.live() + rule_id = rule_id[0] + logger(logging.INFO, 'Repairing rule %s' % (rule_id)) + if self.graceful_stop.is_set(): + break + try: + start = time.time() + repair_rule(rule_id=rule_id) + logger(logging.DEBUG, 'repairing of %s took %f' % (rule_id, time.time() - start)) + except (DatabaseException, DatabaseError) as e: + if match(ORACLE_RESOURCE_BUSY_REGEX, str(e.args[0])): + self.paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(600, 2400)) + logger(logging.WARNING, 'Locks detected for %s' % (rule_id)) + METRICS.counter('exceptions.{exception}').labels(exception='LocksDetected').inc() + elif match('.*QueuePool.*', str(e.args[0])): + logger(logging.WARNING, traceback.format_exc()) + METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() + elif match(ORACLE_CONNECTION_LOST_CONTACT_REGEX, str(e.args[0])): + logger(logging.WARNING, traceback.format_exc()) + METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() + else: + logger(logging.ERROR, traceback.format_exc()) + METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() + return must_sleep, None diff --git a/lib/rucio/daemons/reaper/reaper.py b/lib/rucio/daemons/reaper/reaper.py index 3b39941107..63df427088 100644 --- a/lib/rucio/daemons/reaper/reaper.py +++ b/lib/rucio/daemons/reaper/reaper.py @@ -19,7 +19,6 @@ import logging import random -import threading import time import traceback from configparser import NoOptionError, NoSectionError @@ -56,7 +55,7 @@ if TYPE_CHECKING: from collections.abc import Callable -GRACEFUL_STOP = threading.Event() + METRICS = MetricManager(module=__name__) REGION = 
make_region_memcached(expiration_time=600) diff --git a/lib/rucio/daemons/undertaker/undertaker.py b/lib/rucio/daemons/undertaker/undertaker.py index 6b47f19fb2..cd754719f9 100644 --- a/lib/rucio/daemons/undertaker/undertaker.py +++ b/lib/rucio/daemons/undertaker/undertaker.py @@ -18,7 +18,6 @@ ''' import logging -import threading import traceback from copy import deepcopy from datetime import datetime, timedelta @@ -39,7 +38,6 @@ logging.getLogger("requests").setLevel(logging.CRITICAL) METRICS = MetricManager(module=__name__) -graceful_stop = threading.Event() class Undertaker(Daemon): diff --git a/tests/test_daemons.py b/tests/test_daemons.py index e3178cbe78..370bb923c6 100644 --- a/tests/test_daemons.py +++ b/tests/test_daemons.py @@ -28,7 +28,7 @@ from rucio.daemons.conveyor import finisher, poller, receiver, stager, submitter, throttler, preparer from rucio.daemons.follower import follower from rucio.daemons.hermes import hermes -from rucio.daemons.judge import cleaner, evaluator, injector, repairer +from rucio.daemons.judge import cleaner, evaluator, injector from rucio.daemons.oauthmanager import oauthmanager from rucio.daemons.reaper import dark_reaper from rucio.daemons.replicarecoverer import suspicious_replica_recoverer @@ -59,7 +59,6 @@ cleaner, evaluator, injector, - repairer, oauthmanager, dark_reaper, suspicious_replica_recoverer, diff --git a/tests/test_judge_repairer.py b/tests/test_judge_repairer.py index 446072993e..38ff8a41fa 100644 --- a/tests/test_judge_repairer.py +++ b/tests/test_judge_repairer.py @@ -31,7 +31,7 @@ from rucio.core.rse import add_rse_attribute, add_rse, update_rse from rucio.core.rule import get_rule, add_rule from rucio.daemons.judge.evaluator import re_evaluator -from rucio.daemons.judge.repairer import rule_repairer +from rucio.daemons.judge.repairer import JudgeRepairer from rucio.db.sqla import models from rucio.db.sqla.constants import DIDType, RuleState, ReplicaState from rucio.db.sqla.session import get_session @@ -92,8 +92,7 @@ def setUpClass(cls): def test_to_repair_a_rule_with_NONE_grouping_whose_transfer_failed(self): """ JUDGE REPAIRER: Test to repair a rule with 1 failed transfer (lock)""" - - rule_repairer(once=True) # Clean out the repairer + JudgeRepairer(once=True).run() # Clean out the repairer scope = InternalScope('mock', **self.vo) files = create_files(3, scope, self.rse4_id, bytes_=100) dataset = did_name_generator('dataset') @@ -112,7 +111,7 @@ def test_to_repair_a_rule_with_NONE_grouping_whose_transfer_failed(self): assert (rule_id == get_rule(rule_id)['id'].replace('-', '').lower()) assert (RuleState.STUCK == get_rule(rule_id)['state']) - rule_repairer(once=True) + JudgeRepairer(once=True).run() assert (RuleState.REPLICATING == get_rule(rule_id)['state']) assert (get_replica(scope=files[2]['scope'], name=files[2]['name'], rse_id=failed_rse_id)['state'] == ReplicaState.UNAVAILABLE) assert (get_replica(scope=files[2]['scope'], name=files[2]['name'], rse_id=failed_rse_id)['lock_cnt'] == 0) @@ -120,7 +119,7 @@ def test_to_repair_a_rule_with_NONE_grouping_whose_transfer_failed(self): def test_to_repair_a_rule_with_ALL_grouping_whose_transfer_failed(self): """ JUDGE REPAIRER: Test to repair a rule with 1 failed transfer (lock)""" - rule_repairer(once=True) # Clean out the repairer + JudgeRepairer(once=True).run() # Clean out the repairer scope = InternalScope('mock', **self.vo) files = create_files(4, scope, self.rse4_id, bytes_=100) dataset = did_name_generator('dataset') @@ -136,7 +135,7 @@ def 
test_to_repair_a_rule_with_ALL_grouping_whose_transfer_failed(self): assert (rule_id == get_rule(rule_id)['id'].replace('-', '').lower()) assert (RuleState.STUCK == get_rule(rule_id)['state']) - rule_repairer(once=True) + JudgeRepairer(once=True).run() assert (RuleState.REPLICATING == get_rule(rule_id)['state']) assert (get_replica_locks(scope=files[2]['scope'], name=files[2]['name'])[0].rse_id == get_replica_locks(scope=files[3]['scope'], name=files[3]['name'])[0].rse_id) assert (get_replica_locks(scope=files[1]['scope'], name=files[1]['name'])[0].rse_id == get_replica_locks(scope=files[3]['scope'], name=files[3]['name'])[0].rse_id) @@ -144,7 +143,7 @@ def test_to_repair_a_rule_with_ALL_grouping_whose_transfer_failed(self): def test_to_repair_a_rule_with_DATASET_grouping_whose_transfer_failed(self): """ JUDGE REPAIRER: Test to repair a rule with 1 failed transfer (lock)""" - rule_repairer(once=True) # Clean out the repairer + JudgeRepairer(once=True).run() # Clean out the repairer scope = InternalScope('mock', **self.vo) files = create_files(4, scope, self.rse4_id, bytes_=100) dataset = did_name_generator('dataset') @@ -160,7 +159,7 @@ def test_to_repair_a_rule_with_DATASET_grouping_whose_transfer_failed(self): assert (rule_id == get_rule(rule_id)['id'].replace('-', '').lower()) assert (RuleState.STUCK == get_rule(rule_id)['state']) - rule_repairer(once=True) + JudgeRepairer(once=True).run() assert (RuleState.REPLICATING == get_rule(rule_id)['state']) assert (get_replica_locks(scope=files[2]['scope'], name=files[2]['name'])[0].rse_id == get_replica_locks(scope=files[3]['scope'], name=files[3]['name'])[0].rse_id) assert (get_replica_locks(scope=files[1]['scope'], name=files[1]['name'])[0].rse_id == get_replica_locks(scope=files[3]['scope'], name=files[3]['name'])[0].rse_id) @@ -194,7 +193,7 @@ def test_repair_a_rule_with_missing_locks(self): rule.state = RuleState.STUCK session.commit() - rule_repairer(once=True) + JudgeRepairer(once=True).run() for file in files: assert (len(get_replica_locks(scope=file['scope'], name=file['name'])) == 2) @@ -231,7 +230,7 @@ def test_repair_a_rule_with_source_replica_expression(self): replica.state = ReplicaState.AVAILABLE session.commit() - rule_repairer(once=True) + JudgeRepairer(once=True).run() assert (RuleState.OK == get_rule(rule_id1)['state']) assert (RuleState.REPLICATING == get_rule(rule_id2)['state']) @@ -239,7 +238,7 @@ def test_repair_a_rule_with_source_replica_expression(self): def test_to_repair_a_rule_with_only_1_rse_whose_transfers_failed(self): """ JUDGE REPAIRER: Test to repair a rule with only 1 rse whose transfers failed (lock)""" - rule_repairer(once=True) # Clean out the repairer + JudgeRepairer(once=True).run() # Clean out the repairer scope = InternalScope('mock', **self.vo) files = create_files(4, scope, self.rse4_id, bytes_=100) dataset = did_name_generator('dataset') @@ -259,7 +258,7 @@ def test_to_repair_a_rule_with_only_1_rse_whose_transfers_failed(self): assert (rule_id == get_rule(rule_id)['id'].replace('-', '').lower()) assert (RuleState.STUCK == get_rule(rule_id)['state']) - rule_repairer(once=True) + JudgeRepairer(once=True).run() # Stil assert STUCK because of delays: assert (RuleState.STUCK == get_rule(rule_id)['state']) @@ -270,7 +269,7 @@ def test_to_repair_a_rule_with_only_1_rse_whose_transfers_failed(self): def test_to_repair_a_rule_with_NONE_grouping_whose_transfer_failed_and_flipping_to_other_rse(self): """ JUDGE REPAIRER: Test to repair a rule with 1 failed transfer and flip to other rse(lock)""" - 
rule_repairer(once=True) # Clean out the repairer + JudgeRepairer(once=True).run() # Clean out the repairer scope = InternalScope('mock', **self.vo) files = create_files(4, scope, self.rse4_id, bytes_=100) dataset = did_name_generator('dataset') @@ -288,7 +287,7 @@ def test_to_repair_a_rule_with_NONE_grouping_whose_transfer_failed_and_flipping_ assert (rule_id == get_rule(rule_id)['id'].replace('-', '').lower()) assert (RuleState.STUCK == get_rule(rule_id)['state']) - rule_repairer(once=True) + JudgeRepairer(once=True).run() assert (RuleState.REPLICATING == get_rule(rule_id)['state']) assert (get_replica_locks(scope=files[3]['scope'], name=files[3]['name'])[0].rse_id != old_rse_id) @@ -298,7 +297,7 @@ def test_to_repair_a_rule_with_only_1_rse_whose_site_is_blocklisted(self): rse = rse_name_generator() rse_id = add_rse(rse, **self.vo) set_local_account_limit(self.jdoe, rse_id, -1) - rule_repairer(once=True) # Clean out the repairer + JudgeRepairer(once=True).run() # Clean out the repairer region = make_region().configure( 'dogpile.cache.pymemcache', @@ -320,22 +319,31 @@ def change_availability(new_value): if ignore_availability: change_availability(False) - rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}], account=self.jdoe, copies=1, rse_expression=rse, grouping=grouping, weight=None, lifetime=None, locked=False, subscription_id=None, ignore_availability=ignore_availability, activity='DebugJudge')[0] + rule_id = add_rule( + dids=[{'scope': scope, 'name': dataset}], + account=self.jdoe, copies=1, rse_expression=rse, grouping=grouping, + weight=None, lifetime=None, locked=False, subscription_id=None, + ignore_availability=ignore_availability, activity='DebugJudge')[0] assert (RuleState.STUCK == get_rule(rule_id)['state']) - rule_repairer(once=True) + JudgeRepairer(once=True).run() assert (RuleState.REPLICATING == get_rule(rule_id)['state']) change_availability(True) else: - rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}], account=self.jdoe, copies=1, rse_expression=rse, grouping=grouping, weight=None, lifetime=None, locked=False, subscription_id=None, ignore_availability=ignore_availability, activity='DebugJudge')[0] + rule_id = add_rule( + dids=[{'scope': scope, 'name': dataset}], + account=self.jdoe, copies=1, rse_expression=rse, + grouping=grouping, weight=None, lifetime=None, locked=False, + subscription_id=None, + ignore_availability=ignore_availability, activity='DebugJudge')[0] failed_transfer(scope=scope, name=files[0]['name'], rse_id=get_replica_locks(scope=files[0]['scope'], name=files[0]['name'])[0].rse_id) change_availability(False) assert (RuleState.STUCK == get_rule(rule_id)['state']) - rule_repairer(once=True) + JudgeRepairer(once=True).run() assert (RuleState.STUCK == get_rule(rule_id)['state']) change_availability(True) - rule_repairer(once=True) + JudgeRepairer(once=True).run() assert (RuleState.REPLICATING == get_rule(rule_id)['state']) From fe86b5271e687be7178324b02ec8e8a4bcbb2e38 Mon Sep 17 00:00:00 2001 From: rdimaio Date: Thu, 15 Feb 2024 13:47:11 +0100 Subject: [PATCH 4/5] Daemons: Refactor Abacus Account to inherit from base Daemon class; rucio#6478 --- bin/rucio-abacus-account | 15 +- bin/rucio-judge-repairer | 2 + lib/rucio/core/rse_expression_parser.py | 2 +- lib/rucio/daemons/abacus/account.py | 108 +++------ lib/rucio/daemons/abacus/common.py | 64 +++++ lib/rucio/daemons/common.py | 224 +++++++----------- tests/test_abacus_account.py | 6 +- tests/test_bin_rucio.py | 4 +- tests/test_counter.py | 12 +- tests/test_daemons.py | 3 +- 
tests/test_judge_evaluator.py | 12 +- tests/test_rse.py | 4 +- tests/test_rule.py | 14 +- tools/count_missing_type_annotations_utils.sh | 4 +- 14 files changed, 216 insertions(+), 258 deletions(-) create mode 100644 lib/rucio/daemons/abacus/common.py diff --git a/bin/rucio-abacus-account b/bin/rucio-abacus-account index 5070087e24..cba5105b3a 100755 --- a/bin/rucio-abacus-account +++ b/bin/rucio-abacus-account @@ -20,8 +20,8 @@ Abacus account is a daemon to update account counters. import argparse import signal - -from rucio.daemons.abacus.account import run, stop +from rucio.daemons.abacus.account import AbacusAccount +from rucio.daemons.abacus.common import ABACUS_HISTORY_TABLE_INTERVAL def get_parser(): @@ -57,19 +57,18 @@ Check account usage again:: ''', formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument("--run-once", action="store_true", default=False, help='One iteration only') parser.add_argument("--threads", action="store", default=1, type=int, help='Concurrency control: total number of threads on this process') - parser.add_argument("--enable-history", action="store_true", default=False, help='Record account usage into history table every hour.') + parser.add_argument("--enable-history", action="store_true", default=False, help=f'Record account usage into history table every {ABACUS_HISTORY_TABLE_INTERVAL} seconds.') parser.add_argument('--sleep-time', action="store", default=10, type=int, help='Concurrency control: thread sleep time after each chunk of work') return parser if __name__ == "__main__": - - signal.signal(signal.SIGTERM, stop) - parser = get_parser() args = parser.parse_args() + abacus_account = AbacusAccount(once=args.run_once, threads=args.threads, fill_history_table=args.enable_history, sleep_time=args.sleep_time) + signal.signal(signal.SIGTERM, abacus_account.stop) try: - run(once=args.run_once, threads=args.threads, fill_history_table=args.enable_history, sleep_time=args.sleep_time) + abacus_account.run() except KeyboardInterrupt: - stop() + abacus_account.stop() diff --git a/bin/rucio-judge-repairer b/bin/rucio-judge-repairer index 7066eb9f0d..07eaa6b61f 100755 --- a/bin/rucio-judge-repairer +++ b/bin/rucio-judge-repairer @@ -19,6 +19,7 @@ Judge-Repairer is a daemon to repair stuck replication rules. """ import argparse +import signal from rucio.daemons.judge.repairer import JudgeRepairer @@ -38,6 +39,7 @@ if __name__ == "__main__": parser = get_parser() args = parser.parse_args() judge_repairer = JudgeRepairer(once=args.run_once, threads=args.threads, sleep_time=args.sleep_time) + signal.signal(signal.SIGTERM, judge_repairer.stop) try: judge_repairer.run() except KeyboardInterrupt: diff --git a/lib/rucio/core/rse_expression_parser.py b/lib/rucio/core/rse_expression_parser.py index a739d4e678..5a87d937fd 100644 --- a/lib/rucio/core/rse_expression_parser.py +++ b/lib/rucio/core/rse_expression_parser.py @@ -42,7 +42,7 @@ @transactional_session -def parse_expression(expression: str, filter_=None, *, session: "Session"): +def parse_expression(expression, filter_=None, *, session: "Session"): """ Parse a RSE expression and return the list of RSE dictionaries. 
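For orientation before the account.py diff: under this series, a concrete daemon only has to subclass Daemon and implement _run_once. A minimal sketch, assuming the Daemon base class and HeartbeatHandler from lib/rucio/daemons/common.py above; ToyDaemon and its log message are hypothetical and not part of this series::

    import logging
    from typing import Any

    from rucio.daemons.common import Daemon, HeartbeatHandler


    class ToyDaemon(Daemon):
        """Hypothetical daemon, shown only to illustrate the base-class contract."""

        def __init__(self, **_kwargs) -> None:
            super().__init__(daemon_name="toy-daemon", **_kwargs)

        def _run_once(self, heartbeat_handler: "HeartbeatHandler", **_kwargs) -> tuple[bool, Any]:
            # live() renews the heartbeat and returns the worker partition plus a prefixed logger.
            worker_number, total_workers, logger = heartbeat_handler.live()
            logger(logging.INFO, 'worker %i/%i did one unit of work', worker_number, total_workers)
            # (must_sleep, return_value): True asks the surrounding loop to sleep before the next iteration.
            return True, None

A single iteration is then driven the same way the tests below drive AbacusAccount: ToyDaemon(once=True).run().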
diff --git a/lib/rucio/daemons/abacus/account.py b/lib/rucio/daemons/abacus/account.py index 78de5d45c4..e19bf98bb5 100644 --- a/lib/rucio/daemons/abacus/account.py +++ b/lib/rucio/daemons/abacus/account.py @@ -18,89 +18,37 @@ """ import logging -import threading import time -from typing import TYPE_CHECKING +from typing import Any -import rucio.db.sqla.util -from rucio.common import exception -from rucio.common.logging import setup_logging -from rucio.common.utils import get_thread_with_periodic_running_function -from rucio.core.account_counter import get_updated_account_counters, update_account_counter, fill_account_counter_history_table -from rucio.daemons.common import run_daemon +from rucio.core.account_counter import get_updated_account_counters, update_account_counter +from rucio.daemons.common import HeartbeatHandler +from rucio.daemons.abacus.common import AbacusDaemon -if TYPE_CHECKING: - from types import FrameType - from typing import Optional -graceful_stop = threading.Event() -DAEMON_NAME = 'abacus-account' +class AbacusAccount(AbacusDaemon): + def __init__(self, **_kwargs) -> None: + super().__init__(daemon_name="abacus-account", **_kwargs) - -def account_update(once=False, sleep_time=10): - """ - Main loop to check and update the Account Counters. - """ - run_daemon( - once=once, - graceful_stop=graceful_stop, - executable=DAEMON_NAME, - partition_wait_time=1, - sleep_time=sleep_time, - run_once_fnc=run_once, - ) - - -def run_once(heartbeat_handler, **_kwargs): - worker_number, total_workers, logger = heartbeat_handler.live() - - start = time.time() # NOQA - account_rse_ids = get_updated_account_counters(total_workers=total_workers, - worker_number=worker_number) - logger(logging.DEBUG, 'Index query time %f size=%d' % (time.time() - start, len(account_rse_ids))) - - # If the list is empty, sent the worker to sleep - if not account_rse_ids: - logger(logging.INFO, 'did not get any work') - return - - for account_rse_id in account_rse_ids: + def _run_once(self, heartbeat_handler: "HeartbeatHandler", **_kwargs) -> tuple[bool, Any]: worker_number, total_workers, logger = heartbeat_handler.live() - if graceful_stop.is_set(): - break - start_time = time.time() - update_account_counter(account=account_rse_id[0], rse_id=account_rse_id[1]) - logger(logging.DEBUG, 'update of account-rse counter "%s-%s" took %f' % (account_rse_id[0], account_rse_id[1], time.time() - start_time)) - - -def stop(signum: "Optional[int]" = None, frame: "Optional[FrameType]" = None) -> None: - """ - Graceful exit. - """ - - graceful_stop.set() - - -def run(once=False, threads=1, fill_history_table=False, sleep_time=10): - """ - Starts up the Abacus-Account threads. - """ - setup_logging(process_name=DAEMON_NAME) - - if rucio.db.sqla.util.is_old_db(): - raise exception.DatabaseException('Database was not updated, daemon won\'t start') - - if once: - logging.info('main: executing one iteration only') - account_update(once) - else: - logging.info('main: starting threads') - threads = [threading.Thread(target=account_update, kwargs={'once': once, 'sleep_time': sleep_time}) for i in - range(0, threads)] - if fill_history_table: - threads.append(get_thread_with_periodic_running_function(3600, fill_account_counter_history_table, graceful_stop)) - [t.start() for t in threads] - logging.info('main: waiting for interrupts') - # Interruptible joins require a timeout. 
-        while threads[0].is_alive():
-            [t.join(timeout=3.14) for t in threads]
+        must_sleep = False
+
+        start = time.time()  # NOQA
+        account_rse_ids = get_updated_account_counters(total_workers=total_workers,
+                                                       worker_number=worker_number)
+        logger(logging.DEBUG, 'Index query time %f size=%d' % (time.time() - start, len(account_rse_ids)))
+
+        # If the list is empty, send the worker to sleep
+        if not account_rse_ids:
+            logger(logging.INFO, 'did not get any work')
+            return must_sleep, None
+
+        for account_rse_id in account_rse_ids:
+            worker_number, total_workers, logger = heartbeat_handler.live()
+            if self.graceful_stop.is_set():
+                break
+            start_time = time.time()
+            update_account_counter(account=account_rse_id[0], rse_id=account_rse_id[1])
+            logger(logging.DEBUG, 'update of account-rse counter "%s-%s" took %f' % (account_rse_id[0], account_rse_id[1], time.time() - start_time))
+        return must_sleep, None
diff --git a/lib/rucio/daemons/abacus/common.py b/lib/rucio/daemons/abacus/common.py
new file mode 100644
index 0000000000..2589b9f5c4
--- /dev/null
+++ b/lib/rucio/daemons/abacus/common.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+# Copyright European Organization for Nuclear Research (CERN) since 2012
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from rucio.daemons.common import Daemon, HeartbeatHandler
+from rucio.common.utils import get_thread_with_periodic_running_function
+from rucio.core.account_counter import fill_account_counter_history_table
+import logging
+import threading
+from typing import Any
+from abc import abstractmethod
+
+ABACUS_HISTORY_TABLE_INTERVAL = 3600
+
+
+class AbacusDaemon(Daemon):
+    """
+    Common daemon logic for multiple Abacus daemons.
+    """
+    def __init__(self, fill_history_table: bool = False, **_kwargs) -> None:
+        """
+        :param fill_history_table: Set to True to record account usage into the history table every ABACUS_HISTORY_TABLE_INTERVAL (3600) seconds.
+        """
+        super().__init__(**_kwargs)
+        self.fill_history_table = fill_history_table
+        self.paused_dids = {}
+
+    @abstractmethod
+    def _run_once(
+        self, heartbeat_handler: "HeartbeatHandler", **_kwargs
+    ) -> tuple[bool, Any]:
+        pass
+
+    def run(self) -> None:
+        self._pre_run_checks()
+
+        if self.once:
+            logging.info("%s: executing one iteration only", self.daemon_name)
+            self._call_daemon()
+        else:
+            logging.info("%s: starting threads", self.daemon_name)
+            thread_list = [threading.Thread(target=self._call_daemon) for _ in
+                           range(0, self.total_workers)]
+            if self.fill_history_table:
+                thread_list.append(get_thread_with_periodic_running_function(ABACUS_HISTORY_TABLE_INTERVAL, fill_account_counter_history_table, self.graceful_stop))
+            [t.start() for t in thread_list]
+            logging.info("%s: waiting for interrupts", self.daemon_name)
+            # Interruptible joins require a timeout.
+ while thread_list: + thread_list = [ + thread.join(timeout=3.14) + for thread in thread_list + if thread and thread.is_alive() + ] diff --git a/lib/rucio/daemons/common.py b/lib/rucio/daemons/common.py index d4478c8810..b88663a3df 100644 --- a/lib/rucio/daemons/common.py +++ b/lib/rucio/daemons/common.py @@ -19,25 +19,23 @@ import os import queue import socket -import signal import threading import time -from collections.abc import Callable, Generator, Iterator, Sequence -from typing import Any, Generic, Optional, TypeVar, Union -from types import TracebackType, FrameType from abc import ABC, abstractmethod +from collections.abc import Callable, Generator, Iterator, Sequence +from types import FrameType, TracebackType +from typing import Any, Final, Generic, Optional, TypeVar, Union -import rucio.db.sqla.util +from rucio.common.exception import DatabaseException from rucio.common.logging import formatted_logger, setup_logging from rucio.common.utils import PriorityQueue +import rucio.db.sqla.util from rucio.core import heartbeat as heartbeat_core from rucio.core.monitor import MetricManager -from rucio.common.exception import DatabaseException -T = TypeVar("T") +T = TypeVar('T') METRICS = MetricManager(module=__name__) - class Daemon(ABC): """ Base daemon class @@ -58,13 +56,12 @@ def __init__( :param partition_wait_time: Time to wait for database partition rebalancing before starting the actual daemon loop. :param daemon_name: Name of daemon that is constructed. """ - self.once = once - self.total_workers = total_workers - self.sleep_time = sleep_time - self.partition_wait_time = partition_wait_time + self.once: Final[bool] = once + self.total_workers: Final[int] = total_workers + self.sleep_time: Final[int] = sleep_time + self.partition_wait_time: Final[int] = partition_wait_time self.daemon_name = daemon_name self.graceful_stop = threading.Event() - signal.signal(signal.SIGTERM, self.stop) setup_logging(process_name=daemon_name) @staticmethod @@ -115,15 +112,17 @@ def run(self) -> None: """ self._pre_run_checks() - logging.info(f"{self.daemon_name}: starting threads") + logging.info("%s: starting threads", self.daemon_name) thread_list = [ threading.Thread(target=self._call_daemon) - for _ in range(0, self.total_workers) + for _ in range(self.total_workers) ] - [t.start() for t in thread_list] + + for t in thread_list: + t.start() if not self.once: - logging.info(f"{self.daemon_name}: waiting for interrupts") + logging.info("%s: waiting for interrupts", self.daemon_name) # Interruptible joins require a timeout. while thread_list: @@ -134,7 +133,7 @@ def run(self) -> None: ] def stop( - self, signum: "Optional[int]" = None, frame: "Optional[FrameType]" = None + self, signum: Optional[int] = None, frame: Optional[FrameType] = None ) -> None: """ Graceful exit the daemon. Used as handler for SIGTERM. @@ -145,7 +144,6 @@ def stop( """ self.graceful_stop.set() - class HeartbeatHandler: """ Simple contextmanager which sets a heartbeat and associated logger on entry and cleans up the heartbeat on exit. 
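The HeartbeatHandler hunks below are mostly cosmetic (quoting and line-wrapping); the behaviour the Daemon base class relies on is unchanged. As a compact reference, a sketch of that context-manager contract, with an illustrative executable name and renewal interval::

    import logging

    from rucio.daemons.common import HeartbeatHandler

    # Registers a heartbeat on entry, renews it on live(), and cleans it up on exit.
    with HeartbeatHandler(executable="toy-daemon", renewal_interval=59) as heartbeat_handler:
        worker_number, total_workers, logger = heartbeat_handler.live()
        logger(logging.INFO, 'heartbeat held by worker %i/%i', worker_number, total_workers)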
@@ -160,9 +158,7 @@ def __init__(self, executable: str, renewal_interval: int) -> None: self.executable = executable self._hash_executable = None self.renewal_interval = renewal_interval - self.older_than = ( - renewal_interval * 10 if renewal_interval and renewal_interval > 0 else None - ) # 10 was chosen without any particular reason + self.older_than = renewal_interval * 10 if renewal_interval > 0 else None # 10 was chosen without any particular reason self.hostname = socket.getfqdn() self.pid = os.getpid() @@ -178,16 +174,11 @@ def __enter__(self) -> "HeartbeatHandler": self.live() return self - def __exit__( - self, - exc_type: type[BaseException], - exc_val: BaseException, - exc_tb: TracebackType, - ) -> None: + def __exit__(self, exc_type: type[BaseException], exc_val: BaseException, exc_tb: TracebackType): if self.last_heart_beat: heartbeat_core.die(self.executable, self.hostname, self.pid, self.hb_thread) if self.logger: - self.logger(logging.INFO, "Heartbeat cleaned up") + self.logger(logging.INFO, 'Heartbeat cleaned up') @property def hash_executable(self) -> str: @@ -199,64 +190,38 @@ def hash_executable(self) -> str: def short_executable(self) -> str: return min(self.executable, self.hash_executable, key=len) - def live( - self, force_renew: bool = False, payload: Optional[str] = None - ) -> tuple[int, int, Callable]: + def live(self, force_renew: bool = False, payload: Optional[str] = None): """ :return: a tuple: , , """ - if ( - force_renew - or not self.last_time - or not self.last_heart_beat - or self.last_time - < datetime.datetime.now() - - datetime.timedelta(seconds=self.renewal_interval) - or self.last_payload != payload - ): + if force_renew \ + or not self.last_time \ + or not self.last_heart_beat \ + or self.last_time < datetime.datetime.now() - datetime.timedelta(seconds=self.renewal_interval) \ + or self.last_payload != payload: if self.older_than: - self.last_heart_beat = heartbeat_core.live( - self.executable, - self.hostname, - self.pid, - self.hb_thread, - payload=payload, - older_than=self.older_than, - ) + self.last_heart_beat = heartbeat_core.live(self.executable, self.hostname, self.pid, self.hb_thread, payload=payload, older_than=self.older_than) else: - self.last_heart_beat = heartbeat_core.live( - self.executable, - self.hostname, - self.pid, - self.hb_thread, - payload=payload, - ) - - prefix = "[%i/%i]: " % ( - self.last_heart_beat["assign_thread"], - self.last_heart_beat["nr_threads"], - ) - self.logger = formatted_logger(logging.log, prefix + "%s") + self.last_heart_beat = heartbeat_core.live(self.executable, self.hostname, self.pid, self.hb_thread, payload=payload) + + prefix = '[%i/%i]: ' % (self.last_heart_beat['assign_thread'], self.last_heart_beat['nr_threads']) + self.logger = formatted_logger(logging.log, prefix + '%s') if not self.last_time: - self.logger(logging.DEBUG, "First heartbeat set") + self.logger(logging.DEBUG, 'First heartbeat set') else: - self.logger(logging.DEBUG, "Heartbeat renewed") + self.logger(logging.DEBUG, 'Heartbeat renewed') self.last_time = datetime.datetime.now() self.last_payload = payload - return ( - self.last_heart_beat["assign_thread"], - self.last_heart_beat["nr_threads"], - self.logger, - ) + return self.last_heart_beat['assign_thread'], self.last_heart_beat['nr_threads'], self.logger def _activity_looper( - once: bool, - sleep_time: int, - activities: Optional[Sequence[str]], - heartbeat_handler: HeartbeatHandler, + once: bool, + sleep_time: int, + activities: Optional[Sequence[str]], + heartbeat_handler: 
HeartbeatHandler, ) -> Generator[tuple[str, float], tuple[float, bool], None]: """ Generator which loops (either once, or indefinitely) over all activities while ensuring that `sleep_time` @@ -289,19 +254,14 @@ def _activity_looper( logger = heartbeat_handler.logger if time_to_sleep > 0: if activity: - logger( - logging.DEBUG, - "Switching to activity %s and sleeping %s seconds", - activity, - time_to_sleep, - ) + logger(logging.DEBUG, 'Switching to activity %s and sleeping %s seconds', activity, time_to_sleep) else: - logger(logging.DEBUG, "Sleeping %s seconds", time_to_sleep) + logger(logging.DEBUG, 'Sleeping %s seconds', time_to_sleep) else: if activity: - logger(logging.DEBUG, "Switching to activity %s", activity) + logger(logging.DEBUG, 'Switching to activity %s', activity) else: - logger(logging.DEBUG, "Starting next iteration") + logger(logging.DEBUG, 'Starting next iteration') # The calling context notifies us when the activity actually got handled. And if sleeping is desired. actual_exe_time, must_sleep = yield activity, time_to_sleep @@ -316,12 +276,12 @@ def _activity_looper( def db_workqueue( - once: bool, - graceful_stop: threading.Event, - executable: str, - partition_wait_time: int, - sleep_time: int, - activities: Optional[Sequence[str]] = None, + once: bool, + graceful_stop: threading.Event, + executable: str, + partition_wait_time: int, + sleep_time: int, + activities: Optional[Sequence[str]] = None, ): """ Used to wrap a function for interacting with the database as a work queue: i.e. to select @@ -337,29 +297,20 @@ def db_workqueue( :param activities: optional list of activities on which to work. The run_once_fnc will be called on activities one by one. """ - def _decorate( - run_once_fnc: Callable[..., Optional[Union[bool, tuple[bool, T]]]] - ) -> Callable[[], Iterator[Optional[T]]]: + def _decorate(run_once_fnc: Callable[..., Optional[Union[bool, tuple[bool, T]]]]) -> Callable[[], Iterator[Optional[T]]]: @functools.wraps(run_once_fnc) def _generator(): - with HeartbeatHandler( - executable=executable, renewal_interval=sleep_time - 1 - ) as heartbeat_handler: + with HeartbeatHandler(executable=executable, renewal_interval=sleep_time - 1) as heartbeat_handler: logger = heartbeat_handler.logger - logger(logging.INFO, "started") + logger(logging.INFO, 'started') if partition_wait_time: graceful_stop.wait(partition_wait_time) _, _, logger = heartbeat_handler.live(force_renew=True) - activity_loop = _activity_looper( - once=once, - sleep_time=sleep_time, - activities=activities, - heartbeat_handler=heartbeat_handler, - ) + activity_loop = _activity_looper(once=once, sleep_time=sleep_time, activities=activities, heartbeat_handler=heartbeat_handler) activity, time_to_sleep = next(activity_loop, (None, None)) while time_to_sleep is not None: if graceful_stop.is_set(): @@ -373,9 +324,7 @@ def _generator(): must_sleep = True start_time = time.time() try: - result = run_once_fnc( - heartbeat_handler=heartbeat_handler, activity=activity - ) + result = run_once_fnc(heartbeat_handler=heartbeat_handler, activity=activity) # Handle return values already existing in the code # TODO: update all existing daemons to always explicitly return (must_sleep, ret_value) @@ -391,22 +340,18 @@ def _generator(): if ret_value is not None: yield ret_value except Exception as e: - METRICS.counter("exceptions.{exception}").labels( - exception=e.__class__.__name__ - ).inc() + METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() logger(logging.CRITICAL, "Exception", 
exc_info=True) if once: raise try: - activity, time_to_sleep = activity_loop.send( - (start_time, must_sleep) - ) + activity, time_to_sleep = activity_loop.send((start_time, must_sleep)) except StopIteration: break if not once: - logger(logging.INFO, "Graceful stop requested") + logger(logging.INFO, 'Graceful stop requested') return _generator @@ -414,14 +359,13 @@ def _generator(): def run_daemon( - once: bool, - graceful_stop: threading.Event, - executable: str, - partition_wait_time: int, - sleep_time: int, - run_once_fnc: Callable[..., Optional[Union[bool, tuple[bool, Any]]]], - activities: Optional[list[str]] = None, -) -> None: + once: bool, + graceful_stop: threading.Event, + executable: str, + partition_wait_time: int, + sleep_time: int, + run_once_fnc: Callable[..., Optional[Union[bool, tuple[bool, Any]]]], + activities: Optional[list[str]] = None): """ Run the daemon loop and call the function run_once_fnc at each iteration """ @@ -455,7 +399,11 @@ def __init__(self, producers, consumers, graceful_stop, logger=logging.log): self.producers_done_event = threading.Event() self.logger = logger - def _produce(self, it: Callable[[], Iterator[T]], wait_for_consumers: bool = False): + def _produce( + self, + it: Callable[[], Iterator[T]], + wait_for_consumers: bool = False + ): """ Iterate over the generator function and put the extracted elements into the queue. @@ -477,9 +425,7 @@ def _produce(self, it: Callable[[], Iterator[T]], wait_for_consumers: bool = Fal except StopIteration: break except Exception as e: - METRICS.counter("exceptions.{exception}").labels( - exception=e.__class__.__name__ - ).inc() + METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() self.logger(logging.CRITICAL, "Exception", exc_info=True) finally: with self.lock: @@ -490,7 +436,10 @@ def _produce(self, it: Callable[[], Iterator[T]], wait_for_consumers: bool = Fal if wait_for_consumers: self.queue.join() - def _consume(self, fnc: Callable[[T], Any]): + def _consume( + self, + fnc: Callable[[T], Any] + ): """ Wait for elements to arrive via the queue and call the given function on each element. 
@@ -506,9 +455,7 @@ def _consume(self, fnc: Callable[[T], Any]): try: fnc(product) except Exception as e: - METRICS.counter("exceptions.{exception}").labels( - exception=e.__class__.__name__ - ).inc() + METRICS.counter('exceptions.{exception}').labels(exception=e.__class__.__name__).inc() self.logger(logging.CRITICAL, "Exception", exc_info=True) finally: self.queue.task_done() @@ -519,8 +466,11 @@ def run(self): for i, producer in enumerate(self.producers): thread = threading.Thread( target=self._produce, - name=f"producer-{i}-{producer.__name__}", - kwargs={"it": producer, "wait_for_consumers": True}, + name=f'producer-{i}-{producer.__name__}', + kwargs={ + 'it': producer, + 'wait_for_consumers': True + } ) thread.start() producer_threads.append(thread) @@ -529,28 +479,24 @@ def run(self): for i, consumer in enumerate(self.consumers): thread = threading.Thread( target=self._consume, - name=f"consumer-{i}-{consumer.__name__}", + name=f'consumer-{i}-{consumer.__name__}', kwargs={ - "fnc": consumer, - }, + 'fnc': consumer, + } ) thread.start() consumer_threads.append(thread) - logging.info("waiting for interrupts") + logging.info('waiting for interrupts') while producer_threads: for thread in producer_threads: thread.join(timeout=3.14) - producer_threads = [ - thread for thread in producer_threads if thread.is_alive() - ] + producer_threads = [thread for thread in producer_threads if thread.is_alive()] self.producers_done_event.set() while consumer_threads: for thread in consumer_threads: thread.join(timeout=3.14) - consumer_threads = [ - thread for thread in consumer_threads if thread.is_alive() - ] + consumer_threads = [thread for thread in consumer_threads if thread.is_alive()] diff --git a/tests/test_abacus_account.py b/tests/test_abacus_account.py index 5dd29d06e3..f313487c24 100644 --- a/tests/test_abacus_account.py +++ b/tests/test_abacus_account.py @@ -20,7 +20,7 @@ from rucio.core.account import get_usage_history from rucio.core.account_counter import update_account_counter_history from rucio.core.account_limit import get_local_account_usage, set_local_account_limit -from rucio.daemons.abacus.account import account_update +from rucio.daemons.abacus.account import AbacusAccount from rucio.daemons.judge import cleaner from rucio.daemons.reaper.reaper import Reaper from rucio.db.sqla import models @@ -45,7 +45,7 @@ def test_abacus_account(self, vo, root_account, mock_scope, rse_factory, did_fac dataset = dids[0]['dataset_name'] activity = get_schema_value('ACTIVITY')['enum'][0] rucio_client.add_replication_rule([{'scope': mock_scope.external, 'name': dataset}], 1, rse, lifetime=-1, activity=activity) - account_update(once=True) + AbacusAccount(once=True).run() account_usage = get_local_account_usage(account=root_account, rse_id=rse_id)[0] assert account_usage['bytes'] == nfiles * file_sizes assert account_usage['files'] == nfiles @@ -63,7 +63,7 @@ def test_abacus_account(self, vo, root_account, mock_scope, rse_factory, did_fac # Delete rules -> account usage should decrease cleaner.run(once=True) - account_update(once=True) + AbacusAccount(once=True).run() # set account limit because return value of get_local_account_usage differs if a limit is set or not set_local_account_limit(account=root_account, rse_id=rse_id, bytes_=10) account_usages = get_local_account_usage(account=root_account, rse_id=rse_id)[0] diff --git a/tests/test_bin_rucio.py b/tests/test_bin_rucio.py index 663ba912ca..acd80eb56b 100755 --- a/tests/test_bin_rucio.py +++ b/tests/test_bin_rucio.py @@ -1834,7 +1834,7 @@ 
def test_list_account_usage(self): """ CLIENT (USER): list account usage. """ from rucio.db.sqla import session, models from rucio.core.account_counter import increase - from rucio.daemons.abacus import account as abacus_account + from rucio.daemons.abacus.account import AbacusAccount db_session = session.get_session() db_session.query(models.AccountUsage).delete() @@ -1854,7 +1854,7 @@ def test_list_account_usage(self): self.account_client.set_local_account_limit(account, rse, local_limit) self.account_client.set_global_account_limit(account, rse_exp, global_limit) increase(rse_id, InternalAccount(account, **self.vo), 1, usage) - abacus_account.run(once=True) + AbacusAccount(once=True).run() cmd = 'rucio list-account-usage {0}'.format(account) exitcode, out, err = execute(cmd) assert re.search('.*{0}.*{1}.*{2}.*{3}'.format(rse, usage, local_limit, local_left), out) is not None diff --git a/tests/test_counter.py b/tests/test_counter.py index 0501247d17..19887ae532 100644 --- a/tests/test_counter.py +++ b/tests/test_counter.py @@ -19,7 +19,7 @@ from rucio.core import account_counter, rse_counter from rucio.core.account import get_usage -from rucio.daemons.abacus.account import account_update +from rucio.daemons.abacus.account import AbacusAccount from rucio.daemons.abacus.rse import rse_update from rucio.db.sqla import models @@ -91,7 +91,7 @@ class TestCoreAccountCounter: def test_inc_dec_get_counter(self, jdoe_account, rse_factory, db_session): """ACCOUNT COUNTER (CORE): Increase, decrease and get counter """ db_session.commit() - account_update(once=True) + AbacusAccount(once=True).run() _, rse_id = rse_factory.make_mock_rse(session=db_session) db_session.commit() account = jdoe_account @@ -104,7 +104,7 @@ def test_inc_dec_get_counter(self, jdoe_account, rse_factory, db_session): count, sum_ = 0, 0 for i in range(10): account_counter.increase(rse_id=rse_id, account=account, files=1, bytes_=2.147e+9) - account_update(once=True) + AbacusAccount(once=True).run() count += 1 sum_ += 2.147e+9 cnt = get_usage(rse_id=rse_id, account=account) @@ -113,7 +113,7 @@ def test_inc_dec_get_counter(self, jdoe_account, rse_factory, db_session): for i in range(4): account_counter.decrease(rse_id=rse_id, account=account, files=1, bytes_=2.147e+9) - account_update(once=True) + AbacusAccount(once=True).run() count -= 1 sum_ -= 2.147e+9 cnt = get_usage(rse_id=rse_id, account=account) @@ -122,7 +122,7 @@ def test_inc_dec_get_counter(self, jdoe_account, rse_factory, db_session): for i in range(5): account_counter.increase(rse_id=rse_id, account=account, files=1, bytes_=2.147e+9) - account_update(once=True) + AbacusAccount(once=True).run() count += 1 sum_ += 2.147e+9 cnt = get_usage(rse_id=rse_id, account=account) @@ -131,7 +131,7 @@ def test_inc_dec_get_counter(self, jdoe_account, rse_factory, db_session): for i in range(8): account_counter.decrease(rse_id=rse_id, account=account, files=1, bytes_=2.147e+9) - account_update(once=True) + AbacusAccount(once=True).run() count -= 1 sum_ -= 2.147e+9 cnt = get_usage(rse_id=rse_id, account=account) diff --git a/tests/test_daemons.py b/tests/test_daemons.py index 370bb923c6..d33254204b 100644 --- a/tests/test_daemons.py +++ b/tests/test_daemons.py @@ -19,7 +19,7 @@ import rucio.db.sqla.util from rucio.common import exception -from rucio.daemons.abacus import account, collection_replica, rse +from rucio.daemons.abacus import collection_replica, rse from rucio.daemons.atropos import atropos from rucio.daemons.automatix import automatix from rucio.daemons.badreplicas import 
minos, minos_temporary_expiration, necromancer @@ -37,7 +37,6 @@ from rucio.daemons.common import Daemon DAEMONS = [ - account, collection_replica, rse, atropos, diff --git a/tests/test_judge_evaluator.py b/tests/test_judge_evaluator.py index e40352c829..6e88a5e071 100644 --- a/tests/test_judge_evaluator.py +++ b/tests/test_judge_evaluator.py @@ -25,7 +25,7 @@ from rucio.core.replica import add_replica from rucio.core.rse import add_rse_attribute from rucio.core.rule import add_rule, get_rule -from rucio.daemons.abacus.account import account_update +from rucio.daemons.abacus.account import AbacusAccount from rucio.daemons.judge.evaluator import re_evaluator from rucio.db.sqla.constants import DIDType, LockState from rucio.db.sqla.models import UpdatedDID @@ -198,7 +198,7 @@ def test_judge_dataset_grouping_all(self): def test_account_counter_judge_evaluate_attach(self): """ JUDGE EVALUATOR: Test if the account counter is updated correctly when a file is added to a DS""" re_evaluator(once=True, did_limit=None) - account_update(once=True) + AbacusAccount(once=True).run() scope = InternalScope('mock', **self.vo) files = create_files(3, scope, self.rse1_id, bytes_=100) @@ -213,7 +213,7 @@ def test_account_counter_judge_evaluate_attach(self): # Fake judge re_evaluator(once=True, did_limit=None) - account_update(once=True) + AbacusAccount(once=True).run() account_counter_after = get_usage(self.rse1_id, self.jdoe) assert account_counter_before['bytes'] + 3 * 100 == account_counter_after['bytes'] @@ -223,7 +223,7 @@ def test_account_counter_judge_evaluate_attach(self): def test_account_counter_judge_evaluate_detach(self): """ JUDGE EVALUATOR: Test if the account counter is updated correctly when a file is removed from a DS""" re_evaluator(once=True, did_limit=None) - account_update(once=True) + AbacusAccount(once=True).run() scope = InternalScope('mock', **self.vo) files = create_files(3, scope, self.rse1_id, bytes_=100) @@ -234,7 +234,7 @@ def test_account_counter_judge_evaluate_detach(self): # Add a first rule to the DS add_rule(dids=[{'scope': scope, 'name': dataset}], account=self.jdoe, copies=1, rse_expression=self.rse1, grouping='ALL', weight=None, lifetime=None, locked=False, subscription_id=None) - account_update(once=True) + AbacusAccount(once=True).run() account_counter_before = get_usage(self.rse1_id, self.jdoe) @@ -242,7 +242,7 @@ def test_account_counter_judge_evaluate_detach(self): # Fake judge re_evaluator(once=True, did_limit=None) - account_update(once=True) + AbacusAccount(once=True).run() account_counter_after = get_usage(self.rse1_id, self.jdoe) assert account_counter_before['bytes'] - 100 == account_counter_after['bytes'] diff --git a/tests/test_rse.py b/tests/test_rse.py index e4d45943b6..56b088c3a6 100644 --- a/tests/test_rse.py +++ b/tests/test_rse.py @@ -35,7 +35,7 @@ parse_checksum_support_attribute, get_rse_supported_checksums_from_attributes, update_rse) -from rucio.daemons.abacus.account import account_update +from rucio.daemons.abacus.account import AbacusAccount from rucio.db.sqla import session, models from rucio.db.sqla.constants import RSEType, DIDType from rucio.rse import rsemanager as mgr @@ -1370,7 +1370,7 @@ def test_get_rse_usage(self, vo, rucio_client, rse_factory, jdoe_account, root_a attach_dids(mock_scope, dataset, files, jdoe_account) rules = add_rule(dids=[{'scope': mock_scope, 'name': dataset}], account=jdoe_account, copies=1, rse_expression=rse, grouping='DATASET', weight=None, lifetime=None, locked=False, subscription_id=None, activity=activity) 
assert rules - account_update(once=True) + AbacusAccount(once=True).run() usages = rucio_client.get_rse_usage(rse=rse, filters={'per_account': True}) for usage in usages: assert usage['account_usages'] diff --git a/tests/test_rule.py b/tests/test_rule.py index 9271e2bc1d..a5ebe80fa4 100644 --- a/tests/test_rule.py +++ b/tests/test_rule.py @@ -42,7 +42,7 @@ from rucio.core.rse_counter import get_counter as get_rse_counter from rucio.core.rule import add_rule, get_rule, delete_rule, add_rules, update_rule, reduce_rule, move_rule, list_rules from rucio.core.scope import add_scope -from rucio.daemons.abacus.account import account_update +from rucio.daemons.abacus.account import AbacusAccount from rucio.daemons.abacus.rse import rse_update from rucio.daemons.judge.evaluator import re_evaluator from rucio.db.sqla import models @@ -606,7 +606,7 @@ def test_locked_rule(self, mock_scope, did_factory, jdoe_account): def test_account_counter_rule_create(self, mock_scope, did_factory, jdoe_account): """ REPLICATION RULE (CORE): Test if the account counter is updated correctly when new rule is created""" - account_update(once=True) + AbacusAccount(once=True).run() account_counter_before = get_usage(self.rse1_id, jdoe_account) files = create_files(3, mock_scope, self.rse1_id, bytes_=100) @@ -617,7 +617,7 @@ def test_account_counter_rule_create(self, mock_scope, did_factory, jdoe_account add_rule(dids=[dataset], account=jdoe_account, copies=1, rse_expression=self.rse1, grouping='ALL', weight=None, lifetime=None, locked=False, subscription_id=None) # Check if the counter has been updated correctly - account_update(once=True) + AbacusAccount(once=True).run() account_counter_after = get_usage(self.rse1_id, jdoe_account) assert (account_counter_before['bytes'] + 3 * 100 == account_counter_after['bytes']) assert (account_counter_before['files'] + 3 == account_counter_after['files']) @@ -633,11 +633,11 @@ def test_account_counter_rule_delete(self, mock_scope, did_factory, jdoe_account rule_id = add_rule(dids=[dataset], account=jdoe_account, copies=1, rse_expression=self.rse1, grouping='ALL', weight=None, lifetime=None, locked=False, subscription_id=None)[0] - account_update(once=True) + AbacusAccount(once=True).run() account_counter_before = get_usage(self.rse1_id, jdoe_account) delete_rule(rule_id) - account_update(once=True) + AbacusAccount(once=True).run() # Check if the counter has been updated correctly account_counter_after = get_usage(self.rse1_id, jdoe_account) @@ -655,12 +655,12 @@ def test_account_counter_rule_update(self, vo, mock_scope, did_factory, jdoe_acc rule_id = add_rule(dids=[dataset], account=jdoe_account, copies=1, rse_expression=self.rse1, grouping='ALL', weight=None, lifetime=None, locked=False, subscription_id=None)[0] - account_update(once=True) + AbacusAccount(once=True).run() account_counter_before_1 = get_usage(self.rse1_id, jdoe_account) account_counter_before_2 = get_usage(self.rse1_id, root_account) rucio.api.rule.update_replication_rule(rule_id, {'account': 'root'}, issuer='root', vo=vo) - account_update(once=True) + AbacusAccount(once=True).run() # Check if the counter has been updated correctly account_counter_after_1 = get_usage(self.rse1_id, jdoe_account) diff --git a/tools/count_missing_type_annotations_utils.sh b/tools/count_missing_type_annotations_utils.sh index a10e0e185e..3dea91b802 100755 --- a/tools/count_missing_type_annotations_utils.sh +++ b/tools/count_missing_type_annotations_utils.sh @@ -15,14 +15,14 @@ # limitations under the License. 
# Script with all tools to count the missing python type annotations in the -# project. Installs all necessary python packages temporarily if needed. To use +# project. Installes all necessary python packages temporarly if needed. To use # it run: `scource count_missing_annotations_utils.sh`. set -e ensure_install() { - # Checks if a python package is installed via pip. It installs the package, + # Checks if a python package is installed via pip. It installes the package, # and removes it after the script run automatically. # # All debug output is redirected to the stderr stream, to not interfear with From 91599a5a033a97fd652273dcf6a156af03fb5696 Mon Sep 17 00:00:00 2001 From: rdimaio Date: Thu, 29 Feb 2024 14:18:52 +0100 Subject: [PATCH 5/5] Daemons: Fix graceful stop logic --- bin/rucio-abacus-account | 1 + bin/rucio-judge-repairer | 1 + bin/rucio-reaper | 1 + bin/rucio-undertaker | 1 + lib/rucio/daemons/common.py | 43 +++++++++++++++++------------- lib/rucio/daemons/reaper/reaper.py | 1 + 6 files changed, 29 insertions(+), 19 deletions(-) diff --git a/bin/rucio-abacus-account b/bin/rucio-abacus-account index cba5105b3a..52a16a231e 100755 --- a/bin/rucio-abacus-account +++ b/bin/rucio-abacus-account @@ -68,6 +68,7 @@ if __name__ == "__main__": args = parser.parse_args() abacus_account = AbacusAccount(once=args.run_once, threads=args.threads, fill_history_table=args.enable_history, sleep_time=args.sleep_time) signal.signal(signal.SIGTERM, abacus_account.stop) + signal.signal(signal.SIGINT, abacus_account.stop) try: abacus_account.run() except KeyboardInterrupt: diff --git a/bin/rucio-judge-repairer b/bin/rucio-judge-repairer index 07eaa6b61f..a2b6f1e75b 100755 --- a/bin/rucio-judge-repairer +++ b/bin/rucio-judge-repairer @@ -40,6 +40,7 @@ if __name__ == "__main__": args = parser.parse_args() judge_repairer = JudgeRepairer(once=args.run_once, threads=args.threads, sleep_time=args.sleep_time) signal.signal(signal.SIGTERM, judge_repairer.stop) + signal.signal(signal.SIGINT, judge_repairer.stop) try: judge_repairer.run() except KeyboardInterrupt: diff --git a/bin/rucio-reaper b/bin/rucio-reaper index bc6ef8c270..be69dbc961 100755 --- a/bin/rucio-reaper +++ b/bin/rucio-reaper @@ -77,6 +77,7 @@ if __name__ == "__main__": auto_exclude_threshold=args.auto_exclude_threshold, auto_exclude_timeout=args.auto_exclude_timeout) signal.signal(signal.SIGTERM, reaper.stop) + signal.signal(signal.SIGINT, reaper.stop) try: reaper.run() except KeyboardInterrupt: diff --git a/bin/rucio-undertaker b/bin/rucio-undertaker index 3141074a77..dfc5d133f4 100755 --- a/bin/rucio-undertaker +++ b/bin/rucio-undertaker @@ -70,6 +70,7 @@ if __name__ == "__main__": args = parser.parse_args() undertaker = Undertaker(total_workers=args.total_workers, chunk_size=args.chunk_size, once=args.run_once, sleep_time=args.sleep_time) signal.signal(signal.SIGTERM, undertaker.stop) + signal.signal(signal.SIGINT, undertaker.stop) try: undertaker.run() except KeyboardInterrupt: diff --git a/lib/rucio/daemons/common.py b/lib/rucio/daemons/common.py index b88663a3df..12d2485bc9 100644 --- a/lib/rucio/daemons/common.py +++ b/lib/rucio/daemons/common.py @@ -18,6 +18,7 @@ import logging import os import queue +import signal import socket import threading import time @@ -36,6 +37,7 @@ T = TypeVar('T') METRICS = MetricManager(module=__name__) + class Daemon(ABC): """ Base daemon class @@ -92,19 +94,20 @@ def _call_daemon(self, activities: Optional[list[str]] = None) -> None: Run the daemon loop and call the function _run_once at each iteration 
""" - run_once_fnc = functools.partial(self._run_once) + while not self.graceful_stop.is_set(): + run_once_fnc = functools.partial(self._run_once) - daemon = db_workqueue( - once=self.once, - graceful_stop=self.graceful_stop, - executable=self.daemon_name, - partition_wait_time=self.partition_wait_time, - sleep_time=self.sleep_time, - activities=activities, - )(run_once_fnc) + daemon = db_workqueue( + once=self.once, + graceful_stop=self.graceful_stop, + executable=self.daemon_name, + partition_wait_time=self.partition_wait_time, + sleep_time=self.sleep_time, + activities=activities, + )(run_once_fnc) - for _ in daemon(): - pass + for _ in daemon(): + pass def run(self) -> None: """ @@ -121,17 +124,12 @@ def run(self) -> None: for t in thread_list: t.start() + for t in thread_list: + t.join() + if not self.once: logging.info("%s: waiting for interrupts", self.daemon_name) - # Interruptible joins require a timeout. - while thread_list: - thread_list = [ - thread.join(timeout=3.14) - for thread in thread_list - if thread and thread.is_alive() - ] - def stop( self, signum: Optional[int] = None, frame: Optional[FrameType] = None ) -> None: @@ -143,6 +141,13 @@ def stop( :param frame: stack frame """ self.graceful_stop.set() + logging.info("%s: terminating", self.daemon_name) + if signum == signal.SIGINT or signum == signal.SIGTERM: + if hasattr(self.stop, 'received_sigint_or_sigterm'): + exit(1) + else: + self.stop.received_sigint_or_sigterm = True + class HeartbeatHandler: """ diff --git a/lib/rucio/daemons/reaper/reaper.py b/lib/rucio/daemons/reaper/reaper.py index 63df427088..a9ae1f7b98 100644 --- a/lib/rucio/daemons/reaper/reaper.py +++ b/lib/rucio/daemons/reaper/reaper.py @@ -58,6 +58,7 @@ METRICS = MetricManager(module=__name__) REGION = make_region_memcached(expiration_time=600) +EXCLUDED_RSE_GAUGE = METRICS.gauge('excluded_rses.{rse}', documentation='Temporarly excluded RSEs') class Reaper(Daemon):