import asyncio

from contextlib import suppress
from config import defaults
from components.logs import logger
from components.utils.datetimes import ntime_utc_now
from components.cluster.leader import elect_leader
from components.cluster.exceptions import ZombiePeer


class Monitor:
    """Background workers that watch table locks, tickets and peer streams.

    One ``_peer_worker`` task is created per monitored peer (see
    ``start_peer_monitoring``); when such a task finishes, the peer's
    connection state is reset by ``_cleanup_peer_connection``.
    """

    def __init__(self, cluster_instance: "Cluster"):
        self.cluster_instance = cluster_instance
        # Peer worker tasks; each task is named after the peer it monitors.
        self.tasks = set()
        # Strong references to in-flight cleanup tasks. The event loop only
        # keeps weak references, so an unreferenced task may be collected
        # before it completes.
        self._cleanup_tasks = set()

    async def _ticket_worker(self):
        """Periodically force-release stale table locks and drop old tickets.

        Every 10 seconds, until the cluster's shutdown event is set:

        * a table lock is released and its ticket cleared when the lock is
          held without a ticket, or when its ticket is older than 20 seconds;
        * entries in ``cluster_instance.tickets`` older than
          ``CLUSTER_PEERS_TIMEOUT * len(CLUSTER_PEERS) + 10`` seconds are
          deleted.

        Takes no arguments and returns nothing; forced releases are logged
        as errors.
        """
        cluster = self.cluster_instance

        while not cluster._shutdown.is_set():
            for table in cluster.locks:
                ticket = cluster.locks[table]["ticket"]
                table_locked = cluster.locks[table]["lock"].locked()

                # A missing ticket maps to "inf", which makes the age negative
                # infinity so that only the "locked without ticket" clause can
                # trigger in that case.
                if (table_locked and not ticket) or (
                    ntime_utc_now() - float(ticket or "inf")
                ) > 20.0:
                    # Releasing a lock that is not held raises RuntimeError.
                    with suppress(RuntimeError):
                        cluster.locks[table]["lock"].release()
                    cluster.locks[table]["ticket"] = None
                    logger.error(
                        f"Force release of table '{table}': "
                        + f"Ticket: {ticket} / Lock status: {table_locked}"
                    )

            ticket_max_age = (
                defaults.CLUSTER_PEERS_TIMEOUT * len(defaults.CLUSTER_PEERS) + 10
            )
            # Iterate over a copy because entries are deleted while looping.
            for t in cluster.tickets.copy():
                if ntime_utc_now() - float(t) > ticket_max_age:
                    with suppress(KeyError):
                        del cluster.tickets[t]

            await asyncio.sleep(10)

    async def _cleanup_peer_connection(self, peer):
        """Reset a peer's connection state and refresh the swarm view.

        The remote peer entry is replaced by the result of its ``reset()``.
        Unless the cluster is shutting down, a leader election runs while
        holding ``cluster_instance.receiving`` and a ``STATUS`` command is
        sent to every swarm member other than the local peer.

        Args:
            peer: name of the peer whose connection is cleaned up.
        """
        cluster = self.cluster_instance

        logger.warning(f"Removing {peer}")
        cluster.peers.remotes[peer] = cluster.peers.remotes[peer].reset()

        if cluster._shutdown.is_set():
            return

        async with cluster.receiving:
            elect_leader(cluster.peers)

        if cluster.peers.local.swarm != "":
            for p in cluster.peers.local.swarm.split(";"):
                if p == cluster.peers.local.name:
                    continue
                # Best effort: a peer resetting the connection is ignored.
                with suppress(ConnectionResetError):
                    await cluster.send_command("STATUS", p)

    async def _peer_worker(self, name):
        """Monitor the streams of a single peer until they fail.

        Waits until the peer is listed as established, then repeatedly writes
        a one-byte heartbeat (``0x11``) to the incoming stream's writer and
        expects the same byte on the outgoing stream's reader. While the local
        peer is not the leader, or the swarm is incomplete, a ``STATUS``
        command is sent periodically, followed by a leader election.

        The loop ends after three consecutive timeouts, on a failed stream or
        heartbeat check, or on a connection reset. Both writers are closed
        afterwards if at least one heartbeat succeeded.

        Args:
            name: name of the peer to monitor.
        """
        cluster = self.cluster_instance

        ireader, iwriter = cluster.peers.remotes[name].streams._in
        timeout_c = 0
        # -1 marks "no successful heartbeat yet"; see the check after the loop.
        c = -1

        logger.info(f"Evaluating stream for {name}")

        while name not in cluster.peers.get_established():
            await asyncio.sleep(0.125)

        oreader, owriter = cluster.peers.remotes[name].streams.out

        elect_leader(cluster.peers)

        while timeout_c < 3:
            try:
                # Explicit raise instead of ``assert`` so the check is not
                # stripped when Python runs with -O.
                # NOTE(review): this only fails when *all* four conditions
                # hold - confirm that ``any`` was not intended.
                if all(
                    [
                        oreader.at_eof(),
                        ireader.at_eof(),
                        iwriter.is_closing(),
                        owriter.is_closing(),
                    ]
                ):
                    raise ConnectionResetError(f"Streams of peer {name} are closed")

                async with asyncio.timeout(defaults.CLUSTER_PEERS_TIMEOUT * 3):
                    iwriter.write(b"\x11")
                    await iwriter.drain()
                    res = await oreader.readexactly(1)
                    if res != b"\x11":
                        raise ConnectionResetError(
                            f"Unexpected heartbeat reply from peer {name}"
                        )

                timeout_c = 0
                c += 0.25
                await asyncio.sleep(0.25)

                # ``c`` advances by 0.25 per heartbeat, so this triggers when
                # it reaches a multiple of 5 (including 0 shortly after start).
                if not c % 5:
                    if (
                        not cluster.peers.local.leader
                        or not cluster.peers.local.swarm_complete
                    ):
                        # A leader with an incomplete swarm asks everyone,
                        # otherwise only the monitored peer is queried.
                        target = (
                            "*"
                            if cluster.peers.local.leader
                            and not cluster.peers.local.swarm_complete
                            else name
                        )
                        try:
                            ticket, receivers = await cluster.send_command(
                                "STATUS", target
                            )
                        except ConnectionResetError:
                            break

                        async with cluster.receiving:
                            elect_leader(cluster.peers)

                        await cluster.await_receivers(
                            ticket, receivers, raise_err=False, timeout=3
                        )

                    c = 0

            except TimeoutError:
                timeout_c += 1
                continue
            except (
                AssertionError,
                ConnectionResetError,
                asyncio.exceptions.IncompleteReadError,
            ):
                logger.error(f"Peer {name} failed")
                break

        if c != -1:
            # Closing is best effort and bounded to 0.1s per writer.
            for writer in (iwriter, owriter):
                try:
                    writer.close()
                    async with asyncio.timeout(0.1):
                        await writer.wait_closed()
                except (ConnectionResetError, TimeoutError):
                    pass

    def _on_task_done(self, task: asyncio.Task):
        """Schedule peer cleanup once a peer worker task has finished.

        The finished task's name is the peer name, which is handed to
        ``_cleanup_peer_connection``. The finished task is removed from
        ``self.tasks``.

        Args:
            task: the peer worker task that completed.
        """
        cleanup = asyncio.create_task(self._cleanup_peer_connection(task.get_name()))
        # Hold a strong reference until the cleanup finishes.
        self._cleanup_tasks.add(cleanup)
        cleanup.add_done_callback(self._cleanup_tasks.discard)
        self.tasks.discard(task)

    async def start_peer_monitoring(self, name):
        """Start a peer worker task for ``name``.

        Args:
            name: name of the peer to monitor; also used as the task name.

        Raises:
            ZombiePeer: if a worker task with that name is still tracked.
        """
        if any(task.get_name() == name for task in self.tasks):
            raise ZombiePeer(name)

        t = asyncio.create_task(self._peer_worker(name), name=name)
        self.tasks.add(t)
        t.add_done_callback(self._on_task_done)