RoboCup 3D - 动作优化服务器死机问题排查与解决方案

Lunayic

2024-03-24 (Updated: 2025-05-18)

RoboCup 3D

4/4 Update

观察了好久，现在遇到的奇怪问题都可以重启了。

比如下图黄色所示。似乎是v3版本的 Kick Optimization 特有的，而且仅在 10700f 的机器上出现，另外一台 13400f 似乎没有。

经过观察，有四个环境，可能会有其中某个环境机器人会因为踢球前的 Positioning 卡好久，和之前 C++ 的优化打开环境噪声时的表现一样，然后Game Time就会跑过十分钟从而中止，但是因为我设置的超时信号，所以在时间横坐标图可以看到一段不那么连贯的折线。

目前也不清楚从何下手，暂时搁置了这个问题。

应该和之前 “server开久会出问题” 没有关系，说起这个，之前的定时重启wrapper的重启函数其实并没有真正调用）也算是小BUG，修好后截至目前没有再观测到其他运行失败的现象。

wrapper 代码如下

Restart Wrapper

from datetime import datetime, timedelta
from typing import Any
import gymnasium as gym

from config import Config


class RSWrapper(gym.Wrapper):
    def __init__(self, env: gym.Env):
        super().__init__(env)

        # 获取配置变量
        kira_conf: Config = self.env.get_wrapper_attr('config')
        self.restart_hour = kira_conf.restart_hour

        # 环境编写好重启函数
        self.restart_func = self.env.get_wrapper_attr('restart_rcss')

        # 上一次重启时间
        self.last_restart = datetime.now()

        if self.restart_hour is None:
            raise ValueError("config.restart_hour is None")

    def reset(
        self, *, seed: int | None = None, options: dict[str, Any] | None = None
    ) -> tuple[Any, dict[str, Any]]:
        """
        wrapper reset() func, use to restart before reset(),
        restart rcss_server at reset() can avoid break step() frame
        """
        current_time = datetime.now()
        if current_time - self.last_restart >= timedelta(hours=self.restart_hour):
            try:
                print("Time schedule restart...")
                self.restart_func()
                print("Restart over")
            except AttributeError:
                print(f"Not implement restart_rcss() in {self.env.__class__.__name__}")
                print(f"Skip restart")
            except Exception as e:
                print(f"Error when restart: {e}")
                raise
            self.last_restart = current_time
        return super().reset(seed=seed, options=options)

Safe Wrapper

没想好它的名字，随便取一个

import time
import signal
from typing import Any, Callable
import gymnasium as gym
from contextlib import contextmanager
from config import Config


class TimeoutException(Exception):
    pass


class SafeWrapper(gym.Wrapper):
    def __init__(self, env: gym.Env):
        super().__init__(env)
        kira_conf: Config = self.env.get_wrapper_attr('config')

        # 限制重启次数
        self.retry_limit = kira_conf.retry_limit
        
        # 限制step and reset的执行时间
        # 偷懒没把这两个过程的计时分开, 理论上用一个也行
        self.time_limit = kira_conf.step_timeout

        assert self.retry_limit > 0, "Retry times must be greater than 0"
        assert self.time_limit > 0, "Time limit must be greater than 0"

        # 保存上一次的obs, 方便在step错误返回时使用
        self.last_observation = None

        # 重启标志, 同样仅在reset()中重启
        self.should_restart = False

    def reset(
        self, *, seed: int | None = None, options: dict[str, Any] | None = None
    ) -> tuple[Any, dict[str, Any]]:
        wrap_reset = self._execute_with_time_limit(
            self.env.reset, seed=seed, options=options
        )

        for i in range(self.retry_limit + 1):
            if self.should_restart:
                print("Reset error occurred, restarting environment...")
                self.restart_env()
                self.should_restart = False
            try:
                observation, info = wrap_reset()
                break
            except KeyboardInterrupt:
                print("\nCaught KeyboardInterrupt, closing env")
                raise
            except Exception as e:
                player = self.env.get_wrapper_attr(
                    "player"
                )  # restart rcss env will reset agent
                if isinstance(e, TimeoutException):
                    player.world.log(f"Timed out at reset()")
                else:
                    player.world.log(
                        f"\nCaught exception: '{e}', waiting 3 seconds and abort this episode"
                    )
                time.sleep(3)
                self.should_restart = True
                if i == self.retry_limit:
                    print(
                        f"Failed to reset environment after {self.retry_limit} retries"
                    )
                    raise

        return observation, info

    def step(self, action):
        wrap_step = self._execute_with_time_limit(self.env.step, action)

        try:
            self.last_observation, reward, done, truncated, info = wrap_step()
        except KeyboardInterrupt:
            print("\nCaught KeyboardInterrupt, closing env")
            raise
        except Exception as e:
            player = self.env.get_wrapper_attr("player")
            if isinstance(e, TimeoutException):
                player.world.log(f"Timed out at step()")
            else:
                player.world.log(
                    f"\nCaught exception: '{e}', waiting 3 seconds and abort this episode"
                )
            # 若遇到错误, 比如server挂了, agent send/recv异常或者超时, 则该次episode直接结束
            # 并且下一次episode开始时尝试重启
            # 不在这里重启是因为会重置agent state, 不方便也不适合重启
            time.sleep(3)
            self.should_restart = True
            reward = 0
            done = True
            truncated = True
            info = {}

        return self.last_observation, reward, done, truncated, info

    def restart_env(self):
        try:
            self.env.get_wrapper_attr('restart_rcss')()
        except Exception as e:
            print(f"Failed to restart environment: {e}")
            raise

    @contextmanager
    def _timeout(self, seconds: int):
        """
        TImeout signal wrapper
        其实和装饰器玩法差不多
        """
        def _handle_timeout(signum, frame):
            raise TimeoutException(f"Timeout: {seconds} seconds")

        signal.signal(signal.SIGALRM, _handle_timeout)
        signal.alarm(int(seconds))
        try:
            yield
        finally:
            signal.alarm(0)

    def _execute_with_time_limit(self, func: Callable, *args, **kwargs):
        """Time Limit function wrapper
        """
        if self.time_limit > 0:

            def chunk():
                with self._timeout(self.time_limit * 60):
                    return func(*args, **kwargs)

        else:

            def chunk():
                return func(*args, **kwargs)

        return chunk

背景

优化老莫名奇妙挂掉，前几次以为是小崽子们同一个端口开了server，warning之后又炸了，于是想到前辈们口口相传的server开久会出问题的传说，加了个定时重启wrapper，结果还炸！

复现一次大概要3M steps，继续等了两天。

从数据面板上看就是reward突然掉下去，RoboViz看是一片黑，就像是端口被抢占了。

python调试

学了一下 Python 的调试，gdb那边因为用的conda，python3-dgb这个包装的时候版本不对，懒得折腾，找了另一个项目：pystack。

pystack

PyStack is a tool that uses forbidden magic to let you inspect the stack frames of a running Python process or a Python core dump, helping you quickly and easily learn what it’s doing (or what it was doing when it crashed) without having to interpret nasty CPython internals.

apt 安装一些依赖

1	sudo apt install libdw-dev libelf-dev pkg-config

git clone [email protected]:bloomberg/pystack.git pystack
cd pystack
# 进入虚拟环境
python3 -m pip install --upgrade pip wheel 
python3 -m pip install -e .
python3 -m pip install -r requirements-test.txt -r requirements-extra.txt

attach

没用到什么高级的方法，只是为了打印卡住时的堆栈，删掉了库函数的调用栈

~/pystack (main) » pystack remote 449389
Traceback for thread 449389 (python) [] (most recent call last):
	...
    (Python) File "train/wrapper/safe.py", line 28, in reset
        state, info = self.env.reset(seed=seed, options=options)
    (Python) File "cup_env/kick/short_kick.py", line 105, in reset
        self.reset_player(agent_init_pos, agent_init_ori, stabilize=True)
    (Python) File "cup_env/base/single_player_env.py", line 80, in reset_player
        self.player.scom.unofficial_beam(
    (Python) File "communication/Server_Comm.py", line 264, in unofficial_beam
        self.monitor_socket.send((len(msg)).to_bytes(4, byteorder="big") + msg)

可以看出是在reset时，agent往server发beam消息卡住了。但是socket没报错关闭，程序也就没有结束，很诡异

~ » netstat netstat -a | grep '410[0-4]'
tcp        0      0 0.0.0.0:4101            0.0.0.0:*               LISTEN
tcp        0      0 0.0.0.0:4100            0.0.0.0:*               LISTEN
tcp        0      0 0.0.0.0:4103            0.0.0.0:*               LISTEN
tcp        0      0 0.0.0.0:4102            0.0.0.0:*               LISTEN
tcp        0      0 0.0.0.0:4104            0.0.0.0:*               LISTEN
tcp        0      0 localhost:49386         localhost:4102          ESTABLISHED
tcp        0      0 localhost:4103          localhost:38608         ESTABLISHED
tcp        0      0 localhost:4102          localhost:49386         ESTABLISHED
tcp        0      0 localhost:4101          localhost:45800         ESTABLISHED
tcp        0      0 localhost:42186         localhost:4100          ESTABLISHED
tcp        0      0 localhost:4104          localhost:40134         ESTABLISHED
tcp        0      0 localhost:45800         localhost:4101          ESTABLISHED
tcp        0      0 localhost:40134         localhost:4104          ESTABLISHED
tcp        0      0 localhost:4100          localhost:42186         ESTABLISHED
tcp        0      0 localhost:38608         localhost:4103          ESTABLISHED

看一下其他三个 train_env 和一个 eval_env，都卡在了和父进程的通信这，是等<449389>返回信息？

~/pystack (main) » pystack remote 449390
Traceback for thread 449390 (python) [] (most recent call last):
    (Python) File "<string>", line 1, in <module>
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/multiprocessing/forkserver.py", line 274, in main
        code = _serve_one(child_r, fds,

    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/multiprocessing/forkserver.py", line 313, in _serve_one
        code = spawn._main(child_r, parent_sentinel)
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/multiprocessing/spawn.py", line 135, in _main
        return self._bootstrap(parent_sentinel)
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
        self.run()
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/multiprocessing/process.py", line 108, in run
        self._target(*self._args, **self._kwargs)
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 33, in _worker
        cmd, data = remote.recv()
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/multiprocessing/connection.py", line 250, in recv
        buf = self._recv_bytes()
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes
        buf = self._recv(4)
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/multiprocessing/connection.py", line 395, in _recv
        chunk = read(handle, remaining)

再看一眼main.py（只保留了learn()线程的堆栈信息）

~/pystack (main) » pystack remote 449276
...

Traceback for thread 449276 (python) [] (most recent call last):
    (Python) File "./main.py", line 39, in <module>
        trainer.train()
    (Python) File "train/base.py", line 144, in train
        self.learn_model(

    (Python) File "train/base.py", line 239, in learn_model
        model.learn(

    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/site-packages/stable_baselines3/ppo/ppo.py", line 315, in learn
        return super().learn(

    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/site-packages/stable_baselines3/common/on_policy_algorithm.py", line 277, in learn
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/site-packages/stable_baselines3/common/on_policy_algorithm.py", line 194, in collect_rollouts
        new_obs, rewards, dones, infos = env.step(clipped_actions)
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/site-packages/stable_baselines3/common/vec_env/base_vec_env.py", line 206, in step
        return self.step_wait()
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 129, in step_wait
        results = [remote.recv() for remote in self.remotes]
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/multiprocessing/connection.py", line 250, in recv
        buf = self._recv_bytes()
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes
        buf = self._recv(4)
    (Python) File ".miniconda3/envs/rc3d-312/lib/python3.12/multiprocessing/connection.py", line 395, in _recv
        chunk = read(handle, remaining)

应该就是某个环境挂了同步等待。

暂时不明白什么原因，也没设置server log，最直接的处理办法就是超时重启了（

先写一个wrapper，用 alarm 计时，超时重启server，希望有用。

有效了再更新。