Baiting Luo is a graduate student in the Department of Electrical Engineering and Computer Science at Vanderbilt University and works as a research assistant at the Institute for Software Integrated Systems. He received his M.S. degree in Computer Engineering from Northwestern University in June 2021 and his Bachelor’s degree in Computer Engineering and Computer Science from Rensselaer Polytechnic Institute in 2019.
Baiting Luo Publications
- N. S. Keplinger, B. Luo, I. Bektas, Y. Zhang, K. H. Wray, A. Laszka, A. Dubey, and A. Mukhopadhyay, NS-Gym: Open-Source Simulation Environments and Benchmarks for Non-Stationary Markov Decision Processes. 2025.
@misc{keplinger2025nsgymopensourcesimulationenvironments, author = {Keplinger, Nathaniel S. and Luo, Baiting and Bektas, Iliyas and Zhang, Yunuo and Wray, Kyle Hollins and Laszka, Aron and Dubey, Abhishek and Mukhopadhyay, Ayan}, title = {NS-Gym: Open-Source Simulation Environments and Benchmarks for Non-Stationary Markov Decision Processes}, year = {2025}, archiveprefix = {arXiv}, contribution = {colab}, eprint = {2501.09646}, primaryclass = {cs.AI}, url = {https://arxiv.org/abs/2501.09646} }
In many real-world applications, agents must make sequential decisions in environments where conditions are subject to change due to various exogenous factors. These non-stationary environments pose significant challenges to traditional decision-making models, which typically assume stationary dynamics. Non-stationary Markov decision processes (NS-MDPs) offer a framework to model and solve decision problems under such changing conditions. However, the lack of standardized benchmarks and simulation tools has hindered systematic evaluation and advance in this field. We present NS-Gym, the first simulation toolkit designed explicitly for NS-MDPs, integrated within the popular Gymnasium framework. In NS-Gym, we segregate the evolution of the environmental parameters that characterize non-stationarity from the agent’s decision-making module, allowing for modular and flexible adaptations to dynamic environments. We review prior work in this domain and present a toolkit encapsulating key problem characteristics and types in NS-MDPs. This toolkit is the first effort to develop a set of standardized interfaces and benchmark problems to enable consistent and reproducible evaluation of algorithms under non-stationary conditions. We also benchmark six algorithmic approaches from prior work on NS-MDPs using NS-Gym. Our vision is that NS-Gym will enable researchers to assess the adaptability and robustness of their decision-making algorithms to non-stationary conditions.
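NS-Gym's own interfaces are documented in the paper and repository; purely as an illustration of the idea of keeping parameter evolution outside the agent's decision loop, the sketch below uses a standard gymnasium.Wrapper around CartPole-v1 and drifts its force magnitude between episodes. The wrapper name, drift schedule, and clipping bounds are assumptions for the example, not NS-Gym code.

```python
# Illustrative sketch only (not the NS-Gym API): the environment's parameters
# evolve outside the agent's decision loop, so the agent never observes the
# schedule that changes the dynamics.
import gymnasium as gym
import numpy as np


class DriftingCartPole(gym.Wrapper):
    """Hypothetical wrapper that drifts CartPole's force magnitude each episode."""

    def __init__(self, env, drift_std=0.5):
        super().__init__(env)
        self.drift_std = drift_std

    def reset(self, **kwargs):
        base = self.env.unwrapped
        # Random-walk the applied force; the agent is never told this happened.
        base.force_mag = float(np.clip(base.force_mag + np.random.normal(0.0, self.drift_std), 5.0, 15.0))
        return self.env.reset(**kwargs)


env = DriftingCartPole(gym.make("CartPole-v1"))
obs, info = env.reset(seed=0)
for _ in range(200):
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    if terminated or truncated:
        obs, info = env.reset()
```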
- B. Luo, A. Pettet, A. Laszka, A. Dubey, and A. Mukhopadhyay, Scalable Decision-Making In Stochastic Environments Through Learned Temporal Abstraction, in Proceedings of the 13th International Conference on Learning Representations, Singapore, 2025.
@inproceedings{luo2025scalable, author = {Luo, Baiting and Pettet, Ava and Laszka, Aron and Dubey, Abhishek and Mukhopadhyay, Ayan}, booktitle = {Proceedings of the 13th International Conference on Learning Representations, Singapore}, title = {Scalable Decision-Making In Stochastic Environments Through Learned Temporal Abstraction}, year = {2025}, organization = {International Conference on Learning Representations}, acceptance = {32.8}, category = {selective}, contribution = {colab} }
Sequential decision-making in high-dimensional continuous action spaces, particularly in stochastic environments, faces significant computational challenges. We explore this challenge in the traditional offline RL setting, where an agent must learn how to make decisions based on data collected through a stochastic behavior policy. We present Latent Macro Action Planner (L-MAP), which addresses this challenge by learning a set of temporally extended macro-actions through a state-conditional Vector Quantized Variational Autoencoder (VQ-VAE), effectively reducing action dimensionality. L-MAP employs a (separate) learned prior model that acts as a latent transition model and allows efficient sampling of plausible actions. During planning, our approach accounts for stochasticity in both the environment and the behavior policy by using Monte Carlo tree search (MCTS). In offline RL settings, including stochastic continuous control tasks, L-MAP efficiently searches over discrete latent actions to yield high expected returns. Empirical results demonstrate that L-MAP maintains low decision latency despite increased action dimensionality. Notably, across tasks ranging from continuous control with inherently stochastic dynamics to high-dimensional robotic hand manipulation, L-MAP significantly outperforms existing model-based methods and performs on par with strong model-free actor-critic baselines, highlighting the effectiveness of the proposed approach in planning in complex and stochastic environments with high-dimensional action spaces.
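The model details (state-conditional VQ-VAE, learned latent prior, MCTS) are in the paper; the toy numpy sketch below only illustrates the quantization step, in which a flattened macro-action is snapped to its nearest codebook entry so a planner can branch over a single discrete index. The horizon, dimensions, and random codebook are placeholders; in L-MAP the codebook is learned.

```python
# Toy sketch of the vector-quantization idea (not the L-MAP model): a macro-action
# (H consecutive continuous actions) is flattened and snapped to the nearest
# codebook entry, yielding a discrete latent index that a tree search can branch on.
import numpy as np

rng = np.random.default_rng(0)

H, action_dim, codebook_size = 4, 6, 128                      # horizon, action dim, number of codes
codebook = rng.normal(size=(codebook_size, H * action_dim))   # learned by a VQ-VAE in the real method

macro_action = rng.normal(size=(H, action_dim))               # e.g. sampled from a behavior policy
flat = macro_action.reshape(-1)

# Nearest-neighbor lookup: the discrete "latent action" the planner searches over.
distances = np.linalg.norm(codebook - flat, axis=1)
code_index = int(np.argmin(distances))
reconstructed = codebook[code_index].reshape(H, action_dim)

print(code_index, np.linalg.norm(reconstructed - macro_action))
```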
- Y. Zhang, B. Luo, A. Mukhopadhyay, and A. Dubey, Observation Adaptation via Annealed Importance Resampling for POMDPs, in Proceedings of the 35th International Conference on Automated Planning and Scheduling (ICAPS), 2025.
@inproceedings{zhang2025observation, author = {Zhang, Yunuo and Luo, Baiting and Mukhopadhyay, Ayan and Dubey, Abhishek}, booktitle = {Proceedings of the 35th International Conference on Automated Planning and Scheduling ({ICAPS})}, title = {Observation Adaptation via Annealed Importance Resampling for {POMDPs}}, year = {2025}, note = {accepted as oral presentation}, publisher = {{AAAI} Press}, acceptance = {22.8}, category = {selective}, contribution = {lead}, url = {https://openreview.net/forum?id=KxxxyuOFg2} }
Partially observable Markov decision processes (POMDPs) are a general mathematical model for sequential decision-making in stochastic environments under state uncertainty. POMDPs are often solved online, which enables the algorithm to adapt to new information in real time. Online solvers typically use bootstrap particle filters based on importance resampling for updating the belief distribution. Since directly sampling from the ideal state distribution given the latest observation and previous state is infeasible, particle filters approximate the posterior belief distribution by propagating states and adjusting weights through prediction and resampling steps. However, in practice, the importance resampling technique often leads to particle degeneracy and sample impoverishment when the state transition model poorly aligns with the posterior belief distribution, especially when the received observation is noisy. We propose an approach that constructs a sequence of bridge distributions between the state-transition and optimal distributions through iterative Monte Carlo steps, better accommodating noisy observations in online POMDP solvers. Our algorithm demonstrates significantly superior performance compared to state-of-the-art methods when evaluated across multiple challenging POMDP domains.
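As a rough illustration of moving particles toward a noisy observation through a sequence of tempered (bridge) distributions rather than a single importance-resampling step, here is a generic numpy sketch; the temperature schedule, jitter move, and one-dimensional toy model are assumptions and not the paper's algorithm.

```python
# Generic sketch of an annealed importance-resampling belief update (illustrative,
# not the paper's exact algorithm). Particles propagated through the transition
# model are moved toward the observation through tempered likelihoods beta_1 < ... < 1.
import numpy as np

rng = np.random.default_rng(1)

def log_likelihood(particles, obs, noise_std=0.5):
    return -0.5 * ((particles - obs) / noise_std) ** 2

def annealed_update(particles, obs, betas=(0.25, 0.5, 0.75, 1.0), step_std=0.1):
    prev_beta = 0.0
    for beta in betas:
        # Bridge distribution: reweight by the increment of the tempered likelihood.
        log_w = (beta - prev_beta) * log_likelihood(particles, obs)
        w = np.exp(log_w - log_w.max())
        w /= w.sum()
        idx = rng.choice(len(particles), size=len(particles), p=w)          # resample
        particles = particles[idx] + rng.normal(0.0, step_std, len(particles))  # crude move/jitter step
        prev_beta = beta
    return particles

# Particles from a (toy) transition model, then adapted to a noisy observation.
particles = rng.normal(0.0, 1.0, size=1000)
particles = annealed_update(particles, obs=2.0)
print(particles.mean(), particles.std())
```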
- B. Luo, Y. Zhang, A. Dubey, and A. Mukhopadhyay, Act as You Learn: Adaptive Decision-Making in Non-Stationary Markov Decision Processes, in Proceedings of the 23rd International Conference on Autonomous Agents and Multiagent Systems, Richland, SC, 2024, pp. 1301–1309.
@inproceedings{baiting2024AAMAS, author = {Luo, Baiting and Zhang, Yunuo and Dubey, Abhishek and Mukhopadhyay, Ayan}, booktitle = {Proceedings of the 23rd International Conference on Autonomous Agents and Multiagent Systems}, title = {Act as You Learn: Adaptive Decision-Making in Non-Stationary Markov Decision Processes}, year = {2024}, address = {Richland, SC}, pages = {1301–1309}, acceptance = {20}, publisher = {International Foundation for Autonomous Agents and Multiagent Systems}, series = {AAMAS '24}, contribution = {colab}, isbn = {9798400704864}, keywords = {monte carlo tree search, non-stationary environments, online planning, sequential decision-making}, location = {Auckland, New Zealand}, numpages = {9} }
A fundamental challenge in sequential decision-making is dealing with non-stationary environments, where exogenous environmental conditions change over time. Such problems are traditionally modeled as non-stationary Markov decision processes (NS-MDP). However, existing approaches for decision-making in NS-MDPs have two major shortcomings: first, they assume that the updated environmental dynamics at the current time are known (although future dynamics can change); and second, planning is largely pessimistic, i.e., the agent acts "safely" to account for the non-stationary evolution of the environment. We argue that both these assumptions are invalid in practice: updated environmental conditions are rarely known, and as the agent interacts with the environment, it can learn about the updated dynamics and avoid being pessimistic, at least in states whose dynamics it is confident about. We present a heuristic search algorithm called Adaptive Monte Carlo Tree Search (ADA-MCTS) that addresses these challenges. We show that the agent can learn the updated dynamics of the environment over time and then act as it learns, i.e., if the agent is in a region of the state space about which it has updated knowledge, it can avoid being pessimistic. To quantify "updated knowledge," we disintegrate the aleatoric and epistemic uncertainty in the agent’s updated belief and show how the agent can use these estimates for decision-making. We compare the proposed approach with multiple state-of-the-art approaches in decision-making across multiple well-established open-source problems and empirically show that our approach is faster and more adaptive without sacrificing safety.
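The abstract does not spell out the uncertainty estimator, so the sketch below shows only a standard ensemble-based decomposition (mean member entropy as aleatoric, mutual information as epistemic) of the kind an agent could consult when deciding where to stop being pessimistic; the ensemble values are made up for the example and this is not ADA-MCTS's exact estimator.

```python
# Standard ensemble-based decomposition of predictive uncertainty, shown as a
# generic illustration of "aleatoric vs. epistemic" estimates (not the paper's method).
import numpy as np

def entropy(p, axis=-1, eps=1e-12):
    return -np.sum(p * np.log(p + eps), axis=axis)

# ensemble_probs[k, s'] : member k's predicted distribution over next states.
ensemble_probs = np.array([
    [0.70, 0.20, 0.10],
    [0.65, 0.25, 0.10],
    [0.10, 0.30, 0.60],   # one member disagrees -> epistemic uncertainty
])

mean_probs = ensemble_probs.mean(axis=0)
total = entropy(mean_probs)                     # total predictive uncertainty
aleatoric = entropy(ensemble_probs).mean()      # expected per-member entropy
epistemic = total - aleatoric                   # mutual information (ensemble disagreement)

print(f"total={total:.3f}  aleatoric={aleatoric:.3f}  epistemic={epistemic:.3f}")
```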
- A. Pettet, Y. Zhang, B. Luo, K. Wray, H. Baier, A. Laszka, A. Dubey, and A. Mukhopadhyay, Decision Making in Non-Stationary Environments with Policy-Augmented Search, in Proceedings of the 23rd International Conference on Autonomous Agents and Multiagent Systems, Richland, SC, 2024, pp. 2417–2419.
@inproceedings{pettet2024decision, author = {Pettet, Ava and Zhang, Yunuo and Luo, Baiting and Wray, Kyle and Baier, Hendrik and Laszka, Aron and Dubey, Abhishek and Mukhopadhyay, Ayan}, booktitle = {Proceedings of the 23rd International Conference on Autonomous Agents and Multiagent Systems}, title = {Decision Making in Non-Stationary Environments with Policy-Augmented Search}, year = {2024}, address = {Richland, SC}, acceptance = {36}, pages = {2417–2419}, publisher = {International Foundation for Autonomous Agents and Multiagent Systems}, series = {AAMAS '24}, contribution = {lead}, isbn = {9798400704864}, note = {extended abstract}, keywords = {mcts, non-stationary environments, sequential decision-making}, location = {Auckland, New Zealand}, numpages = {3} }
Sequential decision-making is challenging in non-stationary environments, where the environment in which an agent operates can change over time. Policies learned before execution become stale when the environment changes, and relearning takes time and computational effort. Online search, on the other hand, can return sub-optimal actions when there are limitations on allowed runtime. In this paper, we introduce Policy-Augmented Monte Carlo Tree Search (PA-MCTS), which combines action-value estimates from an out-of-date policy with an online search using an up-to-date model of the environment. We prove several theoretical results about PA-MCTS. We also compare and contrast our approach with AlphaZero, another hybrid planning approach, and Deep Q-Learning on several OpenAI Gym environments and show that PA-MCTS outperforms these baselines.
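One natural reading of "combines action-value estimates from an out-of-date policy with an online search" is a convex mixture of the two estimates; the sketch below shows that action-selection rule with a fixed, hypothetical weight alpha, whereas the paper analyzes the combination formally.

```python
# Minimal sketch of mixing a stale policy's action values with fresh online-search
# estimates via a weight alpha (one reading of the abstract; the paper's exact rule
# and its analysis of alpha may differ).
from typing import Dict

def hybrid_action(q_policy: Dict[str, float],
                  q_search: Dict[str, float],
                  alpha: float = 0.5) -> str:
    """Pick the action maximizing alpha*Q_policy + (1 - alpha)*Q_search."""
    return max(q_policy, key=lambda a: alpha * q_policy[a] + (1 - alpha) * q_search[a])

# The policy was trained before the dynamics changed; the search uses the
# up-to-date model but had little time to run, so both estimates are imperfect.
q_policy = {"left": 1.0, "right": 0.4}   # stale but informative
q_search = {"left": 0.2, "right": 0.9}   # fresh but low-budget
print(hybrid_action(q_policy, q_search, alpha=0.3))
```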
- Y. Zhang, B. Luo, A. Mukhopadhyay, D. Stojcsics, D. Elenius, A. Roy, S. Jha, M. Maroti, X. Koutsoukos, G. Karsai, and A. Dubey, Shrinking POMCP: A Framework for Real-Time UAV Search and Rescue, in 2024 International Conference on Assured Autonomy (ICAA), 2024, pp. 48–57.
@inproceedings{zhang2024, author = {Zhang, Yunuo and Luo, Baiting and Mukhopadhyay, Ayan and Stojcsics, Daniel and Elenius, Daniel and Roy, Anirban and Jha, Susmit and Maroti, Miklos and Koutsoukos, Xenofon and Karsai, Gabor and Dubey, Abhishek}, booktitle = {2024 International Conference on Assured Autonomy (ICAA)}, title = {Shrinking POMCP: A Framework for Real-Time UAV Search and Rescue}, year = {2024}, pages = {48--57}, contribution = {lead}, doi = {10.1109/ICAA64256.2024.00016}, keywords = {Three-dimensional displays;Navigation;Markov decision processes;Urban areas;Probabilistic logic;Real-time systems;Trajectory;Maintenance;Time factors;Optimization;Search and Rescue;POMDP;MCTS} }
Efficient path optimization for drones in search and rescue operations faces challenges, including limited visibility, time constraints, and complex information gathering in urban environments. We present a comprehensive approach to optimize UAV-based search and rescue operations in neighborhood areas, utilizing both a 3D AirSim-ROS2 simulator and a 2D simulator. The path planning problem is formulated as a partially observable Markov decision process (POMDP), and we propose a novel "Shrinking POMCP" approach to address time constraints. In the AirSim environment, we integrate our approach with a probabilistic world model for belief maintenance and a neurosymbolic navigator for obstacle avoidance. The 2D simulator employs surrogate ROS2 nodes with equivalent functionality. We compare trajectories generated by different approaches in the 2D simulator and evaluate performance across various belief types in the 3D AirSim-ROS2 simulator. Experimental results from both simulators demonstrate that our proposed shrinking POMCP solution achieves significant improvements in search times compared to alternative methods, showcasing its potential for enhancing the efficiency of UAV-assisted search and rescue operations.
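As background for the POMDP formulation mentioned above, the sketch below shows the simplest ingredient of such a search problem: a belief over the victim's grid cell and its Bayes update after a noisy look at one cell. The grid size, detection probability, and helper names are illustrative; the planner itself (Shrinking POMCP) is described in the paper.

```python
# Toy illustration of the belief-maintenance ingredient of a search-and-rescue POMDP:
# generic Bayesian filtering over a grid, not the paper's planner.
import numpy as np

grid = (5, 5)
belief = np.full(grid, 1.0 / 25)          # uniform prior over the victim's location
p_detect = 0.9                            # probability of seeing the victim if present

def observe(belief, cell, saw_victim):
    """Bayes update of the belief after looking at `cell`."""
    is_cell = np.arange(belief.size).reshape(grid) == np.ravel_multi_index(cell, grid)
    likelihood = np.where(is_cell,
                          p_detect if saw_victim else 1 - p_detect,
                          0.0 if saw_victim else 1.0)
    posterior = belief * likelihood
    return posterior / posterior.sum()

belief = observe(belief, cell=(2, 3), saw_victim=False)   # looked at (2, 3), saw nothing
print(belief[2, 3], belief[0, 0])                          # probability mass shifts away from (2, 3)
```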
- B. Luo, S. Ramakrishna, A. Pettet, C. Kuhn, A. Dubey, G. Karsai, and A. Mukhopadhyay, Dynamic Simplex: Balancing Safety and Performance in Autonomous Cyber Physical Systems, in Proceedings of the ACM/IEEE 14th International Conference on Cyber-Physical Systems (with CPS-IoT Week 2023), New York, NY, USA, 2023, pp. 177–186.
@inproceedings{baiting2023iccps, author = {Luo, Baiting and Ramakrishna, Shreyas and Pettet, Ava and Kuhn, Christopher and Dubey, Abhishek and Karsai, Gabor and Mukhopadhyay, Ayan}, booktitle = {Proceedings of the ACM/IEEE 14th International Conference on Cyber-Physical Systems (with CPS-IoT Week 2023)}, title = {Dynamic Simplex: Balancing Safety and Performance in Autonomous Cyber Physical Systems}, year = {2023}, address = {New York, NY, USA}, pages = {177--186}, publisher = {Association for Computing Machinery}, series = {ICCPS '23}, acceptance = {25.6}, contribution = {colab}, doi = {10.1145/3576841.3585934}, isbn = {9798400700361}, location = {San Antonio, TX, USA}, numpages = {10}, url = {https://doi.org/10.1145/3576841.3585934} }
Learning Enabled Components (LEC) have greatly assisted cyber-physical systems in achieving higher levels of autonomy. However, LEC’s susceptibility to dynamic and uncertain operating conditions is a critical challenge for the safety of these systems. Redundant controller architectures have been widely adopted for safety assurance in such contexts. These architectures augment "performant" LEC controllers, which are difficult to verify, with "safety" controllers and a decision logic that switches between them. While these architectures ensure safety, we point out two limitations. First, they are trained offline to learn a conservative policy of always selecting a controller that maintains the system’s safety, which limits the system’s adaptability to dynamic and non-stationary environments. Second, they do not support reverse switching from the safety controller to the performant controller, even when the threat to safety is no longer present. To address these limitations, we propose a dynamic simplex strategy with an online controller switching logic that allows two-way switching. We consider switching as a sequential decision-making problem and model it as a semi-Markov decision process. We leverage a combination of a myopic selector using surrogate models (for the forward switch) and a non-myopic planner (for the reverse switch) to balance safety and performance. We evaluate this approach using an autonomous vehicle case study in the CARLA simulator using different driving conditions, locations, and component failures. We show that the proposed approach results in fewer collisions and higher performance than state-of-the-art alternatives.
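A minimal sketch of the two-way switching logic described above, with hypothetical thresholds standing in for the learned surrogate risk model (forward switch) and the non-myopic planner (reverse switch); the paper models switching as a semi-Markov decision process rather than fixed cutoffs.

```python
# Hedged sketch of two-way controller switching (hypothetical names and thresholds;
# not the paper's decision model).
from dataclasses import dataclass

@dataclass
class Arbiter:
    controller: str = "performant"
    forward_risk_threshold: float = 0.7   # myopic check: hand over to the safety controller
    reverse_horizon_risk: float = 0.2     # non-myopic check: switch back only if future risk stays low

    def step(self, current_risk: float, predicted_max_risk: float) -> str:
        if self.controller == "performant" and current_risk > self.forward_risk_threshold:
            self.controller = "safety"            # forward switch
        elif self.controller == "safety" and predicted_max_risk < self.reverse_horizon_risk:
            self.controller = "performant"        # reverse switch
        return self.controller

arbiter = Arbiter()
print(arbiter.step(current_risk=0.9, predicted_max_risk=0.8))   # -> safety
print(arbiter.step(current_risk=0.3, predicted_max_risk=0.1))   # -> performant
```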
- S. Ramakrishna, B. Luo, Y. Barve, G. Karsai, and A. Dubey, Risk-Aware Scene Sampling for Dynamic Assurance of Autonomous Systems, in 2022 IEEE International Conference on Assured Autonomy (ICAA) (ICAA’22), virtual, Puerto Rico, 2022.
@inproceedings{ICAA2022, author = {Ramakrishna, Shreyas and Luo, Baiting and Barve, Yogesh and Karsai, Gabor and Dubey, Abhishek}, booktitle = {2022 IEEE International Conference on Assured Autonomy (ICAA) (ICAA'22)}, title = {{Risk-Aware} Scene Sampling for Dynamic Assurance of Autonomous Systems}, year = {2022}, address = {virtual, Puerto Rico}, month = mar, contribution = {lead}, days = {22}, keywords = {Cyber-Physical Systems; Dynamic Assurance; Dynamic Risk; High-Risk Scenes; Bow-Tie Diagram; Hazards}, tag = {ai4cps} }
Autonomous Cyber-Physical Systems must often operate under uncertainties like sensor degradation and distribution shifts in the operating environment, thus increasing operational risk. Dynamic Assurance of these systems requires augmenting runtime safety components like out-of-distribution detectors and risk estimators. Designing these safety components requires labeled data from failure conditions and risky corner cases that fail the system. However, collecting real-world data of these high-risk scenes can be expensive and sometimes impossible. To address this, there are several scenario description languages with sampling capability for generating synthetic data from simulators to replicate the scenes that are not possible in the real world. Most often, simple search-based techniques like random search and grid search are used as samplers. But we point out three limitations in using these techniques. First, they are passive samplers, which do not use the feedback of previous results in the sampling process. Second, the variables to be sampled may have constraints that need to be applied. Third, they do not balance the tradeoff between exploration and exploitation, which we hypothesize is needed for better coverage of the search space. We present a scene generation workflow with two samplers called Random Neighborhood Search (RNS) and Guided Bayesian Optimization (GBO). These samplers extend conventional random search and Bayesian Optimization search to address these three limitations. We demonstrate our approach using an Autonomous Vehicle case study in the CARLA simulator. To evaluate our samplers, we compared them against the baselines of random search, grid search, and Halton sequence search.
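To make the "active sampler with feedback" idea concrete, here is a toy loop in the spirit of Random Neighborhood Search: with some probability it explores a fresh random scene, otherwise it samples a bounded neighborhood of the riskiest scene found so far. The scene parameters, bounds, and risk function are placeholders; the actual samplers score scenes by running them in CARLA and also handle constraints.

```python
# Toy feedback-driven scene sampler (illustrative only; not the paper's RNS/GBO code).
import random

bounds = {"rain": (0.0, 1.0), "fog": (0.0, 1.0), "time_of_day": (0.0, 24.0)}

def random_scene():
    return {k: random.uniform(*b) for k, b in bounds.items()}

def neighborhood(scene, radius=0.1):
    # Sample near the current best scene, clipped to each variable's bounds.
    return {k: min(max(scene[k] + random.uniform(-radius, radius) * (b[1] - b[0]), b[0]), b[1])
            for k, b in bounds.items()}

def risk(scene):
    # Stand-in for running the scene in simulation and scoring how risky it was.
    return 0.6 * scene["rain"] + 0.4 * scene["fog"]

best, best_risk = random_scene(), 0.0
for _ in range(200):
    explore = random.random() < 0.3                             # exploration vs. exploitation
    candidate = random_scene() if explore or best_risk == 0.0 else neighborhood(best)
    r = risk(candidate)                                          # feedback from the previous result
    if r > best_risk:
        best, best_risk = candidate, r
print(best_risk, best)
```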
- S. Ramakrishna, B. Luo, C. B. Kuhn, G. Karsai, and A. Dubey, ANTI-CARLA: An Adversarial Testing Framework for Autonomous Vehicles in CARLA, in 2022 IEEE 25th International Conference on Intelligent Transportation Systems (ITSC), 2022, pp. 2620–2627.
@inproceedings{ramakrishna2022anticarla, author = {Ramakrishna, Shreyas and Luo, Baiting and Kuhn, Christopher B. and Karsai, Gabor and Dubey, Abhishek}, booktitle = {2022 IEEE 25th International Conference on Intelligent Transportation Systems (ITSC)}, title = {ANTI-CARLA: An Adversarial Testing Framework for Autonomous Vehicles in CARLA}, year = {2022}, pages = {2620-2627}, contribution = {lead}, doi = {10.1109/ITSC55140.2022.9921776} }
Despite recent advances in autonomous driving systems, accidents such as the fatal Uber crash in 2018 show that these systems are still susceptible to edge cases. Such systems need to be thoroughly tested and validated before being deployed in the real world to avoid such events. Testing in open-world scenarios can be difficult, time-consuming, and expensive. These challenges can be addressed by using driving simulators such as CARLA instead. A key part of such tests is adversarial testing, in which the goal is to find scenarios that lead to failures of the given system. While several independent efforts in adversarial testing have been made, a well-established testing framework that enables adaptive stress testing has yet to be made available for CARLA. We therefore propose ANTI-CARLA, an adversarial testing framework in CARLA. The operating conditions in which a given system should be tested are specified in a scenario description language. The framework offers an adversarial search mechanism that searches for operating conditions that will fail the tested system. In this way, ANTI-CARLA extends the CARLA simulator with the capability of performing adversarial testing on any given driving pipeline. We use ANTI-CARLA to test a driving pipeline trained with the Learning By Cheating (LBC) approach. The simulation results demonstrate that ANTI-CARLA can effectively and automatically find a range of failure cases despite LBC reaching an accuracy of 100% in the CARLA benchmark.
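Purely as an illustration of the workflow the abstract describes (a declarative scenario specification plus a search for operating conditions that break the pipeline), the sketch below uses a small dict-based spec and a stand-in scoring function; ANTI-CARLA's actual scenario description language, search mechanism, and scoring are as described in the paper, not as shown here.

```python
# Miniature stand-in for a scenario specification and an adversarial loop that
# records the operating conditions under which a (stand-in) driving pipeline
# scores worst. Illustrative only; not ANTI-CARLA code.
import random

scenario_spec = {
    "weather": ["clear", "rain", "fog"],
    "time": ["noon", "sunset", "night"],
    "traffic_density": [0.1, 0.5, 0.9],
}

def sample_scenario(spec):
    return {k: random.choice(v) for k, v in spec.items()}

def run_pipeline(scenario):
    # Stand-in for executing the driving pipeline in CARLA and returning a
    # route-completion score in [0, 1]; lower means closer to failure.
    penalty = ({"rain": 0.2, "fog": 0.4}.get(scenario["weather"], 0.0)
               + {"sunset": 0.1, "night": 0.3}.get(scenario["time"], 0.0)
               + 0.3 * scenario["traffic_density"])
    return max(0.0, 1.0 - penalty - random.uniform(0.0, 0.1))

failures = []
for _ in range(100):
    scenario = sample_scenario(scenario_spec)
    score = run_pipeline(scenario)
    if score < 0.5:                         # keep conditions that break the pipeline
        failures.append((score, scenario))

print(sorted(failures, key=lambda t: t[0])[:3])
```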