@inproceedings{iccps2026_schoolride,
  author       = {Nath, Vakul and Liu, Fangqi and He, Guocheng and Rogers, David and Chhokra, Ajay and Talusan, Jose Paolo and Ma, Meiyi and Mukhopadhyay, Ayan and Dubey, Abhishek},
  title        = {SchoolRide: A Platform for School Bus Disruption Management and Operational Resilience},
  booktitle    = {Proceedings of the HSCC/ICCPS 2026: 29th ACM International Conference on Hybrid Systems: Computation and Control and 17th ACM/IEEE International Conference on Cyber-Physical Systems},
  series       = {HSCC/ICCPS '26},
  year         = {2026},
  location     = {Saint Malo, France},
  note         = {Acceptance rate: 28\%; Short Paper; Track: Systems and Applications},
  keywords     = {school transportation, disruption management, vehicle routing, optimization, cyber-physical systems, transit operations, real-time decision-making},
  what         = {SchoolRide is a comprehensive cyber-physical system platform designed for school bus disruption management and operational resilience. The system integrates live telemetry, real-time status collection, and dynamic bus status monitoring to detect and respond to disruptions such as vehicle breakdowns, traffic congestion, and driver absences. Using an integrated pipeline that combines baseline routing with travel-time prediction and constrained optimization, SchoolRide automatically recomputes routing plans when disruptions occur. The platform serves as a testbed for evaluating data-driven optimization strategies for real-world school transportation systems with practical constraints.},
  why          = {School transportation is a societal-scale transportation cyber-physical system serving 26 million students daily, yet it remains vulnerable to operational disruptions despite strict schedules and regulations. Most existing disruption management relies on manual coordination, while SchoolRide advances the state-of-the-art by demonstrating that systematic, data-driven optimization can enhance operational resilience at realistic scale. This work is innovative because it balances competing objectives—student service quality (waiting time, delays, schedule adherence) with operational efficiency—while respecting institutional constraints and preserving privacy through synthetic data generation.},
  results      = {Experiments on synthetic benchmarks and real district data demonstrate strong performance and scalability of the SchoolRide optimization approach. The AdVIns insertion heuristic consistently outperforms baseline human-intuition policies on student-centered metrics, achieving substantially lower average stop and school delays. Across large-scale synthetic instances and real district scenarios, the system effectively handles realistic disruption patterns while generating high-quality rerouting solutions that balance feasibility with optimality.},
  project_tags = {transit, CPS, planning},
}
As a societal-scale transportation Cyber-Physical System (CPS), school transportation integrates large-scale physical operations with cyber components for planning and control under uncertainty. Despite its scale and societal importance, the system remains vulnerable to operational disruptions such as vehicle breakdowns, road closures, traffic congestion, and driver absences. This work demonstrates how data-driven optimization can enhance operational resilience in a real-world school transit context. To advance research in this domain, we introduce SchoolRide, a platform developed in close collaboration with a school district in the southern United States. SchoolRide serves as a comprehensive testbed for studying and evaluating robust operational policies for disruption management, enabling systematic investigation of strategies under realistic data and operational constraints. We design an integrated pipeline for dynamic bus status collection and formulate the School Bus Disruption Management (SBDM) problem as a combinatorial optimization task that replans routes based on predefined schedules, real-time status, and disruption events. The framework balances student service quality (e.g., waiting time and school delays) with operational efficiency (e.g., route adjustments and driver workload). We explore heuristic and optimization-based approaches that leverage historical disruption logs from the partner district to proactively replan routes and evaluate their performance using synthetic data generated from real-world operational records to protect privacy. The generated synthetic datasets will be released to facilitate future research in this domain. Our approach outperforms current operational policies, effectively preserving service quality while reducing disruptions and workload.
@misc{hu2026columngenerationmicrotransitzoning,
  author       = {Hu, Hins and Sen, Rishav and Talusan, Jose Paolo and Dubey, Abhishek and Laszka, Aron and Samaranayake, Samitha},
  title        = {Column Generation for the Micro-Transit Zoning Problem},
  year         = {2026},
  url          = {https://arxiv.org/abs/2603.07821},
  eprint       = {2603.07821},
  archiveprefix = {arXiv},
  primaryclass = {math.OC},
  keywords     = {micro-transit, zoning, column generation, combinatorial optimization, urban mobility, demand-responsive transit, public transportation},
  what         = {This paper generalizes the Micro-Transit Zoning Problem to incorporate a global budget constraint on operational costs rather than a fixed limit on the number of zones. The work reformulates the problem into a Column Generation framework where candidate zones are generated iteratively through a pricing subproblem, and develops a scalable pricing heuristic that replaces exact integer programming with a greedy node-addition strategy. The approach is validated on real-world mobility data from five major U.S. cities including Chattanooga, where CARTA provided origin-destination trip data.},
  why          = {Micro-transit services require carefully designed geo-fenced zones to operate effectively, but existing computational methods impose unrealistic constraints like fixed zone counts and suffer from scalability issues in larger cities. The innovation is applying Column Generation — a decomposition technique from operations research — to the zoning problem, which naturally handles the exponentially large space of candidate zones by generating only promising candidates guided by dual variables. This also enables a more realistic global budget formulation that reflects how transit agencies actually plan service areas.},
  results      = {Experiments across Miami, Boston, Atlanta, Chattanooga, and Nashville demonstrate that the CG framework produces higher-quality solutions than the state-of-the-art two-phase enumeration approach while scaling more efficiently to larger cities. The pricing heuristic achieves near-optimal solution quality with dramatically reduced computation time, making the approach practical for real-world deployment. Additional analysis provides parameter tuning guidance for transit agencies adopting the method.},
  project_tags = {transit, planning},
}
Along with the rapid development of new urban mobility options like ride-sharing over the past decade, on-demand micro-transit services stand out as a middle ground, bridging the gap between fixed-line mass transit and single-request ride-hailing, balancing ridership maximization and travel time minimization. However, effective operation of micro-transit services requires planning geo-fenced zones in advance, which involves solving a challenging combinatorial optimization problem. Existing approaches enumerate candidate zones first and select a fixed number of optimal zones in the second step. In this paper, we generalize the Micro-Transit Zoning Problem (MZP) to allow a global budget rather than imposing a size limit for candidate zones. We also design a Column Generation (CG) framework to solve the problem and several pricing heuristics to accelerate computation. Extensive numerical experiments across major U.S. cities demonstrate that our approach produces higher-quality solutions more efficiently and scales better in the generalized setting.
@inbook{dubey2026neurosymbolic,
  author       = {Dubey, Abhishek and Johnson, Taylor T. and Koutsoukos, Xenofon and Luo, Baiting and Lopez, Diego Manzanas and Maroti, Miklos and Mukhopadhyay, Ayan and Potteiger, Nicholas and Serbinowska, Serena and Stojcsics, Daniel and Zhang, Yunuo and Karsai, Gabor},
  title        = {Toward Assured Autonomy Using Neurosymbolic Components and Systems},
  booktitle    = {Neurosymbolic AI},
  publisher    = {John Wiley \& Sons, Ltd},
  year         = {2026},
  chapter      = {4},
  pages        = {89--118},
  isbn         = {9781394302406},
  doi          = {10.1002/9781394302406.ch04},
  url          = {https://onlinelibrary.wiley.com/doi/abs/10.1002/9781394302406.ch04},
  keywords     = {neurosymbolic AI, assured autonomy, UAV, world model, planning, trajectory control, model checking, hybrid systems},
  what         = {This book chapter presents how neurosymbolic techniques can implement three core functions of an autonomous UAV system: world model maintenance (updating an internal representation of the environment from sensory inputs), planning (generating waypoints for the vehicle), and trajectory control (producing fine-grain control commands). The components are developed for a UAV mission — localizing a specific object in an urban area — and evaluated in a virtual environment. An assurance technique based on model checking is also presented for verifying neurosymbolic components that combine finite-state control with neural modules.},
  why          = {Autonomous systems increasingly rely on neural components for perception and decision-making, but assuring the safety of these components remains a fundamental challenge. Pure neural approaches lack formal guarantees, while pure symbolic approaches cannot handle the complexity of real-world perception. The innovation is decomposing the autonomy stack into neurosymbolic components where each combines learned perception or prediction with symbolic reasoning and constraints, and then applying model checking to verify properties of the resulting hybrid system — providing a principled path toward assured autonomy.},
  results      = {The neurosymbolic components successfully implement world model maintenance, subgoal-based planning, and trajectory control for a UAV target localization mission. Evaluation in a virtual urban environment demonstrates that the neurosymbolic architecture achieves mission objectives while enabling formal verification of safety properties through model checking. The chapter documents lessons learned from integrating neural and symbolic components, including the importance of safety constraints in the planning loop and the role of landmark selection in maintaining accurate world models.},
  project_tags = {planning, CPS, scalable AI},
}
Neurosymbolic techniques are expected to deliver more functionalities and better performance in autonomous systems, but their assurance remains a challenge. There are various roles such components can play in an autonomous vehicle, for instance, world model maintenance, planning, and trajectory control. The world model is an internal representation of the external environment of the vehicle that is continuously updated based on new sensory inputs; the planning component generates waypoints for the vehicle to reach, while the trajectory controller produces the fine-grain control commands for the vehicle’s path. This chapter presents how these three functions can be implemented using neurosymbolic techniques, and reports results and the lessons learned. The components were developed in the context of a UAV executing a specific mission: localization of a specific object in an urban area, and evaluated in a virtual environment. An assurance technique based on model checking is presented that can be applied to a class of neurosymbolic components that include finite-state control with neural components.
@inproceedings{iccps2026_prompt_confirmation,
  author       = {Sivagnanam, Amutheezan and Mukhopadhyay, Ayan and Samaranayake, Samitha and Dubey, Abhishek and Laszka, Aron},
  title        = {Dynamic Vehicle Routing with Prompt Confirmation and Continual Optimization},
  booktitle    = {Proceedings of the HSCC/ICCPS 2026: 29th ACM International Conference on Hybrid Systems: Computation and Control and 17th ACM/IEEE International Conference on Cyber-Physical Systems},
  series       = {HSCC/ICCPS '26},
  year         = {2026},
  location     = {Saint Malo, France},
  note         = {Acceptance rate: 28\%; Regular Paper; Track: Systems and Applications},
  keywords     = {dynamic vehicle routing, on-demand transportation, prompt confirmation, optimization, stochastic requests, anytime algorithms, reinforcement learning},
  what         = {This paper introduces a novel computational approach for dynamic vehicle routing with prompt confirmation of advance requests. The work addresses the problem of on-demand transportation services that must make real-time decisions about accepting or rejecting trip requests while continuously optimizing vehicle manifests and routes. The research formulates this as a two-stage optimization problem: first deciding whether to accept or reject incoming requests with immediate response requirements, then continuously improving route plans to accommodate future requests between arrival of consecutive requests.},
  why          = {Real-world on-demand transit services face a fundamental challenge: agencies must provide prompt confirmation of whether requests can be accepted, yet future requests are unknown and will influence optimal route plans. Most prior work either provides immediate confirmation without optimizing or continuously optimizes without addressing the confirmation timing problem. This work is innovative because it bridges this gap by combining quick insertion search for rapid decision-making with continuous optimization, enabling both high service rates and operational efficiency while managing computational constraints.},
  results      = {The proposed computational approach demonstrates significantly better trade-offs between confirmation timeliness and service rate compared to existing methods on real-world and synthetic problem instances from a public transit agency. The anytime algorithm with continuous optimization provides prompt confirmation while also improving subsequent route plans, achieving higher service rates than approaches that simply optimize without considering confirmation requirements.},
  project_tags = {transit, planning},
}
Transit agencies that operate on-demand transportation services have to respond to trip requests from passengers in real time, which involves solving dynamic vehicle routing problems with pick-up and drop-off constraints. Based on discussions with public transit agencies, we observe a real-world problem that is not addressed by prior work: when trips are booked in advance (e.g., trip requests arrive a few hours in advance of their requested pick-up times), the agency needs to promptly confirm whether a request can be accepted or not, and ensure that accepted requests are served as promised. State-of-the-art computational approaches either provide prompt confirmation but lack the ability to continually optimize and improve routes for accepted requests, or they provide continual optimization but cannot guarantee serving all accepted requests. To address this gap, we introduce a novel problem formulation of dynamic vehicle routing with prompt confirmation and continual optimization. We propose a novel computational approach for this vehicle routing problem, which integrates a quick insertion search for prompt confirmation with an anytime algorithm for continual optimization. To maximize the number of requests served, we train a non-myopic objective function using reinforcement learning, which guides both the insertion and the anytime algorithms towards optimal, non-myopic solutions. We evaluate our computational approach on a real-world microtransit dataset from a public transit agency in the U.S., demonstrating that our proposed approach provides prompt confirmation while significantly increasing the number of requests served compared to existing approaches.
@inproceedings{iccps2026_pv2b,
  author       = {Sen, Rishav and Liu, Fangqi and Talusan, Jose Paolo and Pettet, Ava and Suzue, Yoshinori and Mukhopadhyay, Ayan and Dubey, Abhishek},
  title        = {P-V2B: A Neuro-Symbolic Framework for Leveraging User Persistence in Vehicle-to-Building Charging},
  booktitle    = {Proceedings of the HSCC/ICCPS 2026: 29th ACM International Conference on Hybrid Systems: Computation and Control and 17th ACM/IEEE International Conference on Cyber-Physical Systems},
  series       = {HSCC/ICCPS '26},
  year         = {2026},
  location     = {Saint Malo, France},
  note         = {Acceptance rate: 28\%; Regular Paper; Track: Systems and Applications},
  keywords     = {vehicle-to-building, EV charging, demand charge management, user persistence, neuro-symbolic control, Monte Carlo tree search, model predictive control},
  what         = {P-V2B introduces a neuro-symbolic framework for vehicle-to-building charging that incorporates user persistence information alongside technical optimization. The work addresses the persistent user problem where electric vehicles exhibit recurring arrival patterns over time at buildings, enabling buildings to anticipate charging demand and schedule charging strategically. The approach combines a neuro-symbolic control framework integrating Monte Carlo Model Predictive Control with a learned value function to handle both short-horizon feasibility and long-horizon demand-charge prediction, accounting for user behavior patterns while managing real-time constraints.},
  why          = {Vehicle-to-building systems present a complex control challenge combining real-time physical constraints with long-horizon stochastic effects of user behavior, where traditional decomposition approaches fail to capture crucial dependencies. The innovation lies in explicitly leveraging user persistence—the observation that EV users exhibit recurring patterns—as a key input alongside technical constraints, enabling more intelligent demand charge management. This bridges control theory and behavioral modeling, providing a principled way to incorporate user behavioral patterns into cyber-physical system optimization.},
  results      = {Evaluation on real EV fleet data from a major California manufacturer demonstrates substantial improvements in demand charge reduction and total operating costs compared to both heuristic baselines and prior work that ignore user persistence. The neuro-symbolic framework achieves significant cost savings while ensuring feasibility and full compliance with user charging requirements, validating the effectiveness of persistence-aware control strategies.},
  project_tags = {energy, CPS, planning},
}
Vehicle-to-Building (V2B) integration is a cyber-physical system (CPS) where Electric Vehicles (EVs) enhance building resilience by serving as mobile storage for peak shaving, reducing monthly peak-power demand charges, supporting grid stability, and lowering electricity costs. We introduce the Persistent Vehicle-to-Building (P-V2B) problem, a long-horizon formulation that incorporates user-level persistence, where each EV corresponds to a consistent user identity across days. This structure captures recurring arrival patterns and travel-related external energy use, common in employee-based facilities with regular commuting behavior. Persistence enables multi-day strategies that are unattainable in single-day formulations, such as over-charging on low-demand days to support discharging during future high-demand periods. Real-time decision making in this CPS setting presents three key challenges: (i) uncertainty in long-term EV behavior and building load forecasts, which causes traditional control and heuristic methods to degrade under stochastic conditions; (ii) inter-day coupling of decisions and rewards, where early actions affect downstream feasible charging and discharging opportunities, complicating long-horizon optimization; and (iii) high-dimensional continuous action spaces, which exacerbate the curse of dimensionality in reinforcement learning (RL) and search-based approaches. To address these challenges, we propose a neuro-symbolic framework that integrates a constraint-based Monte Carlo Model Predictive Control (MC-MPC) layer with a learned Value Function (VF). The MC-MPC enforces physical feasibility and manages environmental uncertainty, while the VF provides long-term strategic foresight.
Evaluations using real building and EV fleet data from an EV manufacturer in California demonstrate that the hybrid framework substantially outperforms state-of-the-art baselines, significantly reducing demand charge and total energy costs, while ensuring feasibility and full compliance with user charging requirements.
@inproceedings{sen2026negotiations,
  author       = {Sen, Rishav and Liu, Fangqi and Talusan, Jose Paolo and Pettet, Ava and Suzue, Yoshinori and Bailey, Mark and Mukhopadhyay, Ayan and Dubey, Abhishek},
  title        = {CONSENT: A Negotiation Framework for Leveraging User Flexibility in Vehicle-to-Building Charging under Uncertainty},
  booktitle    = {Proceedings of the 24th Conference on Autonomous Agents and MultiAgent Systems (AAMAS 2026)},
  series       = {AAMAS '26},
  year         = {2026},
  location     = {Paphos, Cyprus},
  publisher    = {International Foundation for Autonomous Agents and Multiagent Systems},
  note         = {Acceptance rate: 25\%},
  keywords     = {vehicle-to-building, energy management, negotiation, demand response, incentive design, semi-Markov decision processes, user flexibility},
  what         = {CONSENT is a negotiation framework that enables coordination between EV owners and smart buildings under uncertainty in vehicle-to-building charging systems. The work formulates the V2B charging problem as a semi-Markov decision process with negotiation between buildings and users. The system offers personalized charging options based on user flexibility constraints, building energy efficiency goals, and uncertainty in EV arrival patterns, allowing users to express preferences through bounded SoC and departure time adjustments while buildings optimize charging schedules.},
  why          = {Vehicle-to-building energy coordination creates a fundamental conflict: buildings want to minimize peak demand costs while users want convenient, low-cost charging. Existing approaches either assume full system control or fail to capture real-world incentive-based coordination where users voluntarily participate. CONSENT is innovative because it explicitly bridges technical control with behavioral negotiation, using formal constraint handling and incentive design to enable mutually beneficial cooperation without requiring users to fully comply with building preferences.},
  results      = {Simulation and user study evaluation demonstrates that CONSENT generates mutually beneficial outcomes: the building operator's costs are lowered by over 3.5\% compared to an optimized, non-negotiating smart charging policy, while user charging expenses are reduced by 22\% below the utility's retail energy rate through negotiated flexibility options. The framework proves effective at aligning disparate objectives through structured negotiation while ensuring voluntary user participation.},
  project_tags = {energy, CPS, planning},
}
The growth of Electric Vehicles (EVs) creates a conflict in vehicle-to-building (V2B) settings between building operators, who face high energy costs from uncoordinated charging, and drivers, who prioritize convenience and a full charge. To resolve this, we propose a negotiation-based framework that, by design, guarantees voluntary participation, strategy-proofness, and budget feasibility. It transforms EV charging into a strategic resource by offering drivers a range of incentive-backed options for modest flexibility in their departure time or requested state of charge (SoC). Our framework is calibrated with user survey data and validated using real operational data from a commercial building and an EV manufacturer. Simulations show that our negotiation protocol creates a mutually beneficial outcome: lowering the building operator’s costs by over 3.5% compared to an optimized, non-negotiating smart charging policy, while simultaneously reducing user charging expenses by 22% below the utility’s retail energy rate. By aligning operator and EV user objectives, our framework provides a strategic bridge between energy and mobility systems, transforming EV charging from a source of operational friction into a platform for collaboration and shared savings.
@inproceedings{iccps2026_wenflow,
  author       = {Buckelew, Jacob and Talusan, Jose Paolo and Sivaramakrishnan, Vasavi and Mukhopadhyay, Ayan and Srivastava, Anurag and Dubey, Abhishek},
  title        = {WENFlow: Scalable Attention for Unsupervised Spatiotemporal Anomaly Detection in High-Dimensional Cyber-Physical Systems},
  booktitle    = {Proceedings of the HSCC/ICCPS 2026: 29th ACM International Conference on Hybrid Systems: Computation and Control and 17th ACM/IEEE International Conference on Cyber-Physical Systems},
  series       = {HSCC/ICCPS '26},
  year         = {2026},
  location     = {Saint Malo, France},
  note         = {Acceptance rate: 28\%; Regular Paper; Track: Foundations},
  keywords     = {anomaly detection, cyber-physical systems, wavelet transforms, normalizing flows, spatiotemporal analysis, unsupervised learning, interpretability},
  what         = {WENFlow proposes a wavelet-enabled normalizing flow framework for unsupervised anomaly detection in high-dimensional cyber-physical systems. The work addresses the challenge of detecting subtle anomalies in systems like power grids and water networks that exhibit complex spatiotemporal patterns. WENFlow combines discrete wavelet transform for multi-scale temporal feature extraction with gated selective self-attention to identify critical sensors, conditional density estimation for likelihood-based anomaly scoring, and interpretable analysis through log-density and feature importance.},
  why          = {Real-time anomaly detection in complex infrastructure systems requires capturing both slow operational trends and fast localized disruptions, with scalable robustness to contaminated training data and high dimensionality. Existing methods struggle with spatiotemporal dependencies and contamination from unlogged maintenance events. WENFlow is innovative because it achieves linear complexity scaling with sensor dimensionality through wavelet decomposition and feature-wise attention, providing both accurate anomaly detection and interpretable explanations of which sensors and temporal patterns indicate anomalies.},
  results      = {Extensive evaluation on power grid and water treatment benchmarks demonstrates WENFlow achieves superior anomaly detection performance compared to state-of-the-art methods including transformers and density-based approaches, while maintaining linear scaling with system dimensionality and robustness to contaminated training data. The framework provides interpretable analysis through feature importance scores and temporal pattern visualization.},
  project_tags = {CPS, ML for CPS, Explainable AI},
}
Real-time anomaly detection in high-dimensional data is crucial for ensuring the security of cyber-physical systems (CPS) such as power grids and water distribution networks. Such data commonly take the form of multivariate time series, often unlabeled, necessitating unsupervised detection methods. However, many unsupervised deep learning methods make assumptions about the normality of training data, which is unrealistic in real-world CPS where training data often contain anomalies or rare patterns. Furthermore, these methods rely on inefficient mechanisms to learn spatiotemporal dependencies in the data and scale quadratically with the number of system features. To address these problems, we propose Wavelet-Enhanced Normalizing Flows (WENFlow), an unsupervised deep learning model that identifies anomalies in low-density regions of the data distribution and does not assume access to anomaly-free training data. Notably, WENFlow leverages a scalable Gated Selective Self-Attention mechanism for capturing the most critical spatial dependencies between features. Compared to existing models, WENFlow scales linearly with respect to the number of system features and meets real-time inference requirements for anomaly detection. In our experiments, WENFlow achieves superior AUC scores against baseline methods across datasets with varying anomaly ratios, showcasing its robustness against contaminated training data. We evaluate WENFlow on 2 real-world benchmark datasets and a simulated phasor measurement unit dataset collected from a power grid testbed.
@inproceedings{iccps2026_logiex,
  author       = {An, Ziyan and Wang, Xia and Baier, Hendrik and Chen, Zirong and Dubey, Abhishek and Mukhopadhyay, Ayan and Johnson, Taylor T. and Sprinkle, Jonathan and Ma, Meiyi},
  title        = {LogiEx: Logic-Integrated Explanations for Stochastic Planning in Cyber-Physical Systems},
  booktitle    = {Proceedings of the HSCC/ICCPS 2026: 29th ACM International Conference on Hybrid Systems: Computation and Control and 17th ACM/IEEE International Conference on Cyber-Physical Systems},
  series       = {HSCC/ICCPS '26},
  year         = {2026},
  location     = {Saint Malo, France},
  note         = {Acceptance rate: 28\%; Regular Paper; Track: Systems and Applications},
  keywords     = {explainable AI, transit planning, formal logic, large language models, Monte Carlo tree search, knowledge graphs, human-AI interaction},
  what         = {LogiEx integrates formal logic and large language models to provide explainable sequential planning for human-centered cyber-physical systems like intelligent transportation. The system combines Monte Carlo Tree Search planning with logical reasoning to generate trustworthy explanations for planning decisions. LogiEx categorizes user queries into three types: those answerable from existing search trees, those requiring human-guided search, and those requiring background knowledge. The framework generates logical evidence supporting planning decisions and translates this into natural language explanations that users can verify.},
  why          = {Traditional planning algorithms like Monte Carlo Tree Search achieve strong performance but lack transparency, making their outputs unsuitable for high-stakes CPS applications where users need to understand and trust system decisions. LogiEx is innovative because it bridges the transparency gap by combining stochastic search with formal logic verification, allowing the system to explain not just what actions it recommends but why those actions are justified given domain constraints and objectives. This addresses a critical safety concern in AI-driven CPS.},
  results      = {Quantitative evaluation demonstrates LogiEx achieves up to 7.9x higher semantic similarity and 1.6x higher factual consistency compared to LLM-only baselines on explaining transportation planning decisions. User studies validate that the framework provides faithful, consistent explanations that help users understand the planning process while maintaining the ability to ask follow-up questions for deeper reasoning.},
  project_tags = {transit, planning, Explainable AI},
}
Human-centered cyber-physical systems (CPS), such as intelligent transportation services, warehouse robotics operated by human supervisors, and healthcare infrastructures involving clinicians and medical staff, increasingly rely on Artificial Intelligence (AI)-driven sequential decision-making under uncertainty. However, the lack of transparent reasoning in these systems limits trust, verifiability, and human oversight. This challenge is particularly acute for planning algorithms like Monte Carlo Tree Search (MCTS), whose stochastic search processes are opaque to engineers and operators. To address this gap, we introduce LogiEx, a logic-integrated framework that combines large language models (LLMs) with formal methods to generate trustworthy explanations for planning behavior. LogiEx transforms free-form user queries into logical statements with templated variables, then verifies whether evidence extracted from the decision process aligns with both the environment state and the constraints of the stochastic planning model. This enables grounded explanations across a wide range of user questions—from factual retrieval to comparative reasoning. LogiEx also supports Human-Guided Search (HuGS), allowing users to pose conditional “what-if” queries that trigger new, scenario-specific searches, ensuring that humans are not passive observers but active participants who can steer and refine the planning process. We evaluate LogiEx through both quantitative assessments and user studies, finding that it consistently outperforms baselines, achieving up to 7.9× higher semantic similarity (BERTScore) and 1.6× higher factual consistency (FactCC) compared to baseline LLMs, and is the most preferred form of explanation among CPS practitioners.
@inproceedings{iccps2026_respond,
author = {Zulqarnain, Ammar Bin and Talusan, Jose Paolo and Napier, Kelly and Gens, Corey and Higgs, Jennifer and Herndon, Colleen and Mukhopadhyay, Ayan and Dubey, Abhishek},
title = {RESPOND: A Modular Platform for Urban Emergency Response Research and Decision Support},
year = {2026},
booktitle = {Proceedings of the HSCC/ICCPS 2026: 29th ACM International Conference on Hybrid Systems: Computation and Control and 17th ACM/IEEE International Conference on Cyber-Physical Systems},
location = {Saint Malo, France},
keywords = {emergency response, dispatch optimization, facility location, simulation, policy evaluation, urban computing, resource allocation},
note = {Acceptance rate: 28\%; Short Paper; Track: Systems and Applications},
series = {HSCC/ICCPS '26},
what = {RESPOND is a modular cyber-physical system platform for urban emergency response that integrates strategic planning and operational dispatch. The platform provides unified simulation infrastructure for evaluating fire/EMS dispatching policies and station placement strategies under realistic constraints. RESPOND enables scenario-driven evaluation by combining real station operations with hypothetical alternatives, incorporating actual incident data, travel times, and service metrics. The system allows researchers to evaluate counterintuitive policies and trade-offs between competing operational objectives like coverage and response time.},
why = {Urban emergency response is a complex societal-scale CPS involving coordination between multiple agencies, tight operational constraints, and high consequences of failures. Most research remains fragmented and simulation-based, lacking integrated platforms that seamlessly combine strategic planning with operational dispatch evaluation. RESPOND is innovative because it provides a unified testbed for evaluating coupled planning and dispatch decisions at scale, enabling scenario-based exploration of policy alternatives that would be infeasible to test on real systems.},
results = {Simulation fidelity assessment demonstrates that RESPOND accurately reproduces historical incident distributions when using real data, with MAE of approximately 6 incidents per station validating model accuracy. Counterintuitive scenario analysis shows that adding optimally-placed stations near downtown improves coverage by 10-25 seconds while reducing spatial imbalance, demonstrating the platform's ability to reveal non-obvious policy effects.},
project_tags = {emergency, transit, planning, CPS}
}
Growing urban populations strain fire/Emergency Medical Services (EMS) systems, creating societal-scale concerns where decisions about station siting (strategy) and dispatch policies (operations) unfold in a tightly coupled cyber-physical loop. The core challenge lies in validating different approaches since direct experimentation on real populations is infeasible. Prior efforts address isolated components, treating strategic siting heatmaps and operational dispatch heuristics as separate problems. They lack a unified, incident-level simulator to expose the critical cross-policy trade-offs between siting and dispatch. We present RESPOND (REsponse Simulation Platform for Operations, Navigation, and Dispatch), a modular, incident-level, Operational Decision Support System. RESPOND holistically integrates these previously siloed functions, including: (i) optimal station placement, (ii) apparatus allocation, (iii) dispatch policies, (iv) travel time and service time models, and (v) survival modeling for incident prediction. The platform’s engine replays historical incidents at unit resolution and stress-tests counterfactual futures (e.g., station moves, demand surges). A planner-facing interface surfaces key metrics (SLA compliance, 90th Percentile (P90) response time) for deliberation. Evaluations demonstrate reproduction of observed response patterns and reveal policy trade-offs. The result is a unifying platform that transforms fragmented analysis into an operational decision environment, enabling safe and rigorous evaluation of coupled station placement and dispatch policies through simulation.
@inproceedings{iccps2026_moveod,
author = {Sen, Rishav and Talusan, Jose Paolo and Dubey, Abhishek and Mukhopadhyay, Ayan and Samaranayake, Samitha and Laszka, Aron},
title = {MoveOD: Synthesizing Origin-Destination Commute Distribution from U.S. Census Data},
year = {2026},
booktitle = {Proceedings of the HSCC/ICCPS 2026: 29th ACM International Conference on Hybrid Systems: Computation and Control and 17th ACM/IEEE International Conference on Cyber-Physical Systems},
location = {Saint Malo, France},
keywords = {origin-destination synthesis, travel demand, transportation planning, data fusion, Bayesian methods, public datasets, traffic simulation},
note = {Acceptance rate: 28\%; Short Paper; Track: Systems and Applications},
series = {HSCC/ICCPS '26},
what = {MoveOD presents a framework for synthesizing fine-grained origin-destination commute patterns from publicly available datasets by integrating census data, employment records, and road networks. The approach uses Bayesian decomposition to generate minute-level commute trip distributions while preserving spatial and temporal coherence with observed commuting patterns. The framework leverages public data sources including US Census Community Survey, Longitudinal Employer-Household Dynamics, and OpenStreetMap to generate realistic synthetic commute data.},
why = {High-resolution origin-destination data is essential for transportation planning and traffic management, yet collecting such data through surveys or GPS tracking is expensive and privacy-invasive. Existing synthetic approaches fail to capture temporal and spatial granularity needed for realistic simulation. MoveOD is innovative because it demonstrates how publicly available marginal data can be combined through principled statistical methods to generate detailed, temporally-resolved commute patterns that preserve observed macro-level statistics while enabling microscopic simulations.},
results = {Validation on Hamilton County, Tennessee data demonstrates that the calibrated MoveOD approach accurately reproduces observed census commute patterns while generating realistic minute-level departure time distributions. The framework achieves alignment with ACS travel time margins through careful calibration, enabling fast synthetic data generation suitable for any US county and providing a reusable tool for transportation research.},
project_tags = {transit, planning}
}
High-resolution origin–destination (OD) tables are critical to cyber-physical transportation systems, enabling realistic digital twins, adaptive routing strategies, signal timing optimization, and demand-responsive mobility services. However, such OD data is rarely available outside a small number of data-rich metropolitan regions. We introduce MoveOD, an open-source pipeline that synthesizes publicly available datasets to generate fine-grained commuter OD flows with spatial and temporal departure distributions for any U.S. county. MoveOD fuses American Community Survey travel-time and departure distributions, Longitudinal Employer–Household Dynamics (LODES) residence–workplace flows, OpenStreetMap (OSM) road networks, and building footprint data. Our approach ensures consistency with observed commuter totals, workplace employment distributions, and reported travel durations. MoveOD is integrated with a transportation digital twin, enabling end-to-end CPS experimentation. We demonstrate the system in Hamilton County, Tennessee, generating approximately 150,000 synthetic daily trips and evaluating routing algorithms in a live dashboard.
@article{airoas_zhang,
title = {Observation Adaptation via Annealed Importance Resampling for Partially Observable Markov Decision Processes},
author = {Zhang, Yunuo and Luo, Baiting and Mukhopadhyay, Ayan and Dubey, Abhishek},
year = {2025},
month = sep,
journal = {Proceedings of the International Conference on Automated Planning and Scheduling},
volume = {35},
number = {1},
pages = {306--314},
doi = {10.1609/icaps.v35i1.36132},
url = {https://ojs.aaai.org/index.php/ICAPS/article/view/36132},
abstractnote = {Partially observable Markov decision processes (POMDPs) are a general mathematical model for sequential decision-making in stochastic environments under state uncertainty. POMDPs are often solved online, which enables the algorithm to adapt to new information in real time. Online solvers typically use bootstrap particle filters based on importance resampling for updating the belief distribution. Since directly sampling from the ideal state distribution given the latest observation and previous state is infeasible, particle filters approximate the posterior belief distribution by propagating states and adjusting weights through prediction and resampling steps. However, in practice, the importance resampling technique often leads to particle degeneracy and sample impoverishment when the state transition model poorly aligns with the posterior belief distribution, especially when the received observation is noisy. We propose an approach that constructs a sequence of bridge distributions between the state-transition and optimal distributions through iterative Monte Carlo steps, better accommodating noisy observations in online POMDP solvers. Our algorithm demonstrates significantly superior performance compared to state-of-the-art methods when evaluated across multiple challenging POMDP domains.},
what = {AIROAS introduces Annealed Importance Resampling for Observation Adaptation in online POMDP planning, addressing the challenge of belief state representation when direct sampling from optimal posterior distributions is infeasible. The approach maintains particle diversity through annealed importance resampling, creating smoothly interpolated intermediate distributions that bridge proposal and target distributions. This enables more efficient belief updates and superior planning performance compared to standard particle filtering approaches, particularly in deep searches where observation uncertainty is high.},
why = {Online POMDP planning requires accurate belief state representation to make decisions under uncertainty, but particle filtering struggles when the received observation provides highly informative evidence that moves beliefs far from prior distributions. AIROAS is innovative because it applies importance sampling tempering principles specifically for belief node updates in planning trees, improving upon standard particle filters by carefully controlling the transition between prior and posterior beliefs through sequence of intermediate distributions.},
results = {Experimental evaluation across multiple POMDP planning domains demonstrates that AIROAS significantly improves planning performance by reducing particle degeneracy issues at deeper search nodes. The approach enables more effective belief representation while maintaining computational efficiency in online planning, achieving better decision quality compared to standard particle filtering approaches.},
keywords = {partially observable Markov decision processes, importance sampling, particle filtering, belief state representation, online planning, Monte Carlo methods},
project_tags = {POMDP, planning, scalable AI}
}
@article{talusanTCPS2025,
title = {An End-to-End Solution for Public Transit Stationing and Dispatch Problem},
author = {Talusan, Jose Paolo and Han, Chaeeun and Rogers, David and Mukhopadhyay, Ayan and Laszka, Aron and Freudberg, Dan and Dubey, Abhishek},
year = {2025},
month = jul,
journal = {ACM Transactions on Cyber-Physical Systems},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
doi = {10.1145/3754454},
issn = {2378-962X},
url = {https://doi.org/10.1145/3754454},
note = {Just Accepted},
keywords = {public transit, dispatch optimization, stationing problems, Monte Carlo tree search, disruption management, resource allocation},
what = {This work presents an end-to-end solution for public transit stationing and dispatch problems, formulating the dynamic scheduling and dispatch challenge for fixed-line transit under disruptions. The research develops a semi-Markov decision process framework that solves for optimal routing and dispatch policies using Monte Carlo Tree Search. The platform integrates a simulator for evaluating both synthetic benchmarks and real-world transit data, enabling principled evaluation of stationing decisions and dispatch policies under realistic operational constraints.},
why = {Public transit systems face significant challenges from operational disruptions and the need to maintain service quality under uncertainty, yet most research assumes fixed infrastructure and deterministic conditions. This work is innovative because it provides a unified, scalable framework that simultaneously optimizes both strategic stationing decisions and dynamic dispatch policies, using simulation-based validation on real agency data. The approach bridges planning-time and operation-time decisions in transit systems.},
results = {Evaluation on real WeGo Public Transit data from Nashville demonstrates that the proposed approach increases passenger service by 7% while reducing deadhead miles by 42% compared to greedy baselines. The Monte Carlo Tree Search-based planning provides significantly better performance than myopic policies, validating the effectiveness of principled decision-making under operational uncertainty.},
project_tags = {transit, planning, CPS}
}
Public bus transit systems provide critical transportation services for large sections of modern communities. On-time performance and maintaining the reliable quality of service is therefore very important. Unfortunately, disruptions caused by overcrowding, vehicular failures, and road accidents often lead to service performance degradation. Though transit agencies keep a limited number of vehicles in reserve and dispatch them to relieve the affected routes during disruptions, the procedure is often ad-hoc and has to rely on human experience and intuition to allocate resources (vehicles) to affected trips under uncertainty. In this paper, we describe a principled approach using non-myopic sequential decision procedures to solve the problem and decide (a) if it is advantageous to anticipate problems and proactively station transit buses near areas with high-likelihood of disruptions and (b) decide if and which vehicle to dispatch to a particular problem. Our approach was developed in partnership with WeGo Public Transit, a public transportation agency based in Nashville, Tennessee and models the system as a semi-Markov decision problem (solved as a Monte-Carlo tree search procedure) and shows that it is possible to obtain an answer to these two coupled decision problems in a way that maximizes the overall reward (number of people served). We sample many possible futures from generative models, each of which is assigned to a tree and processed using root parallelization. We validate our approach with both real-world and scaled-up data from two agencies in Tennessee. Our experiments show that the proposed framework serves 2% more passengers while reducing deadhead miles by 40%. Finally, we introduce Vectura, a dashboard providing transit dispatchers a complete view of the transit system at a glance along with access to our developed tools.
@inproceedings{zulqarnain2025,
author = {Zulqarnain, Ammar and Buckelew, Jacob and Talusan, Jose Paolo and Mukhopadhyay, Ayan and Dubey, Abhishek},
booktitle = {2025 IEEE International Conference on Smart Computing (SMARTCOMP)},
title = {TRACE: Traffic Response Anomaly Capture Engine for Localization of Traffic Incidents},
year = {2025},
month = jun,
contribution = {lead},
what = {TRACE is a novel framework for real-time traffic anomaly detection and localization that combines Graph Neural Networks, Transformers, and normalizing flows. The system learns the spatial-temporal dependencies in road networks through graph convolutions while capturing long-range temporal interactions through transformer attention. To detect anomalies, TRACE computes log-likelihoods under a learned probability distribution, identifying points where traffic patterns deviate significantly from normal conditions. The framework provides both anomaly detection and localization through density-based analysis.},
why = {Traditional traffic anomaly detection methods struggle with the complexity of capturing spatial-temporal dependencies in interconnected road networks while maintaining scalability and interpretability. TRACE is innovative because it unifies multiple deep learning paradigms (graph neural networks, transformers, normalizing flows) within a probabilistic framework, enabling unsupervised anomaly detection without requiring labeled anomaly data. The density-based approach provides interpretable anomaly scores grounded in learned probability models.},
results = {Evaluation on real-world traffic data from a mid-sized US metropolitan area demonstrates that TRACE significantly improves incident localization precision by 17% compared to methods that identify anomalies without spatial localization. The framework achieves superior detection latency and mean localization error compared to state-of-the-art baselines.},
keywords = {traffic anomaly detection, graph neural networks, transformers, probabilistic modeling, spatial-temporal analysis, smart transportation, anomaly localization},
project_tags = {CPS, ML for CPS, transit}
}
Effective traffic incident management is critical for road safety and operational efficiency. Yet, many transportation agencies rely on reactionary methods, where incidents are reported by human agents and managed through rule-based frameworks like traditional Traffic Incident Management (TIM) systems. However, these are vulnerable to human error, oversight, and delays during high-stress conditions. Although recent initiatives incorporating real-time sensor data for corridor monitoring and enhanced roadway information systems represent strides toward modernization, these systems often still require substantial human intervention. Recent advancements in graph-based deep learning models offer promising potential for addressing the limitations of traditional methods. While state-of-the-art models exist, the complexities of incident localization within dynamic and interconnected road networks, along with limited availability of high-quality labeled data and variability in real-time traffic measurements, are still open challenges. To address these, we propose the Traffic Response Anomaly Capture Engine (TRACE), a novel approach that combines graph neural networks, transformers, and probabilistic normalizing flows to accurately detect and localize traffic anomalies in real time. TRACE captures spatial-temporal dependencies, manages data uncertainty, and enhances automation, supporting more precise and timely incident localization. Our approach is validated on real-world traffic data and improved incident localization by 0.6 miles (17%) over SOTA methods while maintaining similar incident detection accuracy and mean detection delay.
@inproceedings{rogers2025,
author = {Rogers, David and Gupta, Samir and Talusan, Jose Paolo and Baig, Mirza and Ramesh, Arti and Takahashi, Natsu and Kojo, Naoki and Dubey, Abhishek},
booktitle = {2025 IEEE International Conference on Smart Computing (SMARTCOMP)},
title = {AVATAR: Autonomy Aware Routing for On-demand Transit Applications},
year = {2025},
month = jun,
contribution = {lead},
what = {AVATAR is an autonomy-aware routing framework for on-demand transit that prioritizes dependable, low-variance routes by considering factors like road speed, speed variability, construction zones, pedestrian encounters, and school zones. The approach uses multi-criteria decision-making to evaluate routes based on multiple operational objectives including speed, consistency, safety, and user preferences. The framework supports both real-time AV operations and offline analysis, enabling transit operators to assess and refine routing strategies based on user-configurable preferences and real-world constraints.},
why = {Autonomous vehicle deployment in on-demand transit faces fundamental reliability challenges: traditional routing algorithms designed for human drivers prioritize travel time but ignore the consistency and predictability requirements essential for AVs operating in complex urban environments. AVATAR is innovative because it explicitly incorporates AV operational constraints—including construction zones, pedestrian density, and variable traffic conditions—into the routing framework, enabling more reliable and sustainable AV-based transportation systems.},
results = {Real-world validation using data from Nashville, Silicon Valley, and Yokohama demonstrates that AVATAR generates significantly more reliable routes than traditional approaches. Autonomy-aware routing substantially improves route consistency and predictability compared to speed-optimized baselines while maintaining competitive travel times.},
keywords = {autonomous vehicles, path planning, multi-criteria routing, on-demand transit, reliability optimization, traffic management},
project_tags = {transit, planning, CPS}
}
Autonomous vehicles (AVs) are becoming integral to on-demand micro transit, offering the potential for safer, efficient, and sustainable transportation. However, AV deployment faces several challenges, including the lack of suitable roadways and varying travel conditions. Traditional routers prioritize speed and not reliability, leading to unpredictable operations and complications in planning. To address these, we introduce AVATAR, an autonomy-aware routing framework that prioritizes dependable, low-variance routes. Our approach encodes multiple objectives including road speed, speed variability, zoning areas, pedestrian encounters, and operator preferred roadways into edge-level routing engines. Objective optimized routes are generated, then scored using a multi-criteria decision-making process. User-configurable preference profiles allow operators to define a balance between reliability and speed. AVATAR is a data-driven framework that supports both real-time AV operations and offline analysis, enabling transit operators to assess and refine routing strategies. Our experiments using real-world data from Silicon Valley, California, and Yokohama, Japan show that our approach significantly improves AV reliability and performance and advances the sustainable and scalable integration of AVs into future transportation networks.
@misc{dubey2025forecasting,
author = {Dubey, Abhishek and Wilbur, Michael and Mukhopadhyay, Ayan and Laszka, Aron},
month = jan,
title = {Forecasting energy consumption in a mixed-vehicle fleet},
year = {2025},
howpublished = {US Patent App. 18/708,438},
url = {https://patents.google.com/patent/US20250030766A1/en}
}
@inproceedings{zhang2025escortefficientsteinvariationalsliced,
title = {ESCORT: Efficient Stein-variational and Sliced Consistency-Optimized Temporal Belief Representation for POMDPs},
author = {Zhang, Yunuo and Luo, Baiting and Mukhopadhyay, Ayan and Karsai, Gabor and Dubey, Abhishek},
booktitle = {Proceedings of the 39th Conference on Neural Information Processing Systems (NeurIPS'25)},
year = {2025},
url = {https://arxiv.org/abs/2510.21107},
eprint = {2510.21107},
archiveprefix = {arXiv},
primaryclass = {cs.LG},
what = {ESCORT is a particle-based framework for belief approximation in partially observable Markov decision processes that addresses the challenge of representing complex, multi-modal belief distributions in high-dimensional spaces. The approach extends Stein Variational Gradient Descent with correlation-aware projections and temporal consistency constraints, enabling particles to concentrate in high-uncertainty regions while preserving learned correlation structures. ESCORT dynamically adapts to belief landscape complexity without requiring resampling, maintaining both representational accuracy and computational efficiency.},
why = {Traditional belief representation methods in POMDPs struggle with high-dimensional, multi-modal distributions due to kernel degeneracy and the need for excessive particles. Recent neural and parametric approaches fail to capture intricate correlation patterns essential for accurate decision-making. ESCORT is innovative because it combines principled geometric methods (sliced Wasserstein distance) with temporal consistency regularization, enabling particle-based methods to scale to complex belief spaces while preserving critical statistical dependencies that impact decision quality.},
results = {Extensive evaluation on Light-Dark Navigation, Kidnapped Robot, and Multi-Target Tracking benchmarks demonstrates that ESCORT consistently outperforms state-of-the-art belief approximation methods including transformers and density-based approaches. The framework achieves superior belief fidelity and decision quality across domains ranging from discrete to continuous high-dimensional problems.},
keywords = {partially observable Markov decision processes, belief representation, particle filtering, Stein variational gradient descent, optimal transport, stochastic optimization},
project_tags = {POMDP, scalable AI, planning}
}
@inproceedings{an2025logiex,
author = {An, Ziyan and Wang, Xia and Baier, Hendrik and Chen, Zirong and Dubey, Abhishek and Johnson, Taylor T. and Sprinkle, Jonathan and Mukhopadhyay, Ayan and Ma, Meiyi},
booktitle = {Proceedings of the 24th International Conference on Autonomous Agents and Multiagent Systems (AAMAS 2025)},
title = {LogiEx: Integrating Formal Logic and Large Language Model for Explainable Planning},
year = {2025},
note = {Extended Abstract},
acceptance = {40},
contribution = {colab}
}
@inproceedings{keplinger2025nsgym,
author = {Keplinger, Nathaniel S. and Luo, Baiting and Bektas, Iliyas and Zhang, Yunuo and Wray, Kyle Hollins and Laszka, Aron and Dubey, Abhishek and Mukhopadhyay, Ayan},
title = {NS-Gym: Open-Source Simulation Environments and Benchmarks for Non-Stationary Markov Decision Processes},
year = {2025},
booktitle = {Proceedings of the 39th Conference on Neural Information Processing Systems (NeurIPS'25)},
archiveprefix = {arXiv},
contribution = {colab},
eprint = {2501.09646},
primaryclass = {cs.AI},
url = {https://arxiv.org/abs/2501.09646},
what = {NS-Gym is an open-source simulation toolkit for non-stationary Markov decision processes that segregates environmental parameter evolution from agent decision-making. The toolkit provides standardized interfaces for defining NS-MDPs, benchmark problems across different environmental change types, and implementations of state-of-the-art algorithmic approaches. NS-Gym enables systematic evaluation of decision-making algorithms under dynamic environments, addressing the gap in standardized benchmarks for non-stationary problems in fields like autonomous driving and resource optimization.},
why = {Many real-world decision-making problems involve non-stationary environments where the reward structure or transition dynamics change over time, yet most research assumes stationary conditions. The lack of standardized benchmarks and simulation interfaces has hindered systematic progress in non-stationary decision-making. NS-Gym is innovative because it provides the first comprehensive toolkit specifically designed for NS-MDPs, enabling researchers to evaluate algorithm robustness and adaptability under realistic environmental change patterns.},
results = {Benchmark results comparing six algorithmic approaches across multiple NS-MDP problem types demonstrate clear performance differences in handling environmental changes. The toolkit enables reproducible evaluation of both model-based and model-free approaches under various environmental conditions.},
keywords = {non-stationary environments, Markov decision processes, benchmark problems, decision-making under change, algorithm evaluation, reinforcement learning},
project_tags = {POMDP, planning, scalable AI}
}
In many real-world applications, agents must make sequential decisions in environments where conditions are subject to change due to various exogenous factors. These non-stationary environments pose significant challenges to traditional decision-making models, which typically assume stationary dynamics. Non-stationary Markov decision processes (NS-MDPs) offer a framework to model and solve decision problems under such changing conditions. However, the lack of standardized benchmarks and simulation tools has hindered systematic evaluation and advance in this field. We present NS-Gym, the first simulation toolkit designed explicitly for NS-MDPs, integrated within the popular Gymnasium framework. In NS-Gym, we segregate the evolution of the environmental parameters that characterize non-stationarity from the agent’s decision-making module, allowing for modular and flexible adaptations to dynamic environments. We review prior work in this domain and present a toolkit encapsulating key problem characteristics and types in NS-MDPs. This toolkit is the first effort to develop a set of standardized interfaces and benchmark problems to enable consistent and reproducible evaluation of algorithms under non-stationary conditions. We also benchmark six algorithmic approaches from prior work on NS-MDPs using NS-Gym. Our vision is that NS-Gym will enable researchers to assess the adaptability and robustness of their decision-making algorithms to non-stationary conditions.
@inproceedings{khanna2025driverbreaks,
author = {Khanna, Agrima and Liu, Fangqi and Gupta, Samir and Pavia, Sophie and Mukhopadhyay, Ayan and Dubey, Abhishek},
booktitle = {Proceedings of the 26th International Conference on Distributed Computing and Networking},
title = {PDPTW-DB: MILP-Based Offline Route Planning for PDPTW with Driver Breaks},
year = {2025},
address = {New York, NY, USA},
pages = {73--83},
acceptance = {32},
publisher = {Association for Computing Machinery},
series = {ICDCN '25},
category = {other},
contribution = {lead},
doi = {10.1145/3700838.3700854},
isbn = {9798400710629},
keywords = {vehicle routing, pickup-delivery problems, driver breaks, hours-of-service, mixed-integer programming, logistics optimization, microtransit},
numpages = {11},
url = {https://doi.org/10.1145/3700838.3700854},
what = {PDPTW-DB presents a mixed-integer linear programming formulation for pickup-delivery problems with time windows that integrates periodic driver break requirements. The work addresses the practical challenge of incorporating mandatory driver rest periods into route planning while maintaining service feasibility. The formulation enables optimization of vehicle routing and break scheduling simultaneously, accounting for realistic constraints like hours-of-service regulations, service time windows, and vehicle capacity limitations.},
why = {Existing vehicle routing formulations often overlook mandatory driver break requirements or handle them as post-hoc constraints, leading to infeasible or inefficient solutions in practice. PDPTW-DB is innovative because it integrates break scheduling directly into the optimization formulation, enabling principled trade-offs between vehicle utilization, travel distance, and driver compliance with service regulations. This bridge between operational planning and human factors considerations addresses a critical real-world constraint.},
results = {Implementation and evaluation using real Microtransit delivery data demonstrates the formulation produces cost-effective solutions while ensuring full regulatory compliance. Experiments validate both computational efficiency of the mixed-integer approach and the quality of solutions achievable when driver breaks are explicitly modeled.},
project_tags = {transit, planning}
}
The Pickup and Delivery Problem with Time Windows (PDPTW) involves optimizing routes for vehicles to meet pickup and delivery requests within specific time constraints, a challenge commonly faced in logistics and transportation. Microtransit, a flexible and demand-responsive service using smaller vehicles within defined zones, can be effectively modeled as a PDPTW. Yet, the need for driver breaks—a key human constraint—is frequently overlooked in PDPTW solutions, despite being necessary for regulatory compliance. This study presents a novel mixed-integer linear programming formulation for the Pickup and Delivery Problem with Time Windows and Driver Breaks (PDPTW-DB). To the best of our knowledge, this formulation is the first to consider mandatory periodic driver breaks within optimized Microtransit routes. The proposed model incorporates regulatory compliant break scheduling directly within the vehicle routing optimization framework. By considering driver break requirements as an integral component of the optimization process, rather than as a post-processing step, the model enables the generation of routes that respect hours of service regulations while minimizing operational costs. This integrated approach facilitates the generation of schedules that are operationally efficient and prioritize driver welfare through driver breaks. We work with a public transit agency from the southern USA, highlight the specific nuances of driver break optimization, and present a Pickup and Delivery Problem with Time Windows formulation for optimizing Microtransit operations and scheduling driver breaks. We validate our approach using real-world data from the transit agency. Our results validate our formulation in producing cost-effective and regulation-compliant solutions.
@inproceedings{liu2024reinforcement,
author = {Liu, Fangqi and Sen, Rishav and Talusan, Jose and Pettet, Ava and Kandel, Aaron and Suzue, Yoshinori and Mukhopadhyay, Ayan and Dubey, Abhishek},
booktitle = {Proceedings of the 24th International Conference on Autonomous Agents and Multiagent Systems, {AAMAS} 2025, Detroit, Michigan},
title = {Reinforcement Learning-based Approach for Vehicle-to-Building Charging with Heterogeneous Agents and Long Term Rewards},
year = {2025},
address = {Richland, SC},
note = {nominated for best paper},
organization = {International Conference on Autonomous Agents and Multi-Agent Systems},
publisher = {International Foundation for Autonomous Agents and Multiagent Systems},
series = {AAMAS '25},
acceptance = {24.5},
category = {selective},
contribution = {lead},
location = {Detroit, Michigan},
what = {This work proposes a reinforcement learning-based approach for vehicle-to-building charging that combines Deep Deterministic Policy Gradient with action masking and policy guidance. The framework models V2B as a Markov decision process with continuous action spaces and constraints, using action masking to ensure feasibility and policy guidance to improve learning efficiency. The approach incorporates domain-specific knowledge about charging physics, building loads, and grid constraints while maintaining flexibility to adapt to new operational scenarios.},
why = {Vehicle-to-building energy management presents a high-dimensional, continuous control problem under uncertainty where traditional optimization methods struggle with real-time responsiveness and scalability. This work is innovative because it combines modern deep reinforcement learning with domain-specific constraints and knowledge, enabling scalable learning of near-optimal charging policies that naturally adapt to building dynamics and user behavior without requiring explicit model calibration.},
results = {Evaluation on real EV fleet data from a major manufacturer demonstrates significant cost savings while meeting all user charging requirements and grid constraints. The learned policies achieve substantial improvements in demand charge reduction and total operating costs compared to both heuristic baselines and model-predictive control approaches.},
keywords = {electric vehicle charging, reinforcement learning, deep deterministic policy gradient, building energy management, demand response, stochastic control},
project_tags = {energy, planning, ML for CPS}
}
Strategic aggregation of electric vehicle batteries as energy reservoirs can optimize power grid demand, benefiting smart and connected communities, especially large office buildings that offer workplace charging. This involves optimizing charging and discharging to reduce peak energy costs and net peak demand, monitored over extended periods (e.g., a month), which involves making sequential decisions under uncertainty and delayed and sparse rewards, a continuous action space, and the complexity of ensuring generalization across diverse conditions. Existing algorithmic approaches, e.g., heuristic-based strategies, fall short in addressing real-time decision-making under dynamic conditions, and traditional reinforcement learning (RL) models struggle with large state-action spaces, multi-agent settings, and the need for long-term reward optimization. To address these challenges, we introduce a novel RL framework that combines the Deep Deterministic Policy Gradient approach (DDPG) with action masking and efficient MILP-driven policy guidance. Our approach balances the exploration of continuous action spaces to meet user charging demands. Using real-world data from a major electric vehicle manufacturer, we show that our approach comprehensively outperforms many well-established baselines and several scalable heuristic approaches, achieving significant cost savings while meeting all charging requirements. Our results show that the proposed approach is one of the first scalable and general approaches to solving the V2B energy management challenge.
@inproceedings{luo2025scalable,
author = {Luo, Baiting and Pettet, Ava and Laszka, Aron and Dubey, Abhishek and Mukhopadhyay, Ayan},
booktitle = {Proceedings of the 13th International Conference on Learning Representations, Singapore},
title = {Scalable Decision-Making In Stochastic Environments Through Learned Temporal Abstraction},
year = {2025},
organization = {International Conference on Learning Representations},
acceptance = {32.8},
category = {selective},
contribution = {colab},
what = {This paper proposes Latent Macro Action Planner, which addresses sequential decision-making in high-dimensional continuous action spaces through learned temporal abstractions. The approach uses a state-conditioned vector quantized variational autoencoder to discretize complex action sequences into manageable macro-actions, enabling efficient planning in pre-constructed latent spaces. The framework combines Monte Carlo Tree Search for planning with learned prior policies, allowing effective exploration and exploitation under both deterministic and stochastic dynamics.},
why = {Planning in high-dimensional continuous action spaces suffers from the curse of dimensionality and the curse of history, making real-time decision-making challenging even with advanced planning methods. This work is innovative because it demonstrates how learned temporal abstractions can dramatically reduce computational complexity while maintaining decision quality, enabling fast planning in complex stochastic environments. The approach bridges the gap between neural representation learning and classical planning methods.},
results = {Evaluation across diverse continuous control tasks including robotic manipulation and autonomous driving demonstrates that the approach achieves better performance with lower decision latency compared to both model-based baselines and direct RL methods. The framework scales effectively to high-dimensional problems where traditional planning becomes infeasible.},
keywords = {temporal abstraction, planning under uncertainty, continuous action spaces, latent representations, Monte Carlo tree search, reinforcement learning, stochastic dynamics},
project_tags = {scalable AI, planning, POMDP}
}
Sequential decision-making in high-dimensional continuous action spaces, particularly in stochastic environments, faces significant computational challenges. We explore this challenge in the traditional offline RL setting, where an agent must learn how to make decisions based on data collected through a stochastic behavior policy. We present Latent Macro Action Planner (L-MAP), which addresses this challenge by learning a set of temporally extended macro-actions through a state-conditional Vector Quantized Variational Autoencoder (VQ-VAE), effectively reducing action dimensionality. L-MAP employs a (separate) learned prior model that acts as a latent transition model and allows efficient sampling of plausible actions. During planning, our approach accounts for stochasticity in both the environment and the behavior policy by using Monte Carlo tree search (MCTS). In offline RL settings, including stochastic continuous control tasks, L-MAP efficiently searches over discrete latent actions to yield high expected returns. Empirical results demonstrate that L-MAP maintains low decision latency despite increased action dimensionality. Notably, across tasks ranging from continuous control with inherently stochastic dynamics to high-dimensional robotic hand manipulation, L-MAP significantly outperforms existing model-based methods and performs on par with strong model-free actor-critic baselines, highlighting the effectiveness of the proposed approach in planning in complex and stochastic environments with high-dimensional action spaces.
@inproceedings{sen2025iccps,
author = {Sen, Rishav and Zhang, Yunuo and Liu, Fangqi and Talusan, Jose Paolo and Pettet, Ava and Suzue, Yoshinori and Mukhopadhyay, Ayan and Dubey, Abhishek},
booktitle = {Proceedings of the ACM/IEEE 16th International Conference on Cyber-Physical Systems (ICCPS)},
title = {Online Decision-Making Under Uncertainty for Vehicle-to-Building Systems},
year = {2025},
address = {New York, NY, USA},
publisher = {Association for Computing Machinery},
series = {ICCPS '25},
acceptance = {28.4},
category = {selective},
contribution = {lead},
location = {California, USA},
numpages = {10},
ranking = {rank1},
what = {This work formulates and solves the vehicle-to-building charging problem as a Markov decision process, emphasizing the challenge of real-time decision-making under uncertainty. The research models the problem using online MCTS-based approaches to handle dynamic electricity pricing, heterogeneous EV chargers, and stochastic EV arrivals. The work integrates domain-knowledge guided exploration with Monte Carlo tree search to enable efficient decision-making that balances immediate operational constraints with long-term energy cost minimization.},
why = {Vehicle-to-building systems present a unique control challenge where centralized optimization traditionally assumes full system knowledge, yet real deployment requires online decision-making under uncertainty in EV arrivals, pricing, and building loads. This work is innovative because it explicitly models the online decision-making nature of V2B coordination, demonstrating how MCTS-based planning can provide near-optimal decisions in high-dimensional, uncertain environments without requiring expensive offline computation.},
results = {Evaluation using real EV data and building information demonstrates that the online MCTS approach achieves significant improvements in total electricity costs while meeting all user charging requirements. The framework shows that principled online decision-making substantially outperforms greedy heuristics and provides better real-world applicability than offline optimization.},
keywords = {vehicle-to-building, EV charging, online optimization, Monte Carlo tree search, stochastic decision-making, demand charge management},
project_tags = {energy, CPS, planning}
}
Vehicle-to-building (V2B) systems combine physical infrastructure such as smart buildings and electric vehicles (EVs) connected to chargers at the building, with digital control mechanisms to manage energy use. By utilizing EVs as flexible energy reservoirs, buildings can dynamically charge and discharge EVs to effectively manage energy usage, and reduce costs under time-variable pricing and demand charge policies. This setup leads to the V2B optimization problem, where buildings coordinate EV charging and discharging to minimize total electricity costs while meeting users’ charging requirements. However, the V2B optimization problem is difficult due to: 1) fluctuating electricity pricing, which includes both energy charges ($/kWh) and demand charges ($/kW); 2) long planning horizons (usually over 30 days); 3) heterogeneous chargers with differing charging rates, controllability, and directionality (unidirectional or bidirectional); and 4) user-specific battery levels at departure to ensure user requirements are met. While existing approaches often model this setting as a single-shot combinatorial optimization problem, we highlight critical limitations in prior work and instead model the V2B optimization problem as a Markov decision process, i.e., a stochastic control process. Solving the resulting MDP is challenging due to the large state and action spaces. To address the challenges of the large state space, we leverage online search, and we counter the action space by using domain-specific heuristics to prune unpromising actions. We validate our approach in collaboration with an EV manufacturer and a smart building operator in California, United States, showing that the proposed framework significantly outperforms state-of-the-art methods.
@misc{baig2024electric,
author = {Baig, Najamuddin Mirza and Pedersen, Liam and Yang, Xin and Baranskaya, Anna and Atkins, Lance and Wray, Kyle and Dubey, Abhishek and Pettet, Geoffrey and Mukhopadhyay, Ayan and Talusan, Jose Paolo and others},
month = oct,
title = {Electric vehicle charging control device},
year = {2024},
howpublished = {US Patent App. 18/309,772},
url = {https://patents.google.com/patent/US20240359585A1/en}
}
@inproceedings{samir2024smartcomp,
author = {Gupta, Samir and Khanna, Agrima and Talusan, Jose Paolo and Said, Anwar and Freudberg, Dan and Mukhopadhyay, Ayan and Dubey, Abhishek},
booktitle = {2024 IEEE International Conference on Smart Computing (SMARTCOMP)},
title = {A Graph Neural Network Framework for Imbalanced Bus Ridership Forecasting},
year = {2024},
acceptance = {32.9},
month = jun,
contribution = {lead},
what = {This paper proposes a Graph Convolutional Network framework for bus ridership forecasting that addresses data sparsity and imbalance issues in public transit occupancy prediction. The approach combines graph neural networks to capture spatial-temporal dependencies with data augmentation and focal loss to handle the heavy-tail occupancy distribution. GCNs model bus networks as graphs where stops and routes capture the transit network structure, enabling the model to learn patterns specific to route dynamics.},
why = {Public transit systems require accurate occupancy forecasting for operational planning, but many routes exhibit sparse data with imbalanced occupancy distributions (most trips have low occupancy, few have high occupancy). GCN-based methods are innovative because they leverage the underlying graph structure of transit networks to learn more expressive representations while handling data sparsity through inductive learning across stops and routes, improving generalization.},
results = {Evaluation on real WEGo Public Transit data from Nashville demonstrates that the GCN approach significantly outperforms traditional baselines including random forest and XGBoost methods, with particular improvements in predicting high-occupancy events that are critical for preventing overcrowding and ensuring service quality.},
keywords = {ridership forecasting, graph neural networks, public transit, occupancy prediction, data imbalance, spatio-temporal modeling},
project_tags = {transit, ML for CPS}
}
Public transit systems are paramount in lowering carbon emissions and reducing urban congestion for environmental sustainability. However, overcrowding has adverse effects on the quality of service, passenger experience, and overall efficiency of public transit causing a decline in the usage of public transit systems. Therefore, it is crucial to identify and forecast potential windows of overcrowding to improve passenger experience and encourage higher ridership. Predicting ridership is a complex task, due to the inherent noise of collected data and the sparsity of overcrowding events. Existing studies in predicting public transit ridership consider only a static depiction of bus networks. We address these issues by first applying a data processing pipeline that cleans noisy data and engineers several features for training. Then, we address sparsity by converting the network to a dynamic graph and using a graph convolutional network, incorporating temporal, spatial, and auto-regressive features, to learn generalizable patterns for each route. Finally, since conventional loss functions like categorical cross-entropy have limitations in addressing class imbalance inherent in ridership data, our proposed approach uses focal loss to refine the prediction focus on less frequent yet task-critical overcrowding instances. Our experiments, using real-world data from our partner agency, show that the proposed approach outperforms existing state-of-the-art baselines in terms of accuracy and robustness.
@inproceedings{talusan2024smartcomp,
author = {Talusan, Jose Paolo and Sen, Rishav and Pettet, Ava and Kandel, Aaron and Suzue, Yoshinori and Pedersen, Liam and Mukhopadhyay, Ayan and Dubey, Abhishek},
booktitle = {2024 IEEE International Conference on Smart Computing (SMARTCOMP)},
title = {OPTIMUS: Discrete Event Simulator for Vehicle-to-Building Charging Optimization},
year = {2024},
month = jun,
acceptance = {32.9},
contribution = {lead},
what = {OPTIMUS is a discrete event simulator for vehicle-to-building charging optimization that combines real-world EV data with flexible policy evaluation. The platform integrates generative models for EV arrivals, building loads, and demand charges with configurable optimization algorithms including greedy heuristics, mixed-integer linear programming, and reinforcement learning. OPTIMUS enables building owners and EV manufacturers to test charging policies under diverse scenarios while accounting for realistic uncertainty in EV arrivals and building operations.},
why = {Deploying V2B charging systems requires understanding how different control policies perform under real-world conditions with diverse uncertainties, yet existing tools either focus on specific optimization techniques or lack the flexibility to accommodate varied operational scenarios. OPTIMUS is innovative because it provides a comprehensive, modular simulation platform that enables practical policy development and evaluation by combining real-world data streams with configurable solution algorithms.},
results = {The platform enables evaluation of diverse V2B charging policies on real building and EV data, supporting policy development through extensive scenario analysis. Results demonstrate the ability to predict policy performance under various conditions including different arrival patterns, building loads, and grid events.},
keywords = {vehicle-to-building, EV charging, discrete event simulation, policy evaluation, optimization, charging management},
project_tags = {energy, CPS, planning}
}
The increasing popularity of electric vehicles has spurred a demand for EV charging infrastructure. In the United States alone, over 160,000 public and private charging ports have been installed. This has stoked fear of potential grid issues in the future. Meanwhile, companies, specifically building owners, are also seeing the opportunity to leverage EV batteries as energy stores to serve as buffers against the electric grid. The main idea is to influence and control charging behavior to provide a certain level of energy resiliency and demand responsiveness to the building from grid events while ensuring that they meet the demands of EV users. However, managing and co-optimizing energy requirements of EVs and cost-saving measures of building owners is a difficult task. First, user behavior and grid uncertainty contribute greatly to the potential effectiveness of different policies. Second, different charger configurations can have drastically different effects on the cost. Therefore, we propose a complete end-to-end discrete event simulator for vehicle-to-building charging optimization. This software is aimed at building owners and EV manufacturers such as Nissan, looking to deploy their charging stations with state-of-the-art optimization algorithms. We provide a complete solution that allows the owners to train, evaluate, introduce uncertainty, and benchmark policies on their datasets. Lastly, we discuss the potential for extending our work with other vehicle-to-grid deployments.
@article{tcpsislam24,
author = {Islam, Md. Jaminur and Talusan, Jose Paolo and Bhattacharjee, Shameek and Tiausas, Francis and Dubey, Abhishek and Yasumoto, Keiichi and Das, Sajal K.},
journal = {ACM Trans. Cyber-Phys. Syst.},
title = {Scalable Pythagorean Mean-based Incident Detection in Smart Transportation Systems},
year = {2024},
issn = {2378-962X},
month = may,
number = {2},
volume = {8},
address = {New York, NY, USA},
articleno = {20},
contribution = {colab},
doi = {10.1145/3603381},
issue_date = {April 2024},
keywords = {Weakly unsupervised learning, anomaly detection, smart transportation, graph algorithms, cluster analysis, regression, incident detection, approximation algorithm},
numpages = {25},
publisher = {Association for Computing Machinery},
url = {https://doi.org/10.1145/3603381}
}
Modern smart cities need smart transportation solutions to quickly detect various traffic emergencies and incidents in the city to avoid cascading traffic disruptions. To materialize this, roadside units and ambient transportation sensors are being deployed to collect speed data that enables the monitoring of traffic conditions on each road segment. In this article, we first propose a scalable data-driven anomaly-based traffic incident detection framework for a city-scale smart transportation system. Specifically, we propose an incremental region growing approximation algorithm for optimal Spatio-temporal clustering of road segments and their data; such that road segments are strategically divided into highly correlated clusters. The highly correlated clusters enable identifying a Pythagorean Mean-based invariant as an anomaly detection metric that is highly stable under no incidents but shows a deviation in the presence of incidents. We learn the bounds of the invariants in a robust manner such that anomaly detection can generalize to unseen events, even when learning from real noisy data. Second, using cluster-level detection, we propose a folded Gaussian classifier to pinpoint the particular segment in a cluster where the incident happened in an automated manner. We perform extensive experimental validation using mobility data collected from four cities in Tennessee and compare with the state-of-the-art ML methods to prove that our method can detect incidents within each cluster in real-time and outperforms known ML methods.
@article{10.1145/3633784,
author = {Senarath, Yasas and Mukhopadhyay, Ayan and Purohit, Hemant and Dubey, Abhishek},
journal = {Digit. Gov.: Res. Pract.},
title = {Designing a Human-centered AI Tool for Proactive Incident Detection Using Crowdsourced Data Sources to Support Emergency Response},
year = {2024},
month = mar,
number = {1},
volume = {5},
address = {New York, NY, USA},
articleno = {9},
contribution = {colab},
doi = {10.1145/3633784},
issue_date = {March 2024},
keywords = {Emergency response, incident detection, human-centered ai tool, crowdsourcing},
numpages = {19},
publisher = {Association for Computing Machinery},
url = {https://doi.org/10.1145/3633784}
}
Time of incident reporting is a critical aspect of emergency response. However, the conventional approaches to receiving incident reports have time delays. Non-traditional sources such as crowdsourced data present an opportunity to detect incidents proactively. However, detecting incidents from such data streams is challenging due to inherent noise and data uncertainty. Naively maximizing detection accuracy can compromise spatial-temporal localization of inferred incidents, hindering response efforts. This article presents a novel human-centered AI tool to address the above challenges. We demonstrate how crowdsourced data can aid incident detection while acknowledging associated challenges. We use an existing CROME framework to facilitate training and selection of best incident detection models, based on parameters suited for deployment. The human-centered AI tool provides a visual interface for exploring various measures to analyze the models for the practitioner’s needs, which could help the practitioners select the best model for their situation. Moreover, in this study, we illustrate the tool usage by comparing different models for incident detection. The experiments demonstrate that the CNN-based incident detection method can detect incidents significantly better than various alternative modeling approaches. In summary, this research demonstrates a promising application of human-centered AI tools for incident detection to support emergency response agencies.
@article{proactivewildfire,
author = {Kadir, Salah Uddin and Majumder, Subir and Srivastava, Anurag K. and Chhokra, Ajay Dev and Neema, Himanshu and Dubey, Abhishek and Laszka, Aron},
journal = {IEEE Transactions on Industrial Informatics},
title = {Reinforcement-Learning-Based Proactive Control for Enabling Power Grid Resilience to Wildfire},
year = {2024},
issn = {1941-0050},
month = jan,
number = {1},
pages = {795--805},
volume = {20},
contribution = {colab},
doi = {10.1109/TII.2023.3263500},
keywords = {power grid resilience, reinforcement learning, wildfire propagation, proactive control, critical infrastructure, machine learning, emergency response, optimization},
what = {This work develops a reinforcement learning-based proactive control approach for power grid resilience during wildfire events. The researchers model the power system and wildfire propagation using a detailed integrated testbed that captures both the spatial dynamics of fire spread across a geographical grid and the operational constraints of the power network. A deep reinforcement learning agent is trained to make real-time decisions about generator control and load management during extreme weather events, coordinating with multiple microgrids and transmission systems to minimize load shedding and outages.},
why = {Traditional power system operations rely on myopic load dispatch and post-event remedial actions, which are insufficient for managing cascading failures during wildfires. This work is innovative because it combines machine learning with physics-based wildfire propagation models to enable proactive, anticipatory decision-making. Rather than reacting after failures occur, the approach allows operators to preemptively adjust power flows and resources, representing a significant advance in how autonomous systems can handle extreme events in critical infrastructure.},
results = {The proposed approach successfully reduces power loss through increased power flow rerouting during wildfire events compared to baseline myopic control policies. The integrated testbed demonstrates that the RL-based controller can provide decision support to operators while maintaining computational tractability. Testing on a realistic IEEE power system mapped to geographical terrain shows that proactive control achieves substantial improvements in reducing load outages and providing resilience, with the ability to be deployed in real-time alongside human operators.},
project_tags = {energy, emergency, scalable AI, CPS, ML for CPS}
}
Industrial electric power grid operation subject to an extreme event requires decision making by human operators under stressful conditions. Decision making using system data informatics under adverse dynamic events, especially if forecasted, should be supplemented by intelligent proactive control. Power transmission system operation during wildfires requires resiliency-driven proactive control for load shedding, line switching, and resource allocation considering the dynamics of the wildfire and failure propagation to minimize the impact on the system. However, the possible number of line and load switching in an extensive industrial system during an event make the traditional prediction-driven and stochastic approaches computationally intractable, leading operators to often use preplanned or greedy algorithms. In this article, we model and solve the proactive control problem as a Markov decision process (MDP) and introduce an integrated testbed for spatiotemporal wildfire propagation and proactive power-system operation. Our approach allows the controller to provide setpoints for all generation fleets in the power grid. We evaluate our approach utilizing the IEEE test system mapped onto a hypothetical terrain. Our results show that the proposed approach can help the operator to reduce load outage during an extreme event. It reduces power flow through lines that are to be de-energized and adjusts the load demand by increasing power flow through other lines.
@inproceedings{10.5555/3692070.3693934,
author = {Sivagnanam, Amutheezan and Pettet, Ava and Lee, Hunter and Mukhopadhyay, Ayan and Dubey, Abhishek and Laszka, Aron},
booktitle = {Proceedings of the 41st International Conference on Machine Learning},
title = {Multi-agent reinforcement learning with hierarchical coordination for emergency responder stationing},
year = {2024},
publisher = {JMLR.org},
series = {ICML'24},
articleno = {1864},
contribution = {colab},
acceptance = {27.5},
location = {Vienna, Austria},
numpages = {22},
url = {https://www.arxiv.org/pdf/2405.13205v1}
}
An emergency responder management (ERM) system dispatches responders, such as ambulances, when it receives requests for medical aid. ERM systems can also proactively reposition responders between predesignated waiting locations to cover any gaps that arise due to the prior dispatch of responders or significant changes in the distribution of anticipated requests. Optimal repositioning is computationally challenging due to the exponential number of ways to allocate responders between locations and the uncertainty in future requests. The state-of-the-art approach in proactive repositioning is a hierarchical approach based on spatial decomposition and online Monte Carlo tree search, which may require minutes of computation for each decision in a domain where seconds can save lives. We address the issue of long decision times by introducing a novel reinforcement learning (RL) approach, based on the same hierarchical decomposition, but replacing online search with learning. To address the computational challenges posed by large, variable-dimensional, and discrete state and action spaces, we propose: (1) actor-critic based agents that incorporate transformers to handle variable-dimensional states and actions, (2) projections to fixed-dimensional observations to handle complex states, and (3) combinatorial techniques to map continuous actions to discrete allocations. We evaluate our approach using real-world data from two U.S. cities, Nashville, TN and Seattle, WA. Our experiments show that compared to the state of the art, our approach reduces computation time per decision by three orders of magnitude, while also slightly reducing average ambulance response time by 5 seconds.
@inproceedings{10588394,
author = {Richardson, Alex and Wang, Xia and Dubey, Abhishek and Sprinkle, Jonathan},
booktitle = {2024 IEEE Intelligent Vehicles Symposium (IV)},
title = {Reinforcement Learning with Communication Latency with Application to Stop-and-Go Wave Dissipation},
year = {2024},
pages = {1187--1193},
contribution = {minor},
doi = {10.1109/IV55156.2024.10588394},
keywords = {Training;Intelligent vehicles;5G mobile communication;Process control;Reinforcement learning;Mobile handsets;Safety}
}
In this work, we test the influence of several levels of communication and processing corresponding latency for traffic wave dissipation control. The approach uses Connected and Automated Vehicles (CAVs) that are controlled in simulation through reinforcement learning and non-reinforcement learning controllers, and compares their performance with a pure human driving scenario that has no control latency. We measure the performances with respect to average traffic speed (aspect of traffic mobility), traffic speed standard deviation (aspect of traffic smoothness), and percentage of compliance with a custom designed safety monitor (aspect of traffic safety). The work shows that reinforcement learned controllers can perform with almost no deterioration in performance with latencies of 1 s or less. Non-reinforcement learning controllers, which are not intentionally modeled with latency in mind, show rapid deterioration in performance with any unexpected latency, which shows that the motivating problem requires a solution that is robust to latency. The paper discusses the training and reward function modifications required in order to consider latency as part of the framework, and discusses how the results may be suitable for deployment on high-latency networks such as mobile phones, without a 5G deployment.
@inproceedings{10826139,
author = {Zhou, Shuang and Shekhar, Shashank and Chhokra, Ajay and Dubey, Abhishek and Gokhale, Aniruddha},
booktitle = {2024 IEEE International Conference on Big Data (BigData)},
title = {Drift Detection and Adaptation for Federated Learning in IoT with Adaptive Device Management},
year = {2024},
pages = {8088--8097},
contribution = {minor},
doi = {10.1109/BigData62323.2024.10826139},
keywords = {performance evaluation, privacy, federated learning, spectral efficiency, concept drift, storage management, prototypes, stability analysis, Internet of Things, streams, online federated learning, federated continual learning, communication efficient}
}
Federated learning (FL) is a promising approach for edge/IoT-based distributed machine learning, where both privacy and bandwidth efficiency are essential. However, as time progresses, edge/IoT-based FL faces challenges such as unpredictable concept drift, leading to model performance degradation and the need for frequent retraining. To address these challenges, we propose a federated learning framework designed for heterogeneous IoT devices, capable of handling continuous data distribution changes while accounting for limited storage resources. Our framework introduces a server-side drift detection method to minimize bandwidth usage and optimize retraining times, conserving IoT device resources. We also present an efficient storage management strategy to mitigate catastrophic forgetting by selectively managing incoming data streams within device constraints. Additionally, we develop an exemplar-based online continual learning algorithm that leverages class prototypes in the deep feature space to further combat catastrophic forgetting. We evaluate our framework on image classification tasks using ImageNet and CIFAR-100 datasets across four model architectures, demonstrating significant improvements in adaptation to concept drift and long-term performance stability compared to baseline FL approaches.
@inproceedings{an2024enablingmctsexplainabilitysequential,
author = {An, Ziyan and Baier, Hendrik and Dubey, Abhishek and Mukhopadhyay, Ayan and Ma, Meiyi},
title = {Enabling {MCTS} Explainability for Sequential Planning Through Computation Tree Logic},
year = {2024},
archiveprefix = {arXiv},
booktitle = {{ECAI} 2024 - 27th European Conference on Artificial Intelligence},
contribution = {colab},
acceptance = {23},
eprint = {2407.10820},
location = {Santiago de Compostela, Spain},
primaryclass = {cs.AI},
url = {https://arxiv.org/abs/2407.10820},
what = {This paper presents a computation tree logic (CTL)-based explainable Monte Carlo tree search framework for sequential decision-making in transportation systems. The work develops a systematic approach to translate non-technical user queries into CTL logic formulas that can be verified against MCTS search trees. The framework incorporates specialized explainers that generate human-readable natural language responses describing why the planning algorithm selected particular actions, with explanations tailored to user concerns about route efficiency, time constraints, and alternative options.},
why = {Monte Carlo tree search is a powerful planning algorithm but its decision-making process is opaque to non-technical users like transit dispatchers. This work addresses a critical gap by making sequential planning algorithms interpretable without sacrificing performance. The innovation lies in bridging formal verification methods with practical natural language explanation, enabling AI-based transit systems to communicate their reasoning to human operators who must ultimately trust and validate the recommendations.},
results = {The CTL-based explainer successfully generates human-readable explanations for MCTS decisions in transit routing scenarios with different query types including efficiency queries, contrastive queries, and tree exploration questions. User studies with 82 participants demonstrate that the proposed framework significantly outperforms baseline visualization methods in user understanding, satisfaction, and trust. The approach maintains computational efficiency while providing comprehensive explanations suitable for real-world transit dispatch applications.},
keywords = {explainable AI, Monte Carlo tree search, sequential planning, natural language explanation, transportation, user interpretability, transit routing, human-AI interaction},
project_tags = {transit, Explainable AI, middleware}
}
Monte Carlo tree search (MCTS) is one of the most capable online search algorithms for sequential planning tasks, with significant applications in areas such as resource allocation and transit planning. Despite its strong performance in real-world deployment, the inherent complexity of MCTS makes it challenging to understand for users without technical background. This paper considers the use of MCTS in transportation routing services, where the algorithm is integrated to develop optimized route plans. These plans are required to meet a range of constraints and requirements simultaneously, further complicating the task of explaining the algorithm’s operation in real-world contexts. To address this critical research gap, we introduce a novel computation tree logic-based explainer for MCTS. Our framework begins by taking user-defined requirements and translating them into rigorous logic specifications through the use of language templates. Then, our explainer incorporates a logic verification and quantitative evaluation module that validates the states and actions traversed by the MCTS algorithm. The outcomes of this analysis are then rendered into human-readable descriptive text using a second set of language templates. The user satisfaction of our approach was assessed through a survey with 82 participants. The results indicated that our explanatory approach significantly outperforms other baselines in user preference.
@inproceedings{baiting2024AAMAS,
author = {Luo, Baiting and Zhang, Yunuo and Dubey, Abhishek and Mukhopadhyay, Ayan},
booktitle = {Proceedings of the 23rd International Conference on Autonomous Agents and Multiagent Systems},
title = {Act as You Learn: Adaptive Decision-Making in Non-Stationary Markov Decision Processes},
year = {2024},
address = {Richland, SC},
pages = {1301--1309},
acceptance = {20},
publisher = {International Foundation for Autonomous Agents and Multiagent Systems},
series = {AAMAS '24},
contribution = {colab},
isbn = {9798400704864},
keywords = {non-stationary environments, adaptive learning, decision-making under uncertainty, Monte Carlo tree search, policy learning, risk-aware planning, dynamic systems},
location = {Auckland, New Zealand},
numpages = {9},
what = {This paper addresses adaptive decision-making in non-stationary Markov decision processes where the environment changes over time and the agent's learned policy may become outdated. The researchers develop an approach that combines offline learning using stored policy values with online Monte Carlo tree search to handle environments where both the dynamics and reward structures can shift. The method employs a dual-phase adaptive sampling strategy that balances exploration of unfamiliar regions with exploiting promising actions based on both the previous policy and current environment estimates.},
why = {Most decision-making algorithms assume either completely known environments or stationary dynamics, neither of which holds in real-world systems like emergency response where conditions change unpredictably. This work is innovative because it explicitly addresses the challenge of maintaining safety and performance as the environment evolves. By combining risk-averse tree search with Bayesian uncertainty quantification, the approach enables agents to learn quickly from new data while avoiding pessimistic planning that would sacrifice performance.},
results = {The proposed approach demonstrates superior adaptation compared to standard Monte Carlo tree search and other baselines across multiple environments including control and navigation tasks. The method successfully learns updated policies for new environments while maintaining robustness to changing dynamics. Experiments on standard benchmarks show that the risk-aware sampling strategy enables faster convergence and better performance than approaches that treat environment changes monolithically, proving the value of explicitly modeling uncertainty.},
project_tags = {POMDP, scalable AI, middleware}
}
A fundamental challenge in sequential decision-making is dealing with non-stationary environments, where exogenous environmental conditions change over time. Such problems are traditionally modeled as non-stationary Markov decision processes (NS-MDP). However, existing approaches for decision-making in NS-MDPs have two major shortcomings: first, they assume that the updated environmental dynamics at the current time are known (although future dynamics can change); and second, planning is largely pessimistic, i.e., the agent acts “safely” to account for the non-stationary evolution of the environment. We argue that both these assumptions are invalid in practice—updated environmental conditions are rarely known, and as the agent interacts with the environment, it can learn about the updated dynamics and avoid being pessimistic, at least in states whose dynamics it is confident about. We present a heuristic search algorithm called Adaptive Monte Carlo Tree Search (ADA-MCTS) that addresses these challenges. We show that the agent can learn the updated dynamics of the environment over time and then act as it learns, i.e., if the agent is in a region of the state space about which it has updated knowledge, it can avoid being pessimistic. To quantify “updated knowledge,” we disintegrate the aleatoric and epistemic uncertainty in the agent’s updated belief and show how the agent can use these estimates for decision-making. We compare the proposed approach with multiple state-of-the-art approaches in decision-making across multiple well-established open-source problems and empirically show that our approach is faster and more adaptive without sacrificing safety.
@techreport{barbour2024tdot,
author = {Barbour, William and Baroud, Hiba and Dubey, Abhishek and Sprinkle, Jonathan and Work, Daniel},
title = {{TDOT} {RDS} Data Quality Assurance and High-Resolution Content Enhancement},
institution = {Tennessee Department of Transportation},
year = {2024},
url = {https://trid.trb.org/View/2499199}
}
@article{pani2024,
author = {Pani, Agnivesh and Puppala, Harish and Jha, Shreepati and Gupta, Ankit and Mukhopadhyay, Ayan and Dubey, Abhishek},
journal = {Transportation Research Record},
title = {Enhancing Urban Mobility with Aerial Ropeway Transit ({ART}): Future Accessibility Impacts of Multimodal Transit Expansion Scenarios},
year = {2024},
pages = {03611981241270180},
contribution = {colab},
doi = {10.1177/03611981241270180}
}
Aerial ropeway transit (ART) systems are emerging alternatives to augment existing transit systems in congested cities in the Global South, especially in urban areas with limited transit coverage because of road width constraints or topography. Integration of aerial cable car stations to an existing transit network can improve the overall accessibility of various population segments with significant positive benefits in relation to reducing transport-related social exclusion. This study evaluated the impact of introducing ART in the city of Varanasi (India) and assessed the spatial accessibility improvements to critical facility locations such as heritage sites, educational institutions, hospitals, and employment centers. Several multimodal transit expansion scenarios were considered in this study and the potential benefits of each case were quantified using the two-step floating catchment area (2SFCA) method. A multi-criteria decision-making (MCDM) approach was subsequently employed for identifying the optimal locations of ART stops. Microlevel analysis findings suggest that the mean accessibility values could increase up to 10.92% in the first phase of the ART implementation, which could subsequently increase to 24.7% and 49.8% for the subsequent transit expansion scenarios. The study also investigated the Varanasi ART DPR prepared by Varanasi Development Authority (VDA) and showed that a significant increase of 16% in accessibility levels could be achieved if optimal stop locations identified in this study were implemented. The proposed two-step (2SFCA+MCDM) method for identifying the optimal locations of ART stations in a multimodal transit network is expected to be an effective tool for transit system redesign using place-based accessibility measures.
@inproceedings{paviaIJCAI24AISG,
author = {Pavia, Sophie and Rogers, David and Sivagnanam, Amutheezan and Wilbur, Michael and Edirimanna, Danushka and Kim, Youngseok and Pugliese, Philip and Samaranayake, Samitha and Laszka, Aron and Mukhopadhyay, Ayan and Dubey, Abhishek},
booktitle = {Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence},
title = {Deploying Mobility-on-Demand for All by Optimizing Paratransit Services},
year = {2024},
series = {IJCAI '24},
articleno = {822},
contribution = {lead},
acceptance = {15},
doi = {10.24963/ijcai.2024/822},
isbn = {978-1-956792-04-1},
location = {Jeju, Korea},
numpages = {8},
url = {https://doi.org/10.24963/ijcai.2024/822},
what = {This work develops a software framework and routing application for paratransit and microtransit services operating in urban environments. The SmartTransit.AI system integrates multiple ridesharing algorithms including both day-ahead optimization for planned trips and real-time dynamic vehicle routing problem solvers. The framework provides operational interfaces for dispatchers, a vehicle operator mobile application, and a user-facing booking interface. The system incorporates state-of-the-art algorithms while addressing practical constraints like time windows, vehicle capacity limitations, and service accessibility requirements.},
why = {Paratransit services are critical for accessibility but face significant operational challenges due to complex constraints not present in traditional ridesharing. Existing commercial systems are often inflexible and fail to adapt to real-world conditions, while research algorithms are difficult to deploy in practice. This work is innovative because it bridges this gap by creating a modular software system that can accommodate different algorithmic approaches and constraints specific to transit agencies, while also providing human operators the ability to override and validate system recommendations.},
results = {The deployed system demonstrates substantial operational improvements when tested with real paratransit data, showing significantly higher shared ride rates and reduced vehicle miles compared to baseline approaches. Pilot testing in Chattanooga, Tennessee with the Chattanooga Area Regional Transportation Authority validates the system's ability to improve both efficiency and service quality in a real operational environment. The results show clear benefits in reducing operational costs while maintaining service accessibility.},
keywords = {paratransit optimization, microtransit, vehicle routing, shared mobility, transportation dispatch, accessibility, real-time optimization, mobility-on-demand},
project_tags = {transit, energy, planning, scalable AI, middleware}
}
While on-demand ride-sharing services have become popular in recent years, traditional on-demand transit services cannot be used by everyone, e.g., people who use wheelchairs. Paratransit services, operated by public transit agencies, are a critical infrastructure that offers door-to-door transportation assistance for individuals who face challenges in using standard transit routes. However, with declining ridership and mounting financial pressure, public transit agencies in the USA struggle to operate existing services. We collaborate with a public transit agency from the southern USA, highlight the specific nuances of paratransit optimization, and present a vehicle routing problem formulation for optimizing paratransit. We validate our approach using real-world data from the transit agency, present results from an actual pilot deployment of the proposed approach in the city, and show how the proposed approach comprehensively outperforms existing approaches used by the transit agency. To the best of our knowledge, this work presents one of the first examples of using open-source algorithmic approaches for paratransit optimization.
@inproceedings{paviaIJCAI24demo,
author = {Pavia, Sophie and Rogers, David and Sivagnanam, Amutheezan and Wilbur, Michael and Edirimanna, Danushka and Kim, Youngseok and Mukhopadhyay, Ayan and Pugliese, Philip and Samaranayake, Samitha and Laszka, Aron and Dubey, Abhishek},
booktitle = {Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence},
title = {{SmartTransit.AI}: A Dynamic Paratransit and Microtransit Application},
year = {2024},
series = {IJCAI '24},
articleno = {1028},
contribution = {lead},
acceptance = {15},
doi = {10.24963/ijcai.2024/1028},
isbn = {978-1-956792-04-1},
location = {Jeju, Korea},
numpages = {4},
url = {https://doi.org/10.24963/ijcai.2024/1028},
what = {This paper presents a demonstration of the SmartTransit.AI software system for dynamic paratransit and microtransit operations. The demo showcases the complete software architecture including web-based operations management interfaces, vehicle operator mobile applications, and real-time optimization components. The system is demonstrated using generative demand models based on real passenger data and shows how the various system components integrate to support both offline scheduling and online dispatch decisions for shared mobility services.},
why = {Deploying algorithmic innovations in real transit systems requires solving numerous practical challenges beyond optimization, including human-computer interfaces, real-time data integration, and software robustness. This demonstration work is valuable because it illustrates how research algorithms can be translated into functional systems that transit agencies can actually use. The modular architecture enables adaptation to different agency constraints and algorithms, making it a practical tool for understanding how to implement advanced transit optimization in practice.},
results = {The demonstration effectively shows how SmartTransit.AI enables real-time management of paratransit services through multiple coordinated interfaces. The system successfully processes multiple data feeds, generates optimized routes, and presents results to operators and users in actionable formats. The integrated visualization and optimization components demonstrate the feasibility of deploying sophisticated algorithms in actual transit operations, with the ability to handle the dynamic constraints and information flows required in practice.},
keywords = {transit software, paratransit operations, microtransit, real-time optimization, software architecture, operational interfaces, transportation dispatch, system integration},
project_tags = {transit, middleware, ML for CPS}
}
New rideshare and shared mobility services have transformed urban mobility in recent years. Such services have the potential to improve efficiency and reduce costs by allowing users to share rides in high-capacity vehicles and vans. Most transit agencies already operate various ridepooling services, including microtransit and paratransit. However, the objectives and constraints for implementing these services vary greatly between agencies and can be challenging. First, off-the-shelf ridepooling formulations must be adapted for real-world conditions and constraints. Second, the lack of modular and reusable software makes it hard to implement and evaluate new ridepooling algorithms and approaches in real-world settings. We demonstrate a modular on-demand public transportation scheduling software for microtransit and paratransit services. The software is aimed at transit agencies looking to incorporate state-of-the-art rideshare and ridepooling algorithms in their everyday operations. We provide management software for dispatchers and mobile applications for drivers and users and conclude with results from the demonstration in Chattanooga, TN.
@inproceedings{pettet2024decision,
author = {Pettet, Ava and Zhang, Yunuo and Luo, Baiting and Wray, Kyle and Baier, Hendrik and Laszka, Aron and Dubey, Abhishek and Mukhopadhyay, Ayan},
booktitle = {Proceedings of the 23rd International Conference on Autonomous Agents and Multiagent Systems},
title = {Decision Making in Non-Stationary Environments with Policy-Augmented Search},
year = {2024},
address = {Richland, SC},
acceptance = {36},
pages = {2417--2419},
publisher = {International Foundation for Autonomous Agents and Multiagent Systems},
series = {AAMAS '24},
contribution = {lead},
isbn = {9798400704864},
note = {extended abstract},
keywords = {non-stationary MDPs, policy learning, Monte Carlo tree search, sequential decision-making, online planning, offline learning, policy augmentation},
location = {Auckland, New Zealand},
numpages = {3},
what = {This paper develops a policy-augmented Monte Carlo tree search framework for making decisions in non-stationary environments where the agent's policy may need to be updated as conditions change. The approach combines offline policy learning using Q-values learned in a previous environment with online MCTS planning to handle the case where environment dynamics have shifted. The method includes theoretical analysis showing conditions under which combining the learned policy with online search ensures the algorithm selects optimal or near-optimal actions.},
why = {Many real-world decision-making systems face the challenge that environmental conditions change over time, making previously learned policies suboptimal. This work is innovative because it provides theoretical guarantees about when and how to combine learned policies with online planning to maintain performance despite environmental changes. The approach is elegant and applicable to a wide range of domains, from emergency response to transportation, where the policy learned in one setting may not be optimal when conditions evolve.},
results = {The theoretical analysis provides conditions under which the policy-augmented approach is guaranteed to select optimal actions, with bounds on the error incurred when policies are updated. Experimental validation on classic control tasks shows that the approach achieves robust performance superior to either pure offline learning or pure online planning when facing non-stationary environments. The method successfully balances the speed of learned policies with the adaptability of online search.},
project_tags = {POMDP, scalable AI}
}
Sequential decision-making is challenging in non-stationary environments, where the environment in which an agent operates can change over time. Policies learned before execution become stale when the environment changes, and relearning takes time and computational effort. Online search, on the other hand, can return sub-optimal actions when there are limitations on allowed runtime. In this paper, we introduce Policy-Augmented Monte Carlo tree search (PA-MCTS), which combines action-value estimates from an out-of-date policy with an online search using an up-to-date model of the environment. We prove several theoretical results about PA-MCTS. We also compare and contrast our approach with AlphaZero, another hybrid planning approach, and Deep Q Learning on several OpenAI Gym environments and show that PA-MCTS outperforms these baselines.
@inproceedings{rishavITSC2024,
author = {Sen, Rishav and Sivagnanam, Amutheezan and Laszka, Aron and Mukhopadhyay, Ayan and Dubey, Abhishek},
title = {Grid-Aware Charging and Operational Optimization for Mixed-Fleet Public Transit},
booktitle = {2024 IEEE 27th International Conference on Intelligent Transportation Systems (ITSC)},
year = {2024},
contribution = {lead},
keywords = {transit optimization, mixed-fleet operations, electric vehicles, dynamic pricing, energy consumption, vehicle scheduling, hierarchical optimization, transportation planning},
what = {This paper presents a comprehensive approach to optimize mixed-fleet public transit systems with both electric and diesel buses under dynamic electricity pricing and charging constraints. The work formulates a hierarchical mixed-integer linear programming model that explicitly addresses the unique challenges of mixed-fleet management including variable charging times, electricity pricing that changes throughout the day, and operational constraints around vehicle range and charging station access. The solution approach decomposes the problem into manageable sub-problems by first assigning blocks to higher-efficiency buses and then optimizing energy consumption for remaining vehicles.},
why = {Transit agencies are rapidly transitioning to electric fleets but lack integrated optimization approaches that handle both the operational complexity and the new economic reality of time-of-use electricity pricing. This work is important because it explicitly models the interplay between scheduling decisions, vehicle assignment, charging logistics, and dynamic pricing—factors often overlooked in traditional transit optimization. The hierarchical approach makes the problem tractable while capturing the essential trade-offs between vehicle efficiency, charging logistics, and electricity costs.},
results = {The optimization approach demonstrates significant cost savings compared to traditional methods that ignore dynamic pricing effects. Testing on realistic instances shows average improvements of 2.58% in operational cost through hierarchical optimization, with additional 6.25% improvements from considering time-of-use electricity pricing. The results show that properly accounting for electricity pricing structures and mixed-fleet constraints can substantially reduce transit operating costs while managing the complexity of modern fleets.},
project_tags = {transit, energy, planning, scalable AI}
}
The rapid growth of urban populations and the increasing need for sustainable transportation solutions have prompted a shift towards electric buses in public transit systems. However, the effective management of mixed fleets consisting of both electric and diesel buses poses significant operational challenges. One major challenge is coping with dynamic electricity pricing, where charging costs vary throughout the day. Transit agencies must optimize charging assignments in response to such dynamism while accounting for secondary considerations such as seating constraints. This paper presents a comprehensive mixed-integer linear programming (MILP) model to address these challenges by jointly optimizing charging schedules and trip assignments for mixed (electric and diesel bus) fleets while considering factors such as dynamic electricity pricing, vehicle capacity, and route constraints. We address the potential computational intractability of the MILP formulation, which can arise even with relatively small fleets, by employing a hierarchical approach tailored to the fleet composition. By using real-world data from the city of Chattanooga, Tennessee, USA, we show that our approach can result in significant savings in the operating costs of the mixed transit fleets.
@inproceedings{talusan2024AAMAS,
author = {Han, Chaeeun and Talusan, Jose Paolo and Freudberg, Dan and Mukhopadhyay, Ayan and Dubey, Abhishek and Laszka, Aron},
booktitle = {Proceedings of the 23rd International Conference on Autonomous Agents and Multiagent Systems},
title = {Forecasting and Mitigating Disruptions in Public Bus Transit Services},
year = {2024},
address = {Richland, SC},
publisher = {International Foundation for Autonomous Agents and Multiagent Systems},
series = {AAMAS '24},
acceptance = {20},
contribution = {colab},
keywords = {transit disruptions, forecasting, predictive maintenance, proactive management, resource allocation, service reliability, data-driven optimization, emergency response},
location = {Auckland, New Zealand},
numpages = {9},
what = {This paper presents a comprehensive software system for managing disruptions in public transit through combined forecasting and mitigation strategies. The approach includes statistical and machine learning models to predict the likelihood of service disruptions, algorithms for selecting optimal locations to station substitute buses, and a simulation environment for validating solutions. The system integrates data-driven disruption forecasting with optimization for positioning reserve resources, enabling transit agencies to proactively respond to anticipated problems rather than reacting after failures occur.},
why = {Public transit agencies struggle with unexpected service disruptions caused by vehicle breakdowns, accidents, and other incidents. Traditional approaches rely on reactive responses that degrade passenger experience and increase operational costs. This work is innovative because it combines predictive modeling with proactive resource positioning, enabling agencies to anticipate problems and position substitutes in advance. The data-driven approach learns from historical disruption patterns to make increasingly accurate predictions.},
results = {The forecasting models successfully predict disruptions with reasonable accuracy on real transit data from a mid-sized US city. The optimization algorithms identify substitute bus positioning strategies that minimize the impact of predicted disruptions on passenger experience and operational efficiency. Integration of forecasting with optimization creates a complete disruption management system that transit agencies can deploy, showing how modern machine learning can be applied to practical transit operations challenges.},
project_tags = {transit, emergency, ML for CPS, middleware}
}
Public transportation systems often suffer from unexpected fluctuations in demand and disruptions, such as mechanical failures and medical emergencies. These fluctuations and disruptions lead to delays and overcrowding, which are detrimental to the passengers’ experience and to the overall performance of the transit service. To proactively mitigate such events, many transit agencies station substitute (reserve) vehicles throughout their service areas, which they can dispatch to augment or replace vehicles on routes that suffer overcrowding or disruption. However, determining the optimal locations where substitute vehicles should be stationed is a challenging problem due to the inherent randomness of disruptions and due to the combinatorial nature of selecting locations across a city. In collaboration with the transit agency of a mid-size U.S. city, we address this problem by introducing data-driven statistical and machine-learning models for forecasting disruptions and an effective randomized local-search algorithm for selecting locations where substitute vehicles are to be stationed. Our research demonstrates promising results in proactive disruption management, offering a practical and easily implementable solution for transit agencies to enhance the reliability of their services. Our results resonate beyond mere operational efficiency—by advancing proactive strategies, our approach fosters more resilient and accessible public transportation, contributing to equitable urban mobility and ultimately benefiting the communities that rely on public transportation the most.
@inproceedings{talusan2024ICCPS,
author = {Talusan, Jose Paolo and Han, Chaeeun and Mukhopadhyay, Ayan and Laszka, Aron and Freudberg, Dan and Dubey, Abhishek},
booktitle = {Proceedings of the ACM/IEEE 15th International Conference on Cyber-Physical Systems (ICCPS)},
title = {An Online Approach to Solving Public Transit Stationing and Dispatch Problem},
year = {2024},
address = {New York, NY, USA},
publisher = {Association for Computing Machinery},
series = {ICCPS '24},
contribution = {lead},
note = {Best paper award},
acceptance = {28.2},
location = {Hong Kong, China},
numpages = {10},
what = {This work develops a software framework for public transit stationing and dispatch that solves the problem of optimally assigning substitute buses when the fixed-line fleet experiences disruptions. The system models the problem as a semi-Markov decision process and uses Monte Carlo tree search to find good dispatching decisions. The approach includes both offline optimization for planned scheduling and online components for responding to real-time disruptions, with integration into a complete transit management system.},
why = {When transit buses break down or experience incidents, agencies must quickly decide which substitute vehicles to dispatch to cover affected trips. This decision-making problem combines aspects of scheduling, resource allocation, and real-time optimization. The work is important because it addresses the practical challenge of making good decisions under uncertainty with limited time and information, using both planning and learning techniques to balance the need for speed with solution quality.},
results = {The MCTS-based approach successfully solves the stationing and dispatch problem for real transit instances, outperforming greedy baseline approaches. The system demonstrates the ability to handle both pre-planned scheduling for known trip patterns and dynamic reallocation when disruptions occur. Results show how tree search methods can effectively explore the space of alternative dispatching strategies to find solutions that minimize passenger impact.},
keywords = {transit dispatch, vehicle routing, disruption response, online optimization, Monte Carlo tree search, resource allocation, real-time decision-making},
project_tags = {transit, emergency, POMDP, middleware}
}
Public bus transit systems provide critical transportation services for large sections of modern communities. On-time performance and maintaining the reliable quality of service is therefore very important. Unfortunately, disruptions caused by overcrowding, vehicular failures, and road accidents often lead to service performance degradation. Though transit agencies keep a limited number of vehicles in reserve and dispatch them to relieve the affected routes during disruptions, the procedure is often ad-hoc and has to rely on human experience and intuition to allocate resources (vehicles) to affected trips under uncertainty. In this paper, we describe a principled approach using non-myopic sequential decision procedures to solve the problem and decide (a) if it is advantageous to anticipate problems and proactively station transit buses near areas with high-likelihood of disruptions and (b) decide if and which vehicle to dispatch to a particular problem. Our approach was developed in partnership with the Metropolitan Transportation Authority for a mid-sized city in the USA and models the system as a semi-Markov decision problem (solved as a Monte-Carlo tree search procedure) and shows that it is possible to obtain an answer to these two coupled decision problems in a way that maximizes the overall reward (number of people served). We sample many possible futures from generative models, each is assigned to a tree and processed using root parallelization. We validate our approach using 3 years of data from our partner agency. Our experiments show that the proposed framework serves 2% more passengers while reducing deadhead miles by 40%.
@inbook{wilbur2023artificialintelligencesmarttransportation,
author = {Wilbur, Michael and Sivagnanam, Amutheezan and Ayman, Afiya and Samaranayake, Samitha and Dubey, Abhishek and Laszka, Aron},
title = {Artificial Intelligence for Smart Transportation},
year = {2024},
archiveprefix = {arXiv},
contribution = {colab},
eprint = {2308.07457},
booktitle = {AI for Social Impact},
primaryclass = {cs.AI},
url = {https://arxiv.org/abs/2308.07457}
}
There are more than 7,000 public transit agencies in the U.S. (and many more private agencies), and together, they are responsible for serving 60 billion passenger miles each year. A well-functioning transit system fosters the growth and expansion of businesses, distributes social and economic benefits, and links the capabilities of community members, thereby enhancing what they can accomplish as a society. Since affordable public transit services are the backbones of many communities, this work investigates ways in which Artificial Intelligence (AI) can improve efficiency and increase utilization from the perspective of transit agencies. This book chapter discusses the primary requirements, objectives, and challenges related to the design of AI-driven smart transportation systems. We focus on three major topics. First, we discuss data sources and data. Second, we provide an overview of how AI can aid decision-making with a focus on transportation. Lastly, we discuss computational problems in the transportation domain and AI approaches to these problems.
@inproceedings{zhang2024,
author = {Zhang, Yunuo and Luo, Baiting and Mukhopadhyay, Ayan and Stojcsics, Daniel and Elenius, Daniel and Roy, Anirban and Jha, Susmit and Maroti, Miklos and Koutsoukos, Xenofon and Karsai, Gabor and Dubey, Abhishek},
booktitle = {2024 International Conference on Assured Autonomy (ICAA)},
title = {Shrinking POMCP: A Framework for Real-Time UAV Search and Rescue},
year = {2024},
pages = {48--57},
contribution = {lead},
doi = {10.1109/ICAA64256.2024.00016},
keywords = {path planning, search and rescue, partial observability, Monte Carlo tree search, autonomous systems, UAV operations, planning under uncertainty, belief state management},
what = {This paper presents a framework for efficient path planning and search in urban environments with uncertain information about target locations and environmental hazards. The approach formulates the problem as a partially observable Markov decision process and develops the Shrinking POMCP algorithm that reduces computational complexity by focusing search on promising regions of the state space. The method combines belief state updates with efficient search tree planning, allowing autonomous systems to locate targets while managing uncertainty in a real-time setting.},
why = {Search and rescue operations require autonomous systems to make decisions under significant uncertainty about target locations and environmental conditions. Traditional planning approaches either require perfect information or are computationally intractable for large environments. This work is innovative because it provides a scalable approach to planning under partial observability by intelligently focusing search resources on regions likely to contain targets, while maintaining the ability to adapt to new information discovered during execution.},
results = {The Shrinking POMCP algorithm demonstrates substantial improvements in computational efficiency compared to standard POMCP approaches while maintaining solution quality. Experimental validation in simulated environments shows that the approach successfully localizes targets while minimizing search time and computational resources. The method proves effective at balancing the need for comprehensive environment coverage with the constraint of limited planning time.},
project_tags = {emergency, POMDP, scalable AI}
}
Efficient path optimization for drones in search and rescue operations faces challenges, including limited visibility, time constraints, and complex information gathering in urban environments. We present a comprehensive approach to optimize UAV-based search and rescue operations in neighborhood areas, utilizing both a 3D AirSim-ROS2 simulator and a 2D simulator. The path planning problem is formulated as a partially observable Markov decision process (POMDP), and we propose a novel "Shrinking POMCP" approach to address time constraints. In the AirSim environment, we integrate our approach with a probabilistic world model for belief maintenance and a neurosymbolic navigator for obstacle avoidance. The 2D simulator employs surrogate ROS2 nodes with equivalent functionality. We compare trajectories generated by different approaches in the 2D simulator and evaluate performance across various belief types in the 3D AirSim-ROS simulator. Experimental results from both simulators demonstrate that our proposed shrinking POMCP solution achieves significant improvements in search times compared to alternative methods, showcasing its potential for enhancing the efficiency of UAV-assisted search and rescue operations.
@article{talusan2023tcps2,
author = {Tiausas, Francis and Yasumoto, Keiichi and Talusan, Jose Paolo and Yamana, Hayato and Yamaguchi, Hirozumi and Bhattacharjee, Shameek and Dubey, Abhishek and Das, Sajal K.},
journal = {ACM Trans. Cyber-Phys. Syst.},
title = {HPRoP: Hierarchical Privacy-preserving Route Planning for Smart Cities},
year = {2023},
issn = {2378-962X},
month = oct,
number = {4},
volume = {7},
address = {New York, NY, USA},
articleno = {27},
contribution = {colab},
doi = {10.1145/3616874},
issue_date = {October 2023},
keywords = {privacy-preserving routing, location privacy, shared mobility, route planning, distributed computation, private information retrieval, transportation networks, user privacy},
numpages = {25},
publisher = {Association for Computing Machinery},
url = {https://doi.org/10.1145/3616874},
what = {This paper develops a hierarchical privacy-preserving route planning approach for autonomous vehicles and shared mobility services. The work addresses the challenge of computing optimal routes while protecting user location privacy through a novel combination of network partitioning and distributed computation. The approach uses Private Information Retrieval techniques to compute routes without revealing origin-destination pairs to service providers, while maintaining routing efficiency comparable to non-private approaches.},
why = {Route planning services inherently require knowledge of user origins and destinations, creating privacy risks that prevent many users from adopting shared mobility services. This work is important because it demonstrates how to achieve privacy guarantees while maintaining practical routing efficiency. The innovation lies in the use of hierarchical route planning with privacy-preserving mechanisms that allows distributed computation without centralizing sensitive location data.},
results = {The hierarchical privacy-preserving approach achieves near-optimal route efficiency while providing strong privacy guarantees, with routes differing by only 5-20% from optimal paths depending on privacy parameters. Computational overhead is manageable, with query processing completing in reasonable time for practical transit applications. The approach validates that privacy and efficiency are not inherently incompatible, providing a model for privacy-preserving transit systems.},
project_tags = {transit, middleware}
}
Route Planning Systems (RPS) are a core component of autonomous personal transport systems essential for safe and efficient navigation of dynamic urban environments with the support of edge-based smart city infrastructure, but they also raise concerns about user route privacy in the context of both privately owned and commercial vehicles. Numerous high-profile data breaches in recent years have fortunately motivated research on privacy-preserving RPS, but most of them are rendered impractical by greatly increased communication and processing overhead. We address this by proposing an approach called Hierarchical Privacy-Preserving Route Planning (HPRoP), which divides and distributes the route-planning task across multiple levels and protects locations along the entire route. This is done by combining Inertial Flow partitioning, Private Information Retrieval (PIR), and Edge Computing techniques with our novel route-planning heuristic algorithm. Normalized metrics were also formulated to quantify the privacy of the source/destination points (endpoint location privacy) and the route itself (route privacy). Evaluation on a simulated road network showed that HPRoP reliably produces routes differing only by ≤ 20% in length from optimal shortest paths, with completion times within ∼ 25 seconds, which is reasonable for a PIR-based approach. On top of this, more than half of the produced routes achieved near-optimal endpoint location privacy (∼ 1.0) and good route privacy (≥ 0.8).
@inproceedings{10407199,
author = {Pandey, S. and Srivastava, A. K. and Dubey, A. and Rahmatian, F.},
booktitle = {2023 IEEE International Conference on Energy Technologies for Future Grids (ETFG)},
title = {A Novel Architecture and Algorithm for Adaptive Synchrophasor Estimation in Renewable-Rich Electrical Distribution System},
year = {2023},
pages = {1--6},
contribution = {minor},
doi = {10.1109/ETFG55873.2023.10407199},
keywords = {Adaptive systems, Estimation, Harmonic analysis, Phasor measurement units, Pollution measurement, Power harmonic filters, Standards, Distribution PMU, Signal, Measurement, Harmonics, DFT, SFFT}
}
Sensing and measurement devices are keeping pace with the advancement in the industrial power distribution system. The ability to provide time-synchronized measurements at a fast reporting rate by distribution-level PMUs (D-PMUs) specially with increasing distributed energy resources (DERs) offer great opportunities for monitoring and control. However, unlike the transmission systems, the distribution system waveforms typically have more noise, harmonics and unbalanced phases, posing unique challenges to estimate phasors at the distribution-level. Lack of specific standards for performance requirements of D-PMUs make this further challenging. This work proposes a novel smart synchrophasor device architecture for estimating phasors on polluted signals. The proposed sensor architecture is adaptive to varying system conditions and can adjust reporting rates based on system demands. The proposed approach employs a Sliding Fast Fourier Transform (SFFT) and Signal Estimation by Minimizing Parameter Residuals (SEMPR) technique to simultaneously estimate the harmonic components along with the fundamental phasor. Further, to accommodate the signals generated from varying system conditions in the distribution system, an approach is proposed to update the measurement model for the PMU estimation using adaptive filtering and goodness-of-fit (GoF) measure.
@inproceedings{baiting2023iccps,
author = {Luo, Baiting and Ramakrishna, Shreyas and Pettet, Ava and Kuhn, Christopher and Dubey, Abhishek and Karsai, Gabor and Mukhopadhyay, Ayan},
booktitle = {Proceedings of the ACM/IEEE 14th International Conference on Cyber-Physical Systems (with CPS-IoT Week 2023)},
title = {Dynamic Simplex: Balancing Safety and Performance in Autonomous Cyber Physical Systems},
year = {2023},
address = {New York, NY, USA},
pages = {177--186},
publisher = {Association for Computing Machinery},
series = {ICCPS '23},
acceptance = {25.6},
contribution = {colab},
doi = {10.1145/3576841.3585934},
isbn = {9798400700361},
location = {San Antonio, TX, USA},
numpages = {10},
url = {https://doi.org/10.1145/3576841.3585934},
what = {This paper addresses the problem of learning and adapting decision-making policies in cyber-physical systems when the operating environment changes. The approach develops a framework for determining when to update from a learned policy to online planning, and how to combine historical knowledge with new information about changed conditions. The work includes theoretical analysis of how policy performance degrades with environmental changes and provides algorithms for adaptation with bounded error.},
why = {Cyber-physical systems often operate in environments that change unpredictably due to weather, equipment failures, or other factors. Approaches that depend entirely on pre-trained policies fail to adapt, while pure online planning is computationally expensive. This work is innovative because it provides a principled framework for deciding when and how to augment learned policies with online planning, enabling systems to maintain safe and efficient operation despite environmental changes.},
results = {The theoretical analysis shows conditions under which a learned policy remains sufficiently accurate despite environmental changes, and quantifies the error incurred when policy updates are needed. Experimental validation demonstrates the approach's ability to maintain system performance when operating conditions shift, outperforming both pure offline learning and pure online planning in uncertain environments. The work provides a practical methodology for policy adaptation in cyber-physical systems.},
keywords = {cyber-physical systems, policy learning, online planning, adaptive control, environmental changes, decision-making, learned models, system resilience},
project_tags = {CPS, ML for CPS, POMDP}
}
Learning Enabled Components (LEC) have greatly assisted cyber-physical systems in achieving higher levels of autonomy. However, LEC’s susceptibility to dynamic and uncertain operating conditions is a critical challenge for the safety of these systems. Redundant controller architectures have been widely adopted for safety assurance in such contexts. These architectures augment LEC "performant" controllers that are difficult to verify with "safety" controllers and the decision logic to switch between them. While these architectures ensure safety, we point out two limitations. First, they are trained offline to learn a conservative policy of always selecting a controller that maintains the system’s safety, which limits the system’s adaptability to dynamic and non-stationary environments. Second, they do not support reverse switching from the safety controller to the performant controller, even when the threat to safety is no longer present. To address these limitations, we propose a dynamic simplex strategy with an online controller switching logic that allows two-way switching. We consider switching as a sequential decision-making problem and model it as a semi-Markov decision process. We leverage a combination of a myopic selector using surrogate models (for the forward switch) and a non-myopic planner (for the reverse switch) to balance safety and performance. We evaluate this approach using an autonomous vehicle case study in the CARLA simulator using different driving conditions, locations, and component failures. We show that the proposed approach results in fewer collisions and higher performance than state-of-the-art alternatives.
@inproceedings{Buckelew2023,
author = {Buckelew, Jacob and Basumallik, Sagnik and Sivaramakrishnan, Vasavi and Mukhopadhyay, Ayan and Srivastava, Anurag K. and Dubey, Abhishek},
booktitle = {2023 IEEE International Conference on Smart Computing (SMARTCOMP)},
title = {Synchrophasor Data Event Detection using Unsupervised Wavelet Convolutional Autoencoders},
year = {2023},
acceptance = {31},
pages = {326--331},
contribution = {lead},
doi = {10.1109/SMARTCOMP58114.2023.00080},
keywords = {power system monitoring, anomaly detection, wavelet analysis, autoencoders, unsupervised learning, phasor measurement units, grid events, real-time detection},
what = {This paper presents an unsupervised machine learning approach for detecting anomalies in power transmission systems using wavelet-based feature extraction combined with convolutional autoencoders. The method processes phasor measurement unit data using discrete wavelet transforms to extract time-frequency features, which are then fed into an autoencoder for anomaly detection. The approach is validated on hardware-in-the-loop simulations and real IEEE 14-bus system data, achieving high detection accuracy without requiring labeled training data.},
why = {Reliable detection of grid events and anomalies is critical for maintaining power system stability and preventing cascading failures. Existing supervised approaches require extensive labeled datasets that are difficult to obtain in practice. This work is important because it demonstrates how unsupervised learning can automatically identify important features of grid events through wavelet analysis, enabling detection of diverse anomalies without labeled examples. The approach is practical for real-time grid monitoring applications.},
results = {The wavelet-convolutional autoencoder framework achieves 97.7% accuracy, 98% precision, and 99.5% recall on power system event detection tasks, substantially outperforming baseline approaches. The method successfully detects various types of grid events including faults and disturbances with minimal false positives. The unsupervised approach significantly reduces the burden of obtaining labeled training data, making it practical for deployment in operational grid monitoring systems.},
project_tags = {energy, CPS, ML for CPS}
}
Timely and accurate detection of events affecting the stability and reliability of power transmission systems is crucial for safe grid operation. This paper presents an efficient unsupervised machine-learning algorithm for event detection using a combination of discrete wavelet transform (DWT) and convolutional autoencoders (CAE) with synchrophasor phasor measurements. These measurements are collected from a hardware-in-the-loop testbed setup equipped with a digital real-time simulator. Using DWT, the detail coefficients of measurements are obtained. Next, the decomposed data is then fed into the CAE that captures the underlying structure of the transformed data. Anomalies are identified when significant errors are detected between input samples and their reconstructed outputs. We demonstrate our approach on the IEEE-14 bus system considering different events such as generator faults, line-to-line faults, line-to-ground faults, load shedding, and line outages simulated on a real-time digital simulator (RTDS). The proposed implementation achieves a classification accuracy of 97.7%, precision of 98.0%, recall of 99.5%, F1 Score of 98.7%, and proves to be efficient in both time and space requirements compared to baseline approaches.
@misc{pavia2023designing,
author = {Pavia, Sophie and Mori, J. Carlos Martinez and Sharma, Aryaman and Pugliese, Philip and Dubey, Abhishek and Samaranayake, Samitha and Mukhopadhyay, Ayan},
title = {Designing Equitable Transit Networks},
year = {2023},
category = {poster},
acceptance = {16},
contribution = {lead},
howpublished = {ACM Conference on Equity and Access in Algorithms, Mechanisms, and Optimization (EAAMO)},
preprint = {https://arxiv.org/abs/2212.12007},
what = {This paper develops a mathematical programming framework for designing equitable transportation networks that explicitly consider fairness in access and service quality across different populations. The work formulates the transit design problem with multiple fairness objectives including utilitarian welfare and egalitarian service guarantees. The approach enables policymakers to make explicit trade-offs between different fairness metrics and provides solutions that can accommodate diverse populations with varying transit needs and constraints.},
why = {Transit systems often disproportionately serve some populations while under-serving others, perpetuating social inequities. Traditional optimization focuses purely on efficiency without considering fairness implications. This work is innovative because it provides a rigorous framework for explicitly incorporating fairness into transit design decisions, enabling transit agencies to balance efficiency with equity. The inclusion of priority profiles allows different user groups to be weighted differently in the optimization.},
results = {The mixed-integer linear programming formulations successfully compute transit networks that achieve different fairness objectives, with Pareto-optimal trade-offs between utilitarian and egalitarian goals. Experimental results on the Chattanooga transit network demonstrate how the framework can be used to design routes that serve diverse populations more equitably while maintaining operational efficiency. The work provides transit agencies with tools to evaluate and improve equity in their service.},
keywords = {transit equity, transportation planning, fairness, network design, optimization, service accessibility, social justice, equitable systems},
project_tags = {transit, planning}
}
Public transit is an essential infrastructure enabling access to employment, healthcare, education, and recreational facilities. While accessibility to transit is important in general, some sections of the population depend critically on transit. However, existing public transit is often not designed equitably, and often, equity is only considered as an additional objective post hoc, which hampers systemic changes. We present a formulation for transit network design that considers different notions of equity and welfare explicitly. We study the interaction between network design and various concepts of equity and present trade-offs and results based on real-world data from a large metropolitan area in the United States of America.
@article{wilbur2022_trr,
author = {Wilbur, Michael and Ayman, Afiya and Sivagnanam, Amutheezan and Ouyang, Anna and Poon, Vincent and Kabir, Riyan and Vadali, Abhiram and Pugliese, Philip and Freudberg, Daniel and Laszka, Aron and Dubey, Abhishek},
journal = {Transportation Research Record},
title = {Impact of COVID-19 on Public Transit Accessibility and Ridership},
year = {2023},
number = {4},
pages = {531--546},
volume = {2677},
contribution = {minor},
doi = {10.1177/03611981231160531},
eprint = {https://doi.org/10.1177/03611981231160531},
url = {https://doi.org/10.1177/03611981231160531},
what = {This paper investigates how COVID-19 changed public transit ridership patterns across different socioeconomic groups in Nashville and Chattanooga, Tennessee. The work analyzes boarding data, paratransit demand, and cellular mobility data to understand changes in transit usage before, during, and after pandemic restrictions. The study examines whether ridership changes were distributed equitably across populations or if COVID-19 disproportionately affected certain groups including lower-income residents and mobility-impaired transit users.},
why = {COVID-19 caused dramatic shifts in travel patterns and transit demand, but with potentially inequitable impacts across populations. This work is important because it documents how the pandemic affected different demographic groups differently, providing evidence about the resilience and vulnerability of different user populations. The analysis contributes to understanding how external shocks affect transit systems and which populations are most vulnerable to disruptions.},
results = {The analysis shows that COVID-19 caused significant initial declines in ridership that gradually recovered, with ridership declining more in higher-income areas initially but recovery being faster there. The distribution of changes across socioeconomic groups and mobility-impaired users varied, suggesting that the pandemic's impact on transit access was not uniform. The findings highlight the importance of understanding how external disruptions affect different populations and the need to protect transit access for vulnerable groups.},
keywords = {COVID-19 pandemic, transit ridership, equity, socioeconomic disparities, mobility-impaired users, pandemic impacts, transportation resilience, urban mobility},
project_tags = {transit, emergency}
}
COVID-19 has radically transformed urban travel behavior throughout the world. Agencies have had to provide adequate service while navigating a rapidly changing environment with reduced revenue. As COVID-19-related restrictions are lifted, transit agencies are concerned about their ability to adapt to changes in ridership behavior and public transit usage. To aid their becoming more adaptive to sudden or persistent shifts in ridership, we addressed three questions: To what degree has COVID-19 affected fixed-line public transit ridership and what is the relationship between reduced demand and vehicle trips? How has COVID-19 changed ridership patterns and are they expected to persist after restrictions are lifted? Are there disparities in ridership changes across socioeconomic groups and mobility-impaired riders? Focusing on Nashville and Chattanooga, TN, ridership demand and vehicle trips were compared with anonymized mobile location data to study the relationship between mobility patterns and transit usage. Correlation analysis and multiple linear regression were used to investigate the relationship between socioeconomic indicators and changes in transit ridership, and an analysis of changes in paratransit demand before and during COVID-19. Ridership initially dropped by 66% and 65% over the first month of the pandemic for Nashville and Chattanooga, respectively. Cellular mobility patterns in Chattanooga indicated that foot traffic recovered to a greater degree than transit ridership between mid-April and the last week in June, 2020. Education-level had a statistically significant impact on changes in fixed-line bus transit, and the distribution of changes in demand for paratransit services were similar to those of fixed-line bus transit.
@inproceedings{wilbur2023mobility,
author = {Wilbur, Michael and Coursey, Maxime and Koirala, Pravesh and Al-Quran, Zakariyya and Pugliese, Philip and Dubey, Abhishek},
booktitle = {Proceedings of the ACM/IEEE 14th International Conference on Cyber-Physical Systems (with CPS-IoT Week 2023)},
title = {Mobility-On-Demand Transportation: A System for Microtransit and Paratransit Operations},
year = {2023},
address = {New York, NY, USA},
note = {demonstration},
pages = {260--261},
publisher = {Association for Computing Machinery},
series = {ICCPS '23},
contribution = {lead},
doi = {10.1145/3576841.3589625},
isbn = {9798400700361},
keywords = {mobility-on-demand, software systems, microtransit, paratransit, operational software, deployment, system integration, transportation technology},
location = {San Antonio, TX, USA},
numpages = {2},
url = {https://doi.org/10.1145/3576841.3589625},
what = {This paper presents a comprehensive software system for managing mobility-on-demand services including microtransit and paratransit operations. The SmartTransit.AI system provides web-based interfaces for operational management, mobile applications for drivers and users, and modular optimization components that can accommodate different algorithms and constraints. The paper describes the architecture, implementation challenges, and deployment experiences from real-world testing with transit agencies.},
why = {Despite advances in optimization algorithms, deploying ridesharing systems in practice requires solving numerous challenges beyond pure algorithmic optimization including user interfaces, real-time data integration, and operational constraints. This work is valuable because it demonstrates how research algorithms can be integrated into functional systems that transit agencies can actually deploy. The modular architecture enables different agencies to adopt the system while customizing it to their specific operational needs.},
results = {The SmartTransit.AI system successfully demonstrates the feasibility of deploying advanced optimization algorithms in real transit operations. The integrated software system handles both offline planning and real-time optimization for shared mobility services. Real-world deployment results show the system's ability to improve operational efficiency while maintaining usability for operators and accessibility for passengers.},
project_tags = {transit, middleware}
}
New rideshare and shared-mobility services have transformed urban mobility in recent years. Therefore, transit agencies are looking for ways to adapt to this rapidly changing environment. In this space, ridepooling has the potential to improve efficiency and reduce costs by allowing users to share rides in high-capacity vehicles and vans. Most transit agencies already operate various ridepooling services including microtransit and paratransit. However, the objectives and constraints for implementing these services vary greatly between agencies. This brings multiple challenges. First, off-the-shelf ridepooling formulations must be adapted for real-world conditions and constraints. Second, the lack of modular and reusable software makes it hard to implement and evaluate new ridepooling algorithms and approaches in real-world settings. Therefore, we propose an on-demand transportation scheduling software for microtransit and paratransit services. This software is aimed at transit agencies looking to incorporate state-of-the-art rideshare and ridepooling algorithms in their everyday operations. We provide management software for dispatchers and mobile applications for drivers and users. Lastly, we discuss the challenges in adapting state-of-the-art methods to real-world operations.
@inproceedings{youngseo2023,
  author       = {Kim, Youngseo and Edirimanna, Danushka and Wilbur, Michael and Pugliese, Philip and Laszka, Aron and Dubey, Abhishek and Samaranayake, Samitha},
  title        = {Rolling Horizon based Temporal Decomposition for the Offline Pickup and Delivery Problem with Time Windows},
  booktitle    = {Proceedings of the 37th AAAI Conference on Artificial Intelligence (AAAI-23)},
  year         = {2023},
  acceptance   = {19.6},
  contribution = {colab},
  tag          = {ai4cps,transit},
  keywords     = {pickup and delivery, time windows, temporal decomposition, vehicle routing, optimization at scale, rolling horizon, routing algorithms, transportation planning},
  what         = {This paper develops a rolling horizon temporal decomposition approach for solving offline pickup and delivery problems with time windows at scale. The method divides large problem instances into smaller sub-problems by creating overlapping time windows and solving them sequentially. The approach uses a horizon optimization framework that smoothly transitions between time intervals, achieving good solutions that are tractable to compute compared to solving the full problem at once.},
  why          = {The pickup and delivery problem with time windows is computationally challenging, and practical instances often exceed the size that exact solvers can handle. This work is important because it provides a scalable decomposition approach that maintains solution quality while dramatically reducing computation time. The rolling horizon method cleverly addresses the boundary stitching problem that makes temporal decomposition difficult, enabling practical solutions for large real-world instances.},
  results      = {The rolling horizon decomposition framework demonstrates substantial improvements in computation time compared to standard approaches while maintaining near-optimal solution quality. Experimental validation on realistic paratransit and courier instances shows the approach is competitive or superior to baseline methods. The temporal decomposition framework provides a practical approach for solving large transportation optimization problems that would otherwise be intractable.},
  project_tags = {transit, planning, scalable AI}
}
The offline pickup and delivery problem with time windows (PDPTW) is a classical combinatorial optimization problem in the transportation community, which has proven to be very challenging computationally. Due to the complexity of the problem, practical problem instances can be solved only via heuristics, which trade-off solution quality for computational tractability. Among the various heuristics, a common strategy is problem decomposition, that is, the reduction of a large-scale problem into a collection of smaller sub-problems, with spatial and temporal decompositions being two natural approaches. While spatial decomposition has been successful in certain settings, effective temporal decomposition has been challenging due to the difficulty of stitching together the sub-problem solutions across the decomposition boundaries. In this work, we introduce a novel temporal decomposition scheme for solving a class of PDPTWs that have narrow time windows, for which it is able to provide both fast and high-quality solutions. We utilize techniques that have been popularized recently in the context of online dial-a-ride problems along with the general idea of rolling horizon optimization. To the best of our knowledge, this is the first attempt to solve offline PDPTWs using such an approach. To show the performance and scalability of our framework, we use the optimization of paratransit services as a motivating example. Due to the lack of benchmark solvers similar to ours (i.e., temporal decomposition with an online solver), we compare our results with an offline heuristic algorithm using Google OR-Tools. In smaller problem instances (with an average of 129 requests per instance), the baseline approach is as competitive as our framework.
However, in larger problem instances (approximately 2,500 requests per instance), our framework is more scalable and can provide good solutions to problem instances of varying degrees of difficulty, while the baseline algorithm often fails to find a feasible solution within comparable compute times.
@inproceedings{Zulqarnain2023,
author = {Zulqarnain, Ammar and Gupta, Samir and Talusan, Jose Paolo and Pugliese, Philip and Mukhopadhyay, Ayan and Dubey, Abhishek},
booktitle = {2023 IEEE International Conference on Smart Computing (SMARTCOMP)},
title = {Addressing {APC} Data Sparsity in Predicting Occupancy and Delay of Transit Buses: A Multitask Learning Approach},
year = {2023},
acceptance = {31},
contribution = {lead},
what = {This paper develops a multitask learning approach for predicting both occupancy and delay in public transit systems despite sparse and noisy automated passenger counter data. The method uses separate neural network models for occupancy and delay prediction while sharing learned representations between the tasks. The approach includes careful data preprocessing to handle missing values and sensor noise, and demonstrates how multitask learning can improve prediction accuracy by capturing shared patterns between related prediction tasks.},
why = {Public transit agencies increasingly collect detailed operational data through sensors and ticketing systems, but this data is often sparse and noisy. Separate optimization of different prediction tasks neglects important correlations between occupancy and delay. This work is innovative because it shows how multitask learning can leverage shared patterns between related tasks to improve prediction accuracy, particularly valuable when training data is limited for specific routes. The approach enables practical applications despite data quality challenges.},
results = {The multitask learning models outperform single-task baselines for both occupancy and delay prediction across test scenarios. The approach successfully handles data sparsity and noise through careful preprocessing and shared representation learning. Results demonstrate that occupancy and delay predictions can be improved by jointly training on both tasks, providing transit agencies with better predictions for planning and operations.},
keywords = {multitask learning, transit prediction, occupancy forecasting, delay prediction, sparse data, automated passenger counting, machine learning for transit, operational predictions},
project_tags = {transit, ML for CPS}
}
Public transit is a vital mode of transportation in urban areas, and its efficiency is crucial for the daily commute of millions of people. To improve the reliability and predictability of transit systems, researchers have developed separate single-task learning models to predict the occupancy and delay of buses at the stop or route level. However, these models provide a narrow view of delay and occupancy at each stop and do not account for the correlation between the two. We propose a novel approach that leverages broader generalizable patterns governing delay and occupancy for improved prediction. We introduce a multitask learning toolchain that takes into account General Transit Feed Specification feeds, Automatic Passenger Counter data, and contextual temporal and spatial information. The toolchain predicts transit delay and occupancy at the stop level, improving the accuracy of the predictions of these two features of a trip given sparse and noisy data. We also show that our toolchain can adapt to fewer samples of new transit data once it has been trained on previous routes/trips as compared to state-of-the-art methods. Finally, we use actual data from Chattanooga, Tennessee, to validate our approach. We compare our approach against the state-of-the-art methods and we show that treating occupancy and delay as related problems improves the accuracy of the predictions. We show that our approach improves delay prediction significantly by as much as 6% in F1 scores while producing equivalent or better results for occupancy.
@inproceedings{sen2022,
author = {Sen, Rishav and Tran, Toan and Khaleghian, Seyedmehdi and Pugliese, Philip and Sartipi, Mina and Neema, Himanshu and Dubey, Abhishek},
booktitle = {2022 IEEE International Conference on Big Data (Big Data)},
title = {{BTE-Sim}: Fast Simulation Environment For Public Transportation},
year = {2022},
address = {Los Alamitos, CA, USA},
month = dec,
pages = {2886--2894},
publisher = {IEEE Computer Society},
contribution = {lead},
doi = {10.1109/BigData55660.2022.10020973},
keywords = {transit simulation, traffic modeling, computational efficiency, simulation environment, transit planning tools, operational evaluation, vehicle routing, transportation networks},
url = {https://doi.ieeecomputersociety.org/10.1109/BigData55660.2022.10020973},
what = {This paper presents BTE-Sim, a fast simulation environment for public transit systems that enables rapid evaluation of transit designs and operational strategies. The system combines a background traffic elimination module that speeds simulation by efficiently modeling traffic flow, with detailed transit simulation for buses and passengers. The simulator is built on SUMO and includes capabilities for modeling multiple transit service types including fixed-route and demand-responsive services.},
why = {Traditional transit simulation tools are computationally expensive, limiting their practical use for operational planning and optimization. Transit planners need tools that can rapidly evaluate different service designs and operational strategies without simulating every second of a full day. This work is important because it demonstrates how background traffic can be efficiently modeled to dramatically speed simulation while maintaining accuracy, enabling practical use of simulation tools for transit planning.},
results = {The BTE-Sim simulator achieves approximately 13x speedup compared to conventional simulation approaches while maintaining accuracy comparable to detailed simulations. The system successfully simulates complete transit networks including multiple service types and evaluates operational performance metrics. The simulator's efficiency enables practical use in transit planning for evaluating different route designs and service configurations.},
project_tags = {transit, planning, scalable AI}
}
The public commute is essential to all urban centers and is an efficient and environment-friendly way to travel. Transit systems must become more accessible and user-friendly. Since public transit is majorly designed statically, with very few improvements coming over time, it can get stagnated, unable to update itself with changing population trends. To better understand transportation demands and make them more usable, efficient, and demographic-focused, we propose a fast, multi-layered transit simulation that primarily focuses on public transit simulation (BTE-Sim). BTE-Sim is designed based on the population demand, existing traffic conditions, and the road networks that exist in a region. The system is versatile, with the ability to run different configurations of the existing transit routes, or inculcate any new changes that may seem necessary, or even in extreme cases, new transit network design as well. In all situations, it can compare multiple transit networks and provide evaluation metrics for them. It provides detailed data on each transit vehicle, the trips it performs, its on-time performance and other necessary factors. Its highlighting feature is the considerably low computation time it requires to perform all these tasks and provide consistently reliable results.
@inproceedings{talusan2022apc,
author = {Talusan, Jose Paolo and Mukhopadhyay, Ayan and Freudberg, Dan and Dubey, Abhishek},
booktitle = {2022 IEEE International Conference on Big Data (Big Data)},
title = {On Designing Day Ahead and Same Day Ridership Level Prediction Models for City-Scale Transit Networks Using Noisy {APC} Data},
year = {2022},
address = {Los Alamitos, CA, USA},
month = dec,
pages = {5598--5606},
publisher = {IEEE Computer Society},
contribution = {lead},
doi = {10.1109/BigData55660.2022.10020390},
keywords = {transit prediction, occupancy forecasting, delay prediction, automated passenger counting, machine learning, operational planning, transit optimization, real-time information},
url = {https://doi.ieeecomputersociety.org/10.1109/BigData55660.2022.10020390},
what = {This paper presents methods for predicting transit occupancy and delay at both trip and stop levels despite sparse automated passenger counter data. The approach combines data from multiple sources including GTFS schedules, weather data, and historical patterns to develop separate prediction models for different problem formulations. The work demonstrates how to handle data sparsity and noise through careful feature engineering and data aggregation strategies.},
why = {Accurate occupancy and delay predictions are essential for transit agencies to optimize operations and improve passenger information, but predictions are challenging due to sparse sensor data and the complexity of transit dynamics. This work addresses the practical challenge of developing predictive models despite data quality issues that plague real-world transit systems. The end-to-end framework demonstrates how to process raw sensor data into actionable predictions.},
results = {The prediction models achieve reasonable accuracy for occupancy and delay forecasting on real transit data from Nashville. The approach demonstrates how different aggregation strategies and feature engineering choices affect prediction performance. Results show that treating occupancy and delay as related prediction problems improves accuracy compared to separate approaches, providing transit agencies with tools for operational planning.},
project_tags = {transit, ML for CPS}
}
The ability to accurately predict public transit ridership demand benefits passengers and transit agencies. Agencies will be able to reallocate buses to handle under or over-utilized bus routes, improving resource utilization, and passengers will be able to adjust and plan their schedules to avoid overcrowded buses and maintain a certain level of comfort. However, accurately predicting occupancy is a non-trivial task. Various reasons such as heterogeneity, evolving ridership patterns, exogenous events like weather, and other stochastic variables, make the task much more challenging. With the progress of big data, transit authorities now have access to real-time passenger occupancy information for their vehicles. The amount of data generated is staggering. While there is no shortage in data, it must still be cleaned, processed, augmented, and merged before any useful information can be generated. In this paper, we propose the use and fusion of data from multiple sources, cleaned, processed, and merged together, for use in training machine learning models to predict transit ridership. We use data that spans a 2-year period (2020-2022) incorporating transit, weather, traffic, and calendar data. The resulting data, which equates to 17 million observations, is used to train separate models for the trip and stop level prediction. We evaluate our approach on real-world transit data provided by the public transit agency of Nashville, TN. We demonstrate that the trip level model based on Xgboost and the stop level model based on LSTM outperform the baseline statistical model across the entire transit service day.
@article{pettet2021hierarchical,
author = {Pettet, Geoffrey and Mukhopadhyay, Ayan and Kochenderfer, Mykel J. and Dubey, Abhishek},
journal = {ACM Transactions on Cyber-Physical Systems},
title = {Hierarchical Planning for Dynamic Resource Allocation in Smart and Connected Communities},
year = {2022},
issn = {2378-962X},
month = nov,
number = {4},
volume = {6},
address = {New York, NY, USA},
articleno = {32},
contribution = {lead},
doi = {10.1145/3502869},
issue_date = {October 2022},
keywords = {planning under uncertainty, semi-Markov decision process, large-scale CPS, hierarchical planning, dynamic resource allocation},
numpages = {26},
preprint = {https://arxiv.org/abs/2107.01292},
publisher = {Association for Computing Machinery},
url = {https://doi.org/10.1145/3502869}
}
Resource allocation under uncertainty is a classic problem in city-scale cyber-physical systems. Consider emergency response, where urban planners and first responders optimize the location of ambulances to minimize expected response times to incidents such as road accidents. Typically, such problems involve sequential decision making under uncertainty and can be modeled as Markov (or semi-Markov) decision processes. The goal of the decision maker is to learn a mapping from states to actions that can maximize expected rewards. While online, offline, and decentralized approaches have been proposed to tackle such problems, scalability remains a challenge for real world use cases. We present a general approach to hierarchical planning that leverages structure in city level CPS problems for resource allocation. We use emergency response as a case study and show how a large resource allocation problem can be split into smaller problems. We then use Monte Carlo planning for solving the smaller problems and managing the interaction between them. Finally, we use data from Nashville, Tennessee, a major metropolitan area in the United States, to validate our approach. Our experiments show that the proposed approach outperforms state-of-the-art approaches used in the field of emergency response.
@inproceedings{eisele2022Decentralized,
  author       = {Eisele, Scott and Wilbur, Michael and Eghtesad, Taha and Silvergold, Kevin and Eisele, Fred and Mukhopadhyay, Ayan and Laszka, Aron and Dubey, Abhishek},
  title        = {Decentralized Computation Market for Stream Processing Applications},
  booktitle    = {2022 IEEE International Conference on Cloud Engineering (IC2E)},
  year         = {2022},
  month        = oct,
  address      = {Pacific Grove, CA, USA},
  publisher    = {IEEE Computer Society},
  acceptance   = {32.6},
  contribution = {lead},
  keywords     = {blockchain, decentralized computing, resource allocation, market mechanism, edge computing, smart contracts, Apache Pulsar},
  what         = {This work presents a decentralized market mechanism for allocating computing resources in edge and cloud computing environments. The system uses blockchain technology with Apache Pulsar for messaging to enable customers and suppliers to trade slack computing capacity. Through smart contracts, the framework creates allocation contracts that capture participation from both resource providers and consumers, establishing matching between supply and demand without relying on a centralized market authority.},
  why          = {As edge computing becomes critical for supporting IoT and smart city applications, the centralized cloud paradigm becomes increasingly expensive and inflexible. This work is innovative because it combines blockchain's trust properties with practical market mechanisms to create a decentralized resource trading platform. The approach enables direct peer-to-peer resource trading while maintaining privacy and security guarantees through smart contracts.},
  results      = {The proposed market protocol was evaluated with game-theoretic analysis and demonstrated that it incentivizes truthful participation from both customers and suppliers. Through experimentation with streaming computer-vision applications, the authors showed that the decentralized framework can successfully allocate resources and execute service deployments. The system demonstrated robustness in handling the initial deployment delays and maintaining trust without a centralized intermediary.},
  project_tags = {middleware, CPS, scalable AI}
}
While cloud computing is the current standard for outsourcing computation, it can be prohibitively expensive for cities and infrastructure operators to deploy services. At the same time, there are underutilized computing resources within cities and local edge-computing deployments. Using these slack resources may enable significantly lower pricing than comparable cloud computing; such resources would incur minimal marginal expenditure since their deployment and operation are mostly sunk costs. However, there are challenges associated with using these resources. First, they are not effectively aggregated or provisioned. Second, there is a lack of trust between customers and suppliers of computing resources, given that they are distinct stakeholders and behave according to their own interests. Third, delays in processing inputs may diminish the value of the applications. To resolve these challenges, we introduce an architecture combining a distributed trusted computing mechanism, such as a blockchain, with an efficient messaging system like Apache Pulsar. Using this architecture, we design a decentralized computation market where customers and suppliers make offers to deploy and host applications. The proposed architecture can be realized using any trusted computing mechanism that supports smart contracts, and any messaging framework with the necessary features. This combination ensures that the market is robust without incurring the input processing delays that limit other blockchain based solutions. We evaluate the market protocol using game-theoretic analysis to show that deviation from the protocol is discouraged. Finally, we assess the performance of a prototype implementation based on experiments with a streaming computer-vision application.
@inproceedings{mcloughlin2022modular,
author = {McLoughlin, Brendan and Bhandari, Sambridhi and Henrick, Erin and Hotchkiss, Erin and Jha, Manoj and Jiang, Steven and Kern, Emily and Marston, Landon and Vanags, Christopher and Snyder, Caitlin and others},
booktitle = {2022 ASEE Annual Conference \& Exposition},
title = {A modular approach for integrating data science concepts into multiple undergraduate {STEM+C} courses},
year = {2022},
month = aug,
contribution = {minor},
url = {https://peer.asee.org/a-modular-approach-for-integrating-data-science-concepts-into-multiple-undergraduate-stem-c-courses},
what = {This paper presents a systematic approach for integrating data science concepts into multiple undergraduate STEM+C courses across different disciplines. The authors developed twelve reusable instructional modules covering topics such as data collection, data quality, visualization, machine learning, and analysis methods. These modules were designed through a collaborative partnership and tested in courses at different universities and academic levels.},
why = {With increasing data-driven workplaces, educators face challenges in integrating data science into already-full curricula across diverse disciplines. This work is significant because it provides a structured, modular approach that can be adapted across institutions and disciplines while maintaining consistency in learning objectives and assessment practices. The modularity enables instructors to integrate data science without completely restructuring their existing courses.},
results = {The study identified five core data science themes and analyzed how they were integrated differently across environmental science and engineering courses at different academic levels. Key findings showed that while core data science topics appear consistently, the depth and context of integration varies significantly by discipline, level, and institution. This analysis informs the development of widely-applicable guidelines for data science integration in STEM education.},
keywords = {data science education, STEM curriculum integration, instructional modules, undergraduate education},
project_tags = {scalable AI}
}
With increasingly technology-driven workplaces and high data volumes, instructors across STEM+C disciplines are integrating more data science topics into their course learning objectives. However, instructors face significant challenges in integrating additional data science concepts into their already full course schedules. Streamlined instructional modules that are integrated with course content, and cover relevant data science topics, such as data collection, uncertainty in data, visualization, and analysis using statistical and machine learning methods can benefit instructors across multiple disciplines. As part of a cross-university research program, we designed a systematic structural approach–based on shared instructional and assessment principles–to construct modules that are tailored to meet the needs of multiple instructional disciplines, academic levels, and pedagogies. Adopting a research-practice partnership approach, we have collectively developed twelve modules working closely with instructors and their teaching assistants for six undergraduate courses. We identified and coded primary data science concepts in the modules into five common themes: 1) data acquisition, 2) data quality issues, 3) data use and visualization, 4) advanced machine learning techniques, and 5) miscellaneous topics that may be unique to a particular discipline (e.g., how to analyze data streams collected by a special sensor). These themes were further subdivided to make it easier for instructors to contextualize the data science concepts in discipline-specific work. In this paper, we present as a case study the design and analysis of four of the modules, primarily so we can compare and contrast pairs of similar courses that were taught at different levels or at different universities. Preliminary analyses show the wide distribution of data science topics that are common among a number of environmental science and engineering courses. 
We identified commonalities and differences in the integration of data science instruction (through modules) into these courses. This analysis informs the development of a set of key considerations for integrating data science concepts into a variety of STEM + C courses.
@inproceedings{ijcai22Ayan,
author = {Nair, Vineet and Prakash, Kritika and Wilbur, Michael and Taneja, Aparna and Namblard, Corinne and Adeyemo, Oyindamola and Dubey, Abhishek and Adereni, Abiodun and Tambe, Milind and Mukhopadhyay, Ayan},
booktitle = {31st International Joint Conference on Artificial Intelligence (IJCAI)},
title = {{ADVISER}: {AI-Driven} Vaccination Intervention Optimiser for Increasing Vaccine Uptake in {Nigeria}},
year = {2022},
month = jul,
acceptance = {15},
contribution = {minor},
doi = {10.48550/arXiv.2204.13663},
url = {https://arxiv.org/abs/2204.13663},
what = {This paper addresses vaccination uptake optimization in Nigeria through the ADVISER framework, which formulates an integer linear program to maximize cumulative vaccination probability under resource constraints. The system uses AI-driven optimization to allocate heterogeneous health interventions including travel vouchers, phone call reminders, and vaccination drives to mothers across a geographic region. The approach combines greedy algorithms, heuristic pruning, and simulated annealing for solving large-scale instances.},
why = {Improving vaccination coverage in resource-constrained settings like Nigeria is critical for achieving sustainable development goals and reducing maternal-infant mortality. This work is innovative because it moves beyond treating resources as homogeneous, instead modeling how different intervention types have varying effectiveness for different populations. The approach leverages operations research with human-centered design to maximize health outcomes within practical constraints.},
results = {The experimental evaluation demonstrated that the proposed heuristic approach significantly outperforms baseline methods in terms of vaccination uptake. The greedy heuristic with pruning achieved results competitive with optimal solutions while scaling to larger problem instances. The framework showed that the AI-driven allocation strategy can save substantial resources while improving vaccination rates and equity compared to uniform intervention distribution.},
keywords = {vaccination optimization, resource allocation, health interventions, operational research, maternal health},
project_tags = {planning, scalable AI}
}
More than 5 million children under five years die from largely preventable or treatable medical conditions every year, with an overwhelmingly large proportion of deaths occurring in under-developed countries with low vaccination uptake. One of the United Nations’ sustainable development goals (SDG 3) aims to end preventable deaths of newborns and children under five years of age. We focus on Nigeria, where the rate of infant mortality is appalling. We collaborate with HelpMum, a large non-profit organization in Nigeria to design and optimize the allocation of heterogeneous health interventions under uncertainty to increase vaccination uptake, the first such collaboration in Nigeria. Our framework, ADVISER: AI-Driven Vaccination Intervention Optimiser, is based on an integer linear program that seeks to maximize the cumulative probability of successful vaccination. Our optimization formulation is intractable in practice. We present a heuristic approach that enables us to solve the problem for real-world use-cases. We also present theoretical bounds for the heuristic method. Finally, we show that the proposed approach outperforms baseline methods in terms of vaccination uptake through experimental evaluation. HelpMum is currently planning a pilot program based on our approach to be deployed in the largest city of Nigeria, which would be the first deployment of an AI-driven vaccination uptake program in the country and hopefully, pave the way for other data-driven programs to improve health outcomes in Nigeria.
@inproceedings{sivagnanam2022offline,
  author       = {Sivagnanam, Amutheezan and Kadir, Salah Uddin and Mukhopadhyay, Ayan and Pugliese, Philip and Dubey, Abhishek and Samaranayake, Samitha and Laszka, Aron},
  title        = {Offline Vehicle Routing Problem with Online Bookings: A Novel Problem Formulation with Applications to Paratransit},
  booktitle    = {31st International Joint Conference on Artificial Intelligence (IJCAI)},
  year         = {2022},
  month        = jul,
  acceptance   = {15},
  contribution = {colab},
  preprint     = {https://arxiv.org/abs/2204.11992},
  keywords     = {vehicle routing, online optimization, paratransit services, reinforcement learning, demand-responsive transport},
  what         = {This work addresses the offline vehicle routing problem with online bookings for paratransit services, where pickup windows are selected at the time of booking rather than predetermined. The authors propose a formulation combining an offline vehicle routing model with an online bookings model, and present computational approaches including an anytime algorithm with reinforcement learning and a Markov decision process formulation.},
  why          = {Paratransit services for elderly and disabled passengers require high flexibility in response to real-time requests while maintaining operational efficiency. This work is novel because it bridges the gap between offline and online routing problems with practical constraints on pickup windows. The combination of optimization and learning approaches enables the system to adapt to dynamic demand while respecting the transportation agency's operational requirements.},
  results      = {The proposed methods were evaluated using real-world paratransit data from Chattanooga, showing that the anytime algorithm with learning outperforms baseline approaches. The reinforcement learning approach effectively learns policies that balance responsiveness to immediate requests with long-term efficiency considerations. The experimental results demonstrate significant improvements in cost reduction and robustness when environmental conditions change dynamically.},
  project_tags = {transit, planning, scalable AI, POMDP}
}
Vehicle routing problems (VRPs) can be divided into two major categories: offline VRPs, which consider a given set of trip requests to be served, and online VRPs, which consider requests as they arrive in real-time. Based on discussions with public transit agencies, we identify a real-world problem that is not addressed by existing formulations: booking trips with flexible pickup windows (e.g., 3 hours) in advance (e.g., the day before) and confirming tight pickup windows (e.g., 30 minutes) at the time of booking. Such a service model is often required in paratransit service settings, where passengers typically book trips for the next day over the phone. To address this gap between offline and online problems, we introduce a novel formulation, the offline vehicle routing problem with online bookings. This problem is very challenging computationally since it faces the complexity of considering large sets of requests—similar to offline VRPs—but must abide by strict constraints on running time—similar to online VRPs. To solve this problem, we propose a novel computational approach, which combines an anytime algorithm with a learning-based policy for real-time decisions. Based on a paratransit dataset obtained from our partner transit agency, we demonstrate that our novel formulation and computational approach lead to significantly better outcomes in this service setting than existing algorithms.
@inproceedings{ayman2022neural,
  author       = {Ayman, Afiya and Martinez, Juan and Pugliese, Philip and Dubey, Abhishek and Laszka, Aron},
  title        = {Neural Architecture and Feature Search for Predicting the Ridership of Public Transportation Routes},
  booktitle    = {8th IEEE International Conference on Smart Computing (SMARTCOMP)},
  year         = {2022},
  month        = jun,
  acceptance   = {30},
  contribution = {colab},
  what         = {This paper presents a neural architecture search approach for predicting ridership of public transportation routes, optimizing both the architecture design and feature selection of deep neural networks. The authors use a randomized local search algorithm that minimizes both prediction error and model complexity. The approach is applied to predict maximum occupancy of transit vehicles across different routes and time periods.},
  why          = {Accurately predicting transit ridership is essential for optimizing vehicle deployment and service planning while reducing operational costs and improving passenger experience. This work is significant because it addresses the challenge of automating neural network design for transportation applications without requiring extensive manual tuning by experts. The multi-objective optimization balances prediction accuracy with model efficiency.},
  results      = {Using real-world ridership data from Chattanooga, the neural architecture search approach identified optimized architectures that performed significantly better than baseline generic models. The randomized local search algorithm efficiently discovered architectures with lower complexity while maintaining prediction accuracy. Results demonstrated that task-specific architecture optimization substantially improves forecasting performance compared to standard approaches.},
  keywords     = {neural architecture search, ridership prediction, transit planning, deep learning, time series forecasting},
  project_tags = {transit, ML for CPS, scalable AI}
}
Accurately predicting the ridership of public-transit routes provides substantial benefits to both transit agencies, who can dispatch additional vehicles proactively before the vehicles that serve a route become crowded, and to passengers, who can avoid crowded vehicles based on publicly available predictions. The spread of the coronavirus disease has further elevated the importance of ridership prediction as crowded vehicles now present not only an inconvenience but also a public-health risk. At the same time, accurately predicting ridership has become more challenging due to evolving ridership patterns, which may make all data except for the most recent records stale. One promising approach for improving prediction accuracy is to fine-tune the hyper-parameters of machine-learning models for each transit route based on the characteristics of the particular route, such as the number of records. However, manually designing a machine-learning model for each route is a labor-intensive process, which may require experts to spend a significant amount of their valuable time. To help experts with designing machine-learning models, we propose a neural-architecture and feature search approach, which optimizes the architecture and features of a deep neural network for predicting the ridership of a public-transit route. Our approach is based on a randomized local hyper-parameter search, which minimizes both prediction error as well as the complexity of the model. We evaluate our approach on real-world ridership data provided by the public transit agency of Chattanooga, TN, and we demonstrate that training neural networks whose architectures and features are optimized for each route provides significantly better performance than training neural networks whose architectures and features are generic.
@inbook{Coglio_2022,
author = {Coglio, Alessandro and McCarthy, Eric and Westfold, Stephen and Balasubramanian, Daniel and Dubey, Abhishek and Karsai, Gabor},
pages = {151--167},
publisher = {Open Publishing Association},
title = {Syntheto: A Surface Language for {APT} and {ACL2}},
year = {2022},
month = may,
volume = {359},
contribution = {minor},
doi = {10.4204/eptcs.359.13},
journal = {Electronic Proceedings in Theoretical Computer Science},
preprint = {https://arxiv.org/abs/2205.11706},
url = {https://doi.org/10.4204/eptcs.359.13}
}
Syntheto is a surface language for carrying out formally verified program synthesis by transformational refinement in ACL2 using the APT toolkit. Syntheto aims at providing more familiarity and automation, in order to make this technology more widely usable. Syntheto is a strongly statically typed functional language that includes both executable and non-executable constructs, including facilities to state and prove theorems and facilities to apply proof-generating transformations. Syntheto is integrated into an IDE with a notebook-style, interactive interface that translates Syntheto to ACL2 definitions and APT transformation invocations, and back-translates the prover’s results to Syntheto; the bidirectional translation happens behind the scenes, with the user interacting solely with Syntheto.
@inproceedings{pettet2022designing,
author = {Pettet, G. and Baxter, H. and Vazirizade, S. and Purohit, H. and Ma, M. and Mukhopadhyay, A. and Dubey, A.},
booktitle = {2022 Workshop on Cyber Physical Systems for Emergency Response (CPS-ER)},
title = {Designing Decision Support Systems for Emergency Response: Challenges and Opportunities},
year = {2022},
address = {Los Alamitos, CA, USA},
month = may,
pages = {30--35},
publisher = {IEEE Computer Society},
contribution = {lead},
doi = {10.1109/CPS-ER56134.2022.00012},
keywords = {emergency response, resource allocation, incident detection, incident forecasting, cyber-physical systems},
url = {https://doi.ieeecomputersociety.org/10.1109/CPS-ER56134.2022.00012},
what = {This paper presents a comprehensive framework for designing emergency response management systems, addressing the challenges of coordinating multiple agencies, collecting diverse geospatial data, and providing incident forecasting and resource allocation. The system integrates data curation components, incident detection models, and dynamic resource allocation algorithms while emphasizing how to handle sparse and uncertain incident data across large geographic areas.},
why = {Emergency response systems must operate under conditions of uncertainty and incomplete information while coordinating many heterogeneous agencies and data sources. This work is innovative because it provides a principled approach that integrates multiple technical challenges including data integration, learning under sparsity, and decentralized decision-making. The authors address the fundamental tension between needing accurate predictions and having limited incident data.},
results = {The framework was evaluated on real incident data from Tennessee highways and demonstrated superior performance compared to existing approaches. Key findings show that incident prediction models benefit significantly from geographic and temporal feature engineering, and that resource allocation must adapt to non-stationary incident patterns. The system successfully integrated sparse incident data with weather and traffic information to improve emergency response times.},
project_tags = {emergency, planning, CPS, scalable AI}
}
Designing effective emergency response management (ERM) systems to respond to incidents such as road accidents is a major problem faced by communities. In addition to responding to frequent incidents each day (about 240 million emergency medical services calls and over 5 million road accidents in the US each year), these systems also support response during natural hazards. Recently, there has been a consistent interest in building decision support and optimization tools that can help emergency responders provide more efficient and effective response. This includes a number of principled subsystems that implement early incident detection, incident likelihood forecasting and strategic resource allocation and dispatch policies. In this paper, we highlight the key challenges and provide an overview of the approach developed by our team in collaboration with our community partners.
@inproceedings{jp2022,
  author       = {Islam, Jaminur and Talusan, Jose Paolo and Bhattacharjee, Shameek and Tiausas, Francis and Vazirizade, Sayyed Mohsen and Dubey, Abhishek and Yasumoto, Keiichi and Das, Sajal},
  title        = {Anomaly based Incident Detection in Large Scale Smart Transportation Systems},
  booktitle    = {ACM/IEEE 13th International Conference on Cyber-Physical Systems (ICCPS)},
  year         = {2022},
  month        = apr,
  publisher    = {IEEE},
  note         = {Nominated for Best Paper Award},
  acceptance   = {30},
  contribution = {lead},
  what         = {This paper presents a comprehensive tool-chain for anomaly detection in large-scale smart transportation systems using region growing approximation algorithms. The approach combines data-driven learning with spatial structure exploitation to identify traffic incidents across interconnected road segments while maintaining computational tractability. The framework uses harmonic mean and arithmetic mean metrics to detect deviations in transportation patterns.},
  why          = {Real-time incident detection in large transportation networks is challenging due to complex spatiotemporal dependencies and high data volumes. This work is significant because it proposes a theoretically grounded approach that guarantees invariance properties necessary for robust anomaly detection. The region growing algorithm addresses scalability challenges while maintaining accuracy in detecting true incidents.},
  results      = {The experimental evaluation using real traffic data from Nashville, Tennessee demonstrated that the proposed framework successfully detects incidents in real-time with high accuracy. The method's invariance under benign conditions ensures low false alarm rates while remaining sensitive to true incidents. The region growing approximation achieved computationally tractable solutions for large-scale networks without sacrificing detection performance.},
  keywords     = {anomaly detection, smart transportation, incident detection, graph algorithms, traffic monitoring},
  project_tags = {transit, CPS, ML for CPS}
}
Modern smart cities are focusing on smart transportation solutions to detect and mitigate the effects of various traffic incidents in the city. To materialize this, roadside units and ambient transportation sensors are being deployed to collect vehicular data that provides real-time traffic monitoring. In this paper, we first propose a real-time data-driven anomaly-based traffic incident detection framework for a city-scale smart transportation system. Specifically, we propose an incremental region growing approximation algorithm for optimal Spatio-temporal clustering of road segments and their data; such that road segments are strategically divided into highly correlated clusters. The highly correlated clusters enable identifying a Pythagorean Mean-based invariant as an anomaly detection metric that is highly stable under no incidents but shows a deviation in the presence of incidents. We learn the bounds of the invariants in a robust manner such that anomaly detection can generalize to unseen events, even when learning from real noisy data. We perform extensive experimental validation using mobility data collected from the City of Nashville, Tennessee, and prove that the method can detect incidents within each cluster in real-time.
@article{ramakrishna2022tcps,
author = {Ramakrishna, Shreyas and Rahiminasab, Zahra and Karsai, Gabor and Easwaran, Arvind and Dubey, Abhishek},
journal = {ACM Trans. Cyber-Phys. Syst.},
title = {Efficient Out-of-Distribution Detection Using Latent Space of β-VAE for Cyber-Physical Systems},
year = {2022},
issn = {2378-962X},
month = apr,
number = {2},
volume = {6},
address = {New York, NY, USA},
articleno = {15},
contribution = {lead},
doi = {10.1145/3491243},
issue_date = {April 2022},
keywords = {β-variational autoencoders, out-of-distribution, Cyber-physical systems, mutual information gap, disentanglement, deep neural networks},
numpages = {34},
preprint = {https://arxiv.org/abs/2108.11800},
publisher = {Association for Computing Machinery},
url = {https://doi.org/10.1145/3491243}
}
Deep Neural Networks are actively being used in the design of autonomous Cyber-Physical Systems (CPSs). The advantage of these models is their ability to handle high-dimensional state-space and learn compact surrogate representations of the operational state spaces. However, the problem is that the sampled observations used for training the model may never cover the entire state space of the physical environment, and as a result, the system will likely operate in conditions that do not belong to the training distribution. These conditions that do not belong to training distribution are referred to as Out-of-Distribution (OOD). Detecting OOD conditions at runtime is critical for the safety of CPS. In addition, it is also desirable to identify the context or the feature(s) that are the source of OOD to select an appropriate control action to mitigate the consequences that may arise because of the OOD condition. In this article, we study this problem as a multi-labeled time series OOD detection problem over images, where the OOD is defined both sequentially across short time windows (change points) as well as across the training data distribution. A common approach to solving this problem is the use of multi-chained one-class classifiers. However, this approach is expensive for CPSs that have limited computational resources and require short inference times. Our contribution is an approach to design and train a single β-Variational Autoencoder detector with a partially disentangled latent space sensitive to variations in image features. We use the feature sensitive latent variables in the latent space to detect OOD images and identify the most likely feature(s) responsible for the OOD. We demonstrate our approach using an Autonomous Vehicle in the CARLA simulator and a real-world automotive dataset called nuImages.
@inproceedings{wilbur2022,
author = {Wilbur, Michael and Kadir, Salah and Kim, Youngseo and Pettet, Geoffrey and Mukhopadhyay, Ayan and Pugliese, Philip and Samaranayake, Samitha and Laszka, Aron and Dubey, Abhishek},
booktitle = {ACM/IEEE 13th International Conference on Cyber-Physical Systems (ICCPS)},
title = {An Online Approach to Solve the Dynamic Vehicle Routing Problem with Stochastic Trip Requests for Paratransit Services},
year = {2022},
month = apr,
acceptance = {28},
publisher = {IEEE},
contribution = {lead},
what = {This work addresses the dynamic vehicle routing problem (DVRP) with time windows and stochastic trip requests for paratransit services. The problem is formulated as a Markov decision process, and Monte Carlo tree search is used to compute near-optimal actions for any given state. To tackle the intractably large action space, the approach leverages problem structure to design heuristics that sample promising actions for the tree search.},
why = {Paratransit requests are temporally sparse and therefore difficult to batch, and the environment in which transit agencies operate changes dynamically, causing estimates learned offline to become stale. This work is innovative because it proposes a fully online approach that is robust to changing environmental dynamics by construction, avoiding myopic decisions without relying on offline learning.},
results = {Experiments using real-world data from the partner transit agency showed that the proposed online approach outperforms existing state-of-the-art approaches in terms of both performance and robustness. The Monte Carlo tree search approach successfully adapts to changing environmental conditions and newly arriving requests.},
keywords = {vehicle routing, paratransit, stochastic optimization, Monte Carlo tree search},
project_tags = {transit, planning, POMDP, scalable AI}
}
Many transit agencies operating paratransit and microtransit services have to respond to trip requests that arrive in real-time, which entails solving hard combinatorial and sequential decision-making problems under uncertainty. To avoid decisions that lead to significant inefficiency in the long term, vehicles should be allocated to requests by optimizing a non-myopic utility function or by batching requests together and optimizing a myopic utility function. While the former approach is typically offline, the latter can be performed online. We point out two major issues with such approaches when applied to paratransit services in practice. First, it is difficult to batch paratransit requests together as they are temporally sparse. Second, the environment in which transit agencies operate changes dynamically (e.g., traffic conditions can change over time), causing the estimates that are learned offline to become stale. To address these challenges, we propose a fully online approach to solve the dynamic vehicle routing problem (DVRP) with time windows and stochastic trip requests that is robust to changing environmental dynamics by construction. We focus on scenarios where requests are relatively sparse—our problem is motivated by applications to paratransit services. We formulate DVRP as a Markov decision process and use Monte Carlo tree search to compute near-optimal actions for any given state. Accounting for stochastic requests while optimizing a non-myopic utility function is computationally challenging; indeed, the action space for such a problem is intractably large in practice. To tackle the large action space, we leverage the structure of the problem to design heuristics that can sample promising actions for the tree search. Our experiments using real-world data from our partner agency show that the proposed approach outperforms existing state-of-the-art approaches both in terms of performance and robustness.
@inproceedings{ICAA2022,
author = {Ramakrishna, Shreyas and Luo, Baiting and Barve, Yogesh and Karsai, Gabor and Dubey, Abhishek},
booktitle = {2022 IEEE International Conference on Assured Autonomy (ICAA) (ICAA'22)},
title = {{Risk-Aware} Scene Sampling for Dynamic Assurance of Autonomous Systems},
year = {2022},
address = {virtual, Puerto Rico},
month = mar,
contribution = {lead},
days = {22},
keywords = {scene generation, dynamic assurance, autonomous systems, Bayesian optimization, out-of-distribution detection, CARLA simulator},
tag = {ai4cps},
what = {This paper presents a scene generation workflow with two samplers, Random Neighborhood Search (RNS) and Guided Bayesian Optimization (GBO), for generating synthetic data of high-risk scenes from simulators. The generated scenes support the design of runtime safety components such as out-of-distribution detectors and risk estimators for autonomous cyber-physical systems. The samplers extend conventional random search and Bayesian optimization to use feedback from previous results, handle constraints on sampled variables, and balance exploration against exploitation.},
why = {Dynamic assurance of autonomous systems requires labeled data from failure conditions and risky corner cases, but collecting such real-world data can be expensive and sometimes impossible. This work is significant because commonly used samplers such as random search and grid search are passive, ignore constraints among sampled variables, and do not balance the exploration-exploitation tradeoff, which limits coverage of the search space.},
results = {The approach was demonstrated using an autonomous vehicle case study in the CARLA simulator. The RNS and GBO samplers were evaluated against baselines of random search, grid search, and Halton sequence search for generating high-risk scenes that fail the system.},
project_tags = {CPS, ML for CPS}
}
Autonomous Cyber-Physical Systems must often operate under uncertainties like sensor degradation and distribution shifts in the operating environment, thus increasing operational risk. Dynamic Assurance of these systems requires augmenting runtime safety components like out-of-distribution detectors and risk estimators. Designing these safety components requires labeled data from failure conditions and risky corner cases that fail the system. However, collecting real-world data of these high-risk scenes can be expensive and sometimes not possible. To address this, there are several scenario description languages with sampling capability for generating synthetic data from simulators to replicate the scenes that are not possible in the real world. Most often, simple search-based techniques like random search and grid search are used as samplers. But we point out three limitations in using these techniques. First, they are passive samplers, which do not use the feedback of previous results in the sampling process. Second, the variables to be sampled may have constraints that need to be applied. Third, they do not balance the tradeoff between exploration and exploitation, which we hypothesize is needed for better coverage of the search space. We present a scene generation workflow with two samplers called Random Neighborhood Search (RNS) and Guided Bayesian Optimization (GBO). These samplers extend conventional random search and Bayesian Optimization search to address the three limitations identified above. We demonstrate our approach using an Autonomous Vehicle case study in CARLA simulation. To evaluate our samplers, we compared them against the baselines of random search, grid search, and Halton sequence search.
@inproceedings{kang2022generative,
author = {Kang, Zhuangwei and Mukhopadhyay, Ayan and Gokhale, Aniruddha and Wen, Shijie and Dubey, Abhishek},
booktitle = {2022 IEEE 25th International Conference on Intelligent Transportation Systems (ITSC)},
title = {Traffic Anomaly Detection Via Conditional Normalizing Flow},
year = {2022},
pages = {2563-2570},
contribution = {lead},
doi = {10.1109/ITSC55140.2022.9922061},
what = {This paper proposes a generative anomaly detection framework for multivariate time series from citywide roadways. The approach first clusters road segments in the feature space, then uses conditional normalizing flow to identify anomalous temporal snapshots at the cluster level in an unsupervised setting, and finally localizes anomalies at the segment level with a kernel density estimator on the anomalous cluster.},
why = {Detecting congestion anomalies in high-dimensional time series data from transportation systems is challenging because existing approaches are either not scalable or unable to capture spatial-temporal information simultaneously. This work is innovative because it uses normalizing flows to perform tractable density estimation over multivariate time series, providing a principled, data-driven way to both detect anomalies and diagnose the affected road segments.},
results = {Extensive experiments on synthetic datasets showed that the approach significantly outperforms several state-of-the-art congestion anomaly detection and diagnosis methods in terms of Recall and F1-Score. The generative model was also used to sample labeled data for training classifiers in a supervised setting, alleviating the lack of labeled data for anomaly detection in sparse settings.},
keywords = {anomaly detection, generative models, normalizing flows, time series, traffic networks},
project_tags = {transit, ML for CPS, scalable AI}
}
Traffic congestion anomaly detection is of paramount importance in intelligent traffic systems. The goals of transportation agencies are two-fold: to monitor the general traffic conditions in the area of interest and to locate road segments under abnormal congestion states. Modeling congestion patterns can achieve these goals for citywide roadways, which amounts to learning the distribution of multivariate time series (MTS). However, existing works are either not scalable or unable to capture the spatial-temporal information in MTS simultaneously. To this end, we propose a principled and comprehensive framework consisting of a data-driven generative approach that can perform tractable density estimation for detecting traffic anomalies. Our approach first clusters segments in the feature space and then uses conditional normalizing flow to identify anomalous temporal snapshots at the cluster level in an unsupervised setting. Then, we identify anomalies at the segment level by using a kernel density estimator on the anomalous cluster. Extensive experiments on synthetic datasets show that our approach significantly outperforms several state-of-the-art congestion anomaly detection and diagnosis methods in terms of Recall and F1-Score. We also use the generative model to sample labeled data, which can train classifiers in a supervised setting, alleviating the lack of labeled data for anomaly detection in sparse settings.
@techreport{karsai2022model,
author = {Karsai, Gabor and Coglio, Alessandro and Dubey, Abhishek},
institution = {Vanderbilt University},
title = {Model-Based Intent-Driven Adaptive Software (MIDAS)},
year = {2022},
what = {This work presents Model-Based Intent-Driven Adaptive Software (MIDAS), a comprehensive technology for rapid configuration and adaptation of service-based system software. The framework uses domain-specific modeling languages for program synthesis and a specification tool for deferred concretization of software system designs. The approach enables automatic generation of software implementations from high-level specifications.},
why = {The increasing complexity of software systems and rapid requirement changes necessitate flexible development approaches that can quickly propagate changes throughout system designs. This work is significant because it provides technologies for intent-driven development that separates specification from implementation, enabling rapid adaptation to changing requirements. The domain-specific language and synthesis approaches automate much of the configuration process.},
results = {The framework was evaluated through implementation of service-based systems and demonstrated successful generation of software implementations from specifications. The approach reduced development time and enabled more rapid response to requirement changes. The flexibility of the domain-specific language allowed expression of complex system behaviors while maintaining clear separation between intent and implementation.},
keywords = {model-based software engineering, domain-specific languages, program synthesis, service-based systems},
project_tags = {CPS, middleware}
}
The increasing complexity of software systems makes the rapid propagation of requirement changes into the design and implementation code very problematic. The goal of the Intent-Driven Adaptive Software program was to develop technologies that assist developers in making changes to requirements and automatically propagating those changes to the design and implementation of software systems. The Model-based Intent-Driven Adaptive software project developed a vision for a comprehensive technology to achieve this goal by developing and implementing two components of that vision: a program specification and synthesis tool, and a domain-specific language and generators for the rapid configuration and adaptation of service-based architectures. These two results can serve as a foundation for the future implementation of the vision.
@article{mukhopadhyay2021review,
author = {Mukhopadhyay, Ayan and Pettet, Geoffrey and Vazirizade, Sayyed Mohsen and Lu, Di and Jaimes, Alejandro and Said, Said El and Baroud, Hiba and Vorobeychik, Yevgeniy and Kochenderfer, Mykel and Dubey, Abhishek},
journal = {Accident Analysis & Prevention},
title = {A Review of Incident Prediction, Resource Allocation, and Dispatch Models for Emergency Management},
year = {2022},
issn = {0001-4575},
pages = {106501},
volume = {165},
contribution = {lead},
doi = {10.1016/j.aap.2021.106501},
keywords = {Resource allocation for smart cities, Incident prediction, Computer aided dispatch, Decision making under uncertainty, Accident analysis, Emergency response},
preprint = {https://arxiv.org/abs/2006.04200},
url = {https://www.sciencedirect.com/science/article/pii/S0001457521005327}
}
In the last fifty years, researchers have developed statistical, data-driven, analytical, and algorithmic approaches for designing and improving emergency response management (ERM) systems. The problem has been noted as inherently difficult and constitutes spatio-temporal decision making under uncertainty, which has been addressed in the literature with varying assumptions and approaches. This survey provides a detailed review of these approaches, focusing on the key challenges and issues regarding four sub-processes: (a) incident prediction, (b) incident detection, (c) resource allocation, and (d) computer-aided dispatch for emergency response. We highlight the strengths and weaknesses of prior work in this domain and explore the similarities and differences between different modeling paradigms. We conclude by illustrating open challenges and opportunities for future research in this complex domain.
@misc{pettet2022decision,
  author        = {Pettet, Geoffrey and Mukhopadhyay, Ayan and Dubey, Abhishek},
  title         = {Decision Making in Non-Stationary Environments with Policy-Augmented Monte Carlo Tree Search},
  year          = {2022},
  eprint        = {2202.13003},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AI},
  preprint      = {https://arxiv.org/abs/2202.13003},
  contribution  = {lead}
}
Decision-making under uncertainty (DMU) is present in many important problems. An open challenge is DMU in non-stationary environments, where the dynamics of the environment can change over time. Reinforcement Learning (RL), a popular approach for DMU problems, learns a policy by interacting with a model of the environment offline. Unfortunately, if the environment changes the policy can become stale and take sub-optimal actions, and relearning the policy for the updated environment takes time and computational effort. An alternative is online planning approaches such as Monte Carlo Tree Search (MCTS), which perform their computation at decision time. Given the current environment, MCTS plans using high-fidelity models to determine promising action trajectories. These models can be updated as soon as environmental changes are detected to immediately incorporate them into decision making. However, MCTS’s convergence can be slow for domains with large state-action spaces. In this paper, we present a novel hybrid decision-making approach that combines the strengths of RL and planning while mitigating their weaknesses. Our approach, called Policy Augmented MCTS (PA-MCTS), integrates a policy’s action-value estimates into MCTS, using the estimates to seed the action trajectories favored by the search. We hypothesize that PA-MCTS will converge more quickly than standard MCTS while making better decisions than the policy can make on its own when faced with non-stationary environments. We test our hypothesis by comparing PA-MCTS with pure MCTS and an RL agent applied to the classical CartPole environment. We find that PA-MCTS can achieve higher cumulative rewards than the policy in isolation under several environmental shifts while converging in significantly fewer iterations than pure MCTS.
@article{POTTEIGER2022102420,
author = {Potteiger, Bradley and Dubey, Abhishek and Cai, Feiyang and Koutsoukos, Xenofon and Zhang, Zhenkai},
journal = {Journal of Systems Architecture},
title = {Moving target defense for the security and resilience of mixed time and event triggered cyber-physical systems},
year = {2022},
issn = {1383-7621},
pages = {102420},
contribution = {colab},
doi = {10.1016/j.sysarc.2022.102420},
keywords = {Moving target defense, Time triggered, Event triggered, Cyber-physical systems},
url = {https://www.sciencedirect.com/science/article/pii/S1383762122000212}
}
Memory corruption attacks such as code injection, code reuse, and non-control data attacks have become widely popular for compromising safety-critical Cyber-Physical Systems (CPS). Moving target defense (MTD) techniques such as instruction set randomization (ISR), address space randomization (ASR), and data space randomization (DSR) can be used to protect systems against such attacks. CPS often use time-triggered architectures to guarantee predictable and reliable operation. MTD techniques can cause time delays with unpredictable behavior. To protect CPS against memory corruption attacks, MTD techniques can be implemented in a mixed time and event-triggered architecture that provides capabilities for maintaining safety and availability during an attack. This paper presents a mixed time and event-triggered MTD security approach based on the ARINC 653 architecture that provides predictable and reliable operation during normal operation and rapid detection and reconfiguration upon detection of attacks. We leverage a hardware-in-the-loop testbed and an advanced emergency braking system (AEBS) case study to show the effectiveness of our approach.
@inproceedings{ramakrishna2022anticarla,
author = {Ramakrishna, Shreyas and Luo, Baiting and Kuhn, Christopher B. and Karsai, Gabor and Dubey, Abhishek},
booktitle = {2022 IEEE 25th International Conference on Intelligent Transportation Systems (ITSC)},
title = {ANTI-CARLA: An Adversarial Testing Framework for Autonomous Vehicles in CARLA},
year = {2022},
pages = {2620--2627},
contribution = {lead},
doi = {10.1109/ITSC55140.2022.9921776},
what = {This paper presents ANTI-CARLA, a framework for adversarial testing of autonomous vehicles using the CARLA simulator. The system combines test case description languages, scenario generators, and samplers to automatically generate and evaluate test cases that cause system failures. The approach uses domain-specific modeling languages for specifying testing scenarios and integrates with the CARLA autonomous driving pipeline.},
why = {Testing autonomous vehicles comprehensively before deployment to real-world is essential for safety assurance, but manual test case generation is extremely time-consuming and expensive. This work is innovative because it provides automated mechanisms for generating adversarial test cases that expose weaknesses in autonomous driving systems. The combination of domain-specific languages with intelligent samplers enables effective exploration of testing scenarios.},
results = {The framework was evaluated on the CARLA benchmark and demonstrated effectiveness in generating diverse test cases that fail the tested system. The Learning-Based Control approach achieved 100% accuracy on the CARLA benchmark despite adversarial testing. The system successfully identified failure modes and provided insights for improving autonomous driving controllers.},
keywords = {autonomous vehicles, adversarial testing, test case generation, scenario description, CARLA simulator},
project_tags = {CPS, ML for CPS, Explainable AI}
}
Despite recent advances in autonomous driving systems, accidents such as the fatal Uber crash in 2018 show these systems are still susceptible to edge cases. Such systems need to be thoroughly tested and validated before being deployed in the real world to avoid such events. Testing in open-world scenarios can be difficult, time-consuming, and expensive. These challenges can be addressed by using driving simulators such as CARLA instead. A key part of such tests is adversarial testing, in which the goal is to find scenarios that lead to failures of the given system. While several independent efforts in adversarial testing have been made, a well-established testing framework that enables adaptive stress testing has yet to be made available for CARLA. We therefore propose ANTI-CARLA, an adversarial testing framework in CARLA. The operating conditions in which a given system should be tested are specified in a scenario description language. The framework offers an adversarial search mechanism that searches for operating conditions that will fail the tested system. In this way, ANTI-CARLA extends the CARLA simulator with the capability of performing adversarial testing on any given driving pipeline. We use ANTI-CARLA to test the driving pipeline trained with Learning By Cheating (LBC) approach. The simulation results demonstrate that ANTI-CARLA can effectively and automatically find a range of failure cases despite LBC reaching an accuracy of 100% in the CARLA benchmark.
@inproceedings{ramakrishna2022assurance,
title        = {Automating Pattern Selection for Assurance Case Development for Cyber-Physical Systems},
author       = {Ramakrishna, Shreyas and Jin, Hyunjee and Dubey, Abhishek and Ramamurthy, Arun},
editor       = {Trapp, Mario and Saglietti, Francesca and Spisl{\"a}nder, Marc and Bitsch, Friedemann},
booktitle    = {Computer Safety, Reliability, and Security},
publisher    = {Springer International Publishing},
address      = {Cham},
pages        = {82--96},
year         = {2022},
isbn         = {978-3-031-14835-4},
contribution = {minor},
what         = {This paper presents an automated pattern selection workflow for assurance case development in cyber-physical systems. The framework handles the pattern selection problem as a coverage problem using graph analytics and ontology graphs of system artifacts. The approach automates the selection of assurance case patterns and provides mechanisms for their instantiation to develop complete assurance cases for complex systems.},
why          = {Developing assurance cases for complex cyber-physical systems like autonomous vehicles requires extensive manual effort and expertise in pattern selection and instantiation. This work is significant because it automates the pattern selection process while maintaining traceability to system artifacts. The optimization-based approach finds minimal sets of patterns that provide necessary coverage for assurance arguments.},
results      = {The automated pattern selection workflow was demonstrated on an autonomous vehicle example and showed significant reduction in manual effort compared to traditional approaches. The approach successfully identified required patterns and provided mechanisms for instantiating them with system-specific information. The workflow proved effective for organizing complex assurance arguments while maintaining formal rigor.},
keywords     = {assurance cases, cyber-physical systems, pattern selection, automation, autonomous vehicles},
project_tags = {CPS, planning, Explainable AI}
}
Assurance Cases are increasingly being required for regulatory acceptance of Cyber-Physical Systems. However, the ever-increasing complexity of these systems has made assurance case development complex, labor-intensive, and time-consuming. Assurance case fragments called patterns are used to handle the complexity. The state-of-the-art approach has been to manually select generic patterns from online catalogs, instantiate them with system-specific information, and assemble them into an assurance case. While there has been some work in automating the instantiation and assembly, a less researched area is the automation of the pattern selection process, which takes a considerable amount of the assurance case development time. To close this automation gap, we have developed an automated pattern selection workflow that handles the selection problem as a coverage problem, intending to find the smallest set of patterns that can cover the available system artifacts. For this, we utilize the ontology graphs of the system artifacts and the patterns and perform graph analytics. The selected patterns are fed into an external instantiation function to develop an assurance case. Then, they are evaluated for coverage using two coverage metrics. An illustrative autonomous vehicle example is provided, demonstrating the utility of the proposed workflow in developing an assurance case with reduced effort and time compared to the manual development alternative.
@inproceedings{rishav2022eEnergy,
author = {Sen, Rishav and Bharati, Alok Kumar and Khaleghian, Seyedmehdi and Ghosal, Malini and Wilbur, Michael and Tran, Toan and Pugliese, Philip and Sartipi, Mina and Neema, Himanshu and Dubey, Abhishek},
booktitle = {Proceedings of the Thirteenth ACM International Conference on Future Energy Systems},
title = {E-Transit-Bench: Simulation Platform for Analyzing Electric Public Transit Bus Fleet Operations},
year = {2022},
address = {New York, NY, USA},
pages = {532--541},
publisher = {Association for Computing Machinery},
series = {e-Energy '22},
contribution = {lead},
doi = {10.1145/3538637.3539586},
isbn = {9781450393973},
keywords = {electric vehicles, power grid, transit simulation, charging optimization, grid impact analysis},
location = {Virtual Event},
numpages = {10},
url = {https://doi.org/10.1145/3538637.3539586},
what = {This paper presents E-TRANSIT-BENCH, a simulation platform for analyzing the impact of electric vehicle charging on power grid operations in public transit systems. The framework integrates transit simulation with power grid modeling using SUMO and GridLAB-D co-simulation. The system enables detailed analysis of how bus electrification affects grid stability, load distribution, and charging infrastructure requirements.},
why = {As public transit agencies transition to electric buses, understanding the impact on power infrastructure is critical for cost-effective planning and grid stability. This work is significant because it provides an integrated modeling framework that captures complex interactions between transit operations and grid dynamics. The co-simulation approach enables analysis of various electrification strategies and their grid impacts.},
results = {The simulation platform demonstrated how different charging strategies significantly affect grid load profiles and voltage stability. The analysis identified optimal charging station locations that minimize grid stress while supporting transit schedules. The framework enabled evaluation of various battery capacities, charging rates, and scheduling strategies to inform infrastructure planning decisions.},
project_tags = {energy, transit, CPS}
}
When electrified transit systems make grid-aware choices, improved social welfare is achieved by reducing grid stress, reducing system loss, and minimizing power quality issues. Electrifying a transit fleet has numerous challenges, such as the non-availability of buses during charging, varying charging costs, and so on, that are related to the electric grid's behavior. However, transit systems do not have access to the information about the co-evolution of the grid’s power flow and therefore cannot account for the power grid’s needs in its day-to-day operation. In this paper we propose a framework of transportation-grid co-simulation, analyzing the spatio-temporal interaction between the transit operations with electric buses and the power distribution grid. Real-world data for a day’s traffic from Chattanooga city’s transit system is simulated in SUMO and integrated with a realistic distribution grid simulation (using GridLAB-D) to understand the grid impact due to transit electrification. Charging information is obtained from the transportation simulation to feed into the grid simulation to assess the impact of charging. We also discuss the impact to the grid with a higher degree of transit electrification that further necessitates such an integrated transportation-grid co-simulation to operate the integrated system optimally. Our future work includes extending the platform for optimizing the charging and trip assignment operations.
@inproceedings{sun2021transitgym,
author = {Sun, Ruixiao and Gui, Rongze and Neema, Himanshu and Chen, Yuche and Ugirumurera, Juliette and Severino, Joseph and Pugliese, Philip and Laszka, Aron and Dubey, Abhishek},
booktitle = {2021 IEEE International Conference on Smart Computing (SMARTCOMP)},
title = {TRANSIT-GYM: A Simulation and Evaluation Engine for Analysis of Bus Transit Systems},
year = {2021},
month = aug,
pages = {69--76},
contribution = {colab},
acceptance = {31.7},
doi = {10.1109/SMARTCOMP52413.2021.00030},
issn = {2693-8340},
keywords = {Training;Analytical models;Uncertainty;Computational modeling;Microscopy;Vehicle routing;Urban areas;Transit simulation;domain-specific modeling language;traffic simulation;micro-simulation;regional transportation system;transportation planning;data-driven optimization},
tag = {transit}
}
Public-transit systems face a number of operational challenges: (a) changing ridership patterns requiring optimization of fixed line services, (b) optimizing vehicle-to-trip assignments to reduce maintenance and operation costs, and (c) ensuring equitable and fair coverage to areas with low ridership. Optimizing these objectives presents a hard computational problem due to the size and complexity of the decision space. State-of-the-art methods formulate these problems as variants of the vehicle routing problem and use data-driven heuristics for optimizing the procedures. However, the evaluation and training of these algorithms require large datasets that provide realistic coverage of various operational uncertainties. This paper presents a dynamic simulation platform, called TRANSIT-GYM, that can bridge this gap by providing the ability to simulate scenarios, focusing on variation of demand models, variations of route networks, and variations of vehicle-to-trip assignments. The central contribution of this work is a domain-specific language and associated experimentation tool-chain and infrastructure to enable subject-matter experts to intuitively specify, simulate, and analyze large-scale transit scenarios and their parametric variations. Of particular significance is an integrated microscopic energy consumption model that also helps to analyze the energy cost of various transit decisions made by the transportation agency of a city.
@inproceedings{vazirizade2021learning,
author = {Vazirizade, Sayyed Mohsen and Mukhopadhyay, Ayan and Pettet, Geoffrey and El Said, Said and Baroud, Hiba and Dubey, Abhishek},
booktitle = {2021 IEEE International Conference on Smart Computing (SMARTCOMP)},
title = {Learning Incident Prediction Models Over Large Geographical Areas for Emergency Response},
year = {2021},
month = aug,
pages = {424--429},
acceptance = {31.7},
contribution = {lead},
doi = {10.1109/SMARTCOMP52413.2021.00091},
issn = {2693-8340},
keywords = {Road accidents;Pipelines;Collaboration;Weather forecasting;Predictive models;Emergency services;Resource management;Spatial Temporal Incident Prediction;Emergency Response Management;Resource Allocation;Statistical Modeling},
tag = {ai4cps,incident}
}
Emergency Response Management (ERM) necessitates the use of models capable of predicting the spatial-temporal likelihood of incident occurrence. These models are used for proactive stationing in order to reduce overall response time. Traditional methods simply aggregate past incidents over space and time; such approaches fail to make useful short-term predictions when the spatial region is large and focused on fine-grained spatial entities like interstate highway networks. This is partially due to the sparsity of incidents with respect to space and time. Further, accidents are affected by several covariates. Collecting, cleaning, and managing multiple streams of data from various sources is challenging for large spatial areas. In this paper, we highlight how this problem is being solved in collaboration with the Tennessee Department of Transportation (TDOT) to improve ERM in the state of Tennessee. Our pipeline, based on a combination of synthetic resampling, clustering, and data mining techniques, can efficiently forecast the spatio-temporal dynamics of accident occurrence, even under sparse conditions. Our pipeline uses data related to roadway geometry, weather, historical accidents, and traffic to aid accident forecasting. To understand how our forecasting model can affect allocation and dispatch, we improve and employ a classical resource allocation approach. Experimental results show that our approach can noticeably reduce response times and the number of unattended incidents in comparison to current approaches followed by first responders. The developed pipeline is efficacious, applicable in practice, and open-source.
@article{eisele2020Safe,
title        = {Safe and Private Forward-Trading Platform for Transactive Microgrids},
author       = {Eisele, Scott and Eghtesad, Taha and Campanelli, Keegan and Agrawal, Prakhar and Laszka, Aron and Dubey, Abhishek},
journal      = {ACM Trans. Cyber-Phys. Syst.},
volume       = {5},
number       = {1},
articleno    = {8},
numpages     = {29},
month        = jan,
year         = {2021},
issue_date   = {January 2021},
issn         = {2378-962X},
doi          = {10.1145/3403711},
url          = {https://doi.org/10.1145/3403711},
publisher    = {Association for Computing Machinery},
address      = {New York, NY, USA},
keywords     = {blockchain, distributed energy, transactive energy, privacy, smart contracts, microgrids},
contribution = {lead},
tag          = {decentralization, power},
what         = {This paper introduces TRANSAX, a blockchain-based decentralized energy trading platform for transactive microgrids that simultaneously addresses efficiency, safety, and privacy requirements. The system uses smart contracts for transparent trading while employing cryptographic mixing protocols for anonymization. The architecture leverages distributed ledgers and resilient information architecture to enable peer-to-peer energy trading.},
why          = {Decentralized energy trading in microgrids offers significant efficiency benefits but must simultaneously satisfy conflicting requirements for efficiency, safety, and privacy. This work is innovative because it demonstrates how blockchain and cryptographic techniques can be combined to achieve all three properties without requiring a centralized authority. The approach enables local energy trading while preventing exploitation of trading activity.},
results      = {The TRANSAX platform was evaluated through testbed experiments and demonstrated feasibility of private blockchain-based energy trading. The system successfully maintained energy safety constraints while preserving privacy of trading activities. The hybrid solver architecture proved effective for solving complex energy allocation problems while distributing computation across resilient ledger nodes.},
project_tags = {energy, middleware, CPS}
}
Transactive microgrids have emerged as a transformative solution for the problems faced by distribution system operators due to an increase in the use of distributed energy resources and rapid growth in renewable energy generation. Transactive microgrids are tightly coupled cyber and physical systems, which require resilient and robust financial markets where transactions can be submitted and cleared, while ensuring that erroneous or malicious transactions cannot destabilize the grid. In this paper, we introduce TRANSAX, a novel decentralized platform for transactive microgrids. TRANSAX enables participants to trade in an energy futures market, which improves efficiency by finding feasible matches for energy trades, reducing the load on the distribution system operator. TRANSAX provides privacy to participants by anonymizing their trading activity using a distributed mixing service, while also enforcing constraints that limit trading activity based on safety requirements, such as keeping power flow below line capacity. We show that TRANSAX can satisfy the seemingly conflicting requirements of efficiency, safety, and privacy, and we demonstrate its performance using simulation results.
@inproceedings{aaai21,
author = {Sivagnanam, Amutheezan and Ayman, Afiya and Wilbur, Michael and Pugliese, Philip and Dubey, Abhishek and Laszka, Aron},
booktitle = {Proceedings of the 35th AAAI Conference on Artificial Intelligence (AAAI-21)},
title = {Minimizing Energy Use of Mixed-Fleet Public Transit for Fixed-Route Service},
year = {2021},
acceptance = {21.4},
contribution = {colab},
tag = {ai4cps,transit},
what = {This paper presents a Markov decision process formulation of dynamic resource allocation for mixed-fleet public transit systems with electric and hybrid vehicles. The framework optimizes vehicle assignments to routes and charging schedules to minimize energy consumption and emissions while meeting service requirements. The approach addresses the complexity of managing heterogeneous vehicle types with different energy characteristics.},
why = {Public transit agencies operating mixed fleets of electric and internal combustion vehicles face complex decisions about vehicle-to-route assignment that significantly impact operational costs and environmental impact. This work is significant because it provides a principled optimization framework that considers the heterogeneous energy consumption of different vehicle types. The approach enables data-driven decisions about fleet electrification strategies.},
results = {The optimization framework was evaluated on real transit data from Chattanooga and demonstrated substantial energy savings compared to baseline approaches. The greedy algorithm and simulated annealing heuristics successfully solved large-scale instances while achieving near-optimal solutions. The analysis revealed that optimal vehicle assignment and charging scheduling can save up to \$145,635 in annual energy costs and reduce CO2 emissions by up to 576.7 metric tons annually.},
keywords = {transit optimization, electric vehicles, energy optimization, mixed fleet, operational planning},
project_tags = {energy, transit, planning, scalable AI}
}
Affordable public transit services are crucial for communities since they enable residents to access employment, education, and other services. Unfortunately, transit services that provide wide coverage tend to suffer from relatively low utilization, which results in high fuel usage per passenger per mile, leading to high operating costs and environmental impact. Electric vehicles (EVs) can reduce energy costs and environmental impact, but most public transit agencies have to employ them in combination with conventional, internal-combustion engine vehicles due to the high upfront costs of EVs. To make the best use of such a mixed fleet of vehicles, transit agencies need to optimize route assignments and charging schedules, which presents a challenging problem for large transit networks. We introduce a novel problem formulation to minimize fuel and electricity use by assigning vehicles to transit trips and scheduling them for charging, while serving an existing fixed-route transit schedule. We present an integer program for optimal assignment and scheduling, and we propose polynomial-time heuristic and meta-heuristic algorithms for larger networks. We evaluate our algorithms on the public transit service of Chattanooga, TN using operational data collected from transit vehicles. Our results show that the proposed algorithms are scalable and can reduce energy use and, hence, environmental impact and operational costs. For Chattanooga, the proposed algorithms can save $145,635 in energy costs and 576.7 metric tons of CO2 emission annually.
@inproceedings{ajay2021powerattack,
title        = {Power-Attack: A comprehensive tool-chain for modeling and simulating attacks in power systems},
author       = {Chhokra, Ajay and Barreto, Carlos and Dubey, Abhishek and Karsai, Gabor and Koutsoukos, Xenofon},
booktitle    = {9th Workshop on Modeling and Simulation of Cyber-Physical Energy Systems, MSCPES@CPSIoTWeek},
year         = {2021},
category     = {workshop},
keywords     = {power systems, cyber security, attack simulation, domain-specific language, protection systems},
project      = {cps-reliability},
tag          = {platform,power},
contribution = {colab},
what         = {This paper presents Power-Attack, a comprehensive tool-chain for modeling and simulating cyber attacks in power systems. The system includes a domain-specific language for defining attack scenarios and a scalable simulation engine that models both physical layer dynamics and protection system components. The framework enables evaluation of protection relay responses and overall system behavior under various attack conditions.},
why          = {As power systems become increasingly vulnerable to cyber attacks, developing tools for attack simulation and mitigation analysis is critical for system security. This work is innovative because it provides an integrated tool-chain combining high-level domain-specific languages with detailed physical layer simulation. The approach enables security analysts to quickly specify complex attack scenarios and evaluate system responses.},
results      = {The Power-Attack framework was evaluated on IEEE 39 bus systems and demonstrated effectiveness in simulating various cyber attack scenarios. The tool-chain successfully modeled both device attacks and data attacks on protection systems. The simulation results demonstrated how different attacks affect frequency stability and identified specific vulnerabilities in protection system configurations.},
project_tags = {CPS, emergency, Explainable AI}
}
Due to the increased deployment of novel communication, control and protection functions, the grid has become vulnerable to a variety of attacks. Designing robust machine-learning-based attack detection and mitigation algorithms requires large amounts of data that rely heavily on a representative environment, where different attacks can be simulated. This paper presents a comprehensive tool-chain for modeling and simulating attacks in power systems. The paper makes the following contributions: first, we present a probabilistic domain-specific language to define multiple attack scenarios and simulation configuration parameters. Secondly, we extend the PyPower-dynamics simulator with protection system components to simulate cyber attacks in the control and protection layers of the power system. In the end, we demonstrate multiple attack scenarios with a case study based on the IEEE 39 bus system.
@article{BASAK2021101283,
author = {Basak, Sanchita and Sengupta, Saptarshi and Wen, Shi-Jie and Dubey, Abhishek},
journal = {Pervasive and Mobile Computing},
title = {Spatio-temporal AI inference engine for estimating hard disk reliability},
year = {2021},
issn = {1574-1192},
pages = {101283},
volume = {70},
contribution = {lead},
doi = {10.1016/j.pmcj.2020.101283},
keywords = {Remaining useful life, Long short term memory, Prognostics, Predictive health maintenance, Hierarchical clustering},
tag = {ai4cps, platform},
url = {http://www.sciencedirect.com/science/article/pii/S1574119220301231}
}
This paper focuses on building a spatio-temporal AI inference engine for estimating hard disk reliability. Most electronic systems such as hard disks routinely collect such reliability parameters in the field to monitor the health of the system. Changes in parameters as a function of time are monitored and any observed changes are compared with the known failure signatures. If the trajectory of the measured data matches that of a failure signature, operators are alerted to take corrective action. However, the interest of the operators lies in being able to identify the failures before they occur. The state-of-the-art methodology, including our prior work, is to train machine learning models on temporal sequence data capturing the variations across multiple features and using it to predict the remaining useful life of the devices. However, as we show in this paper, temporal prediction capability alone is not sufficient and can lead to low precision, and the uncertainty around the prediction is very large. This is primarily due to the non-uniform progression of feature patterns over time. Our hypothesis is that the accuracy can be improved if we combine the temporal prediction methods with a spatial analysis that compares the value of key SMART features of the devices across similar models in a fixed time window (unlike the temporal method which uses the data from a single device and a much larger historical window). In this paper, we first describe both temporal and spatial approaches, describe the methods to select various hyperparameters, and then show a workflow to combine these two methodologies and provide comparative results. Our results illustrate that the average precision of temporal methods using long short-term memory networks to predict impending failures in the next ten days was 84 percent. To improve precision, we use the set of disks identified as potential failures and start applying spatial anomaly detection methods on those disks. 
This helps us remove the false positives from the temporal prediction results and provide a tighter bound on the set of disks with impending failure.
@techreport{dot_61069_DS1,
author = {Baroud, Hiba and Dubey, Abhishek and Vazirizade, Sayyed Mohsen and others},
institution = {Tennessee Department of Transportation},
title = {Collaborative Research Project to Coordinate the Data from the CRASH Predictive Analytics Program Between TDOT and TDOSHS},
year = {2021}
}
@inproceedings{ecml2021,
title        = {Energy and Emission Prediction for Mixed-Vehicle Transit Fleets Using Multi-Task and Inductive Transfer Learning},
author       = {Wilbur, Michael and Mukhopadhyay, Ayan and Vazirizade, Sayyed and Pugliese, Philip and Laszka, Aron and Dubey, Abhishek},
booktitle    = {Joint European Conference on Machine Learning and Knowledge Discovery in Databases},
year         = {2021},
acceptance   = {29},
contribution = {lead},
tag          = {ai4cps,transit},
keywords     = {energy prediction, transit systems, multi-task learning, transfer learning, emissions, vehicle fleets},
what         = {This paper proposes energy and emission prediction for mixed-vehicle transit fleets using multi-task learning and inductive transfer learning approaches. The framework addresses the challenge of predicting energy consumption for diverse vehicle types by leveraging shared representations across vehicle classes. The approach develops a unified prediction model that can handle variations in vehicle specifications and operating conditions.},
why          = {Accurately predicting energy consumption across heterogeneous transit fleets is essential for operational planning and environmental impact assessment, but limited data for each vehicle type makes this challenging. This work is significant because it leverages transfer learning to improve prediction accuracy when some vehicle types have insufficient training data. The multi-task learning approach captures generalizable patterns across vehicle classes.},
results      = {The multi-task learning approach significantly outperformed vehicle-specific baseline models, particularly for vehicle classes with limited data. The inductive transfer learning successfully improved predictive accuracy for classes with insufficient training samples. The evaluation on real transit data demonstrated that the proposed approach achieves better generalization and handles new vehicle types more effectively than traditional methods.},
project_tags = {energy, transit, ML for CPS, scalable AI}
}
Public transit agencies are focused on making their fixed-line bus systems more energy efficient by introducing electric (EV) and hybrid (HV) vehicles to their fleets. However, because of the high upfront cost of these vehicles, most agencies are tasked with managing a mixed-fleet of internal combustion vehicles (ICEVs), EVs, and HVs. In managing mixed-fleets, agencies require accurate predictions of energy use for optimizing the assignment of vehicles to transit routes, scheduling charging, and ensuring that emission standards are met. The current state-of-the-art is to develop separate neural network models to predict energy consumption for each vehicle class. Although different vehicle classes’ energy consumption depends on a varied set of covariates, we hypothesize that there are broader generalizable patterns that govern energy consumption and emissions. In this paper, we seek to extract these patterns to aid learning to address two problems faced by transit agencies. First, in the case of a transit agency which operates many ICEVs, HVs, and EVs, we use multi-task learning (MTL) to improve accuracy of forecasting energy consumption. Second, in the case where there is a significant variation in vehicles in each category, we use inductive transfer learning (ITL) to improve predictive accuracy for vehicle class models with insufficient data. As this work is to be deployed by our partner agency, we also provide an online pipeline for joining the various sensor streams for fixed-line transit energy prediction. We find that our approach outperforms vehicle-specific baselines in both the MTL and ITL settings.
@inproceedings{iccps2021,
author = {Pettet, Geoffrey and Mukhopadhyay, Ayan and Kochenderfer, Mykel and Dubey, Abhishek},
booktitle = {Proceedings of the 12th {ACM/IEEE} International Conference on Cyber-Physical Systems, {ICCPS} 2021, Nashville, TN, USA},
title = {Hierarchical Planning for Resource Allocation in Emergency Response Systems},
year = {2021},
acceptance = {26},
contribution = {lead},
keywords = {emergency response, resource allocation, hierarchical planning, cyber-physical systems, scalability},
project = {smart-cities,smart-emergency-response},
tag = {ai4cps,decentralization,incident},
what = {This paper presents hierarchical planning approaches for dynamic resource allocation in emergency response systems under uncertainty. The framework decomposes the overall resource allocation problem into regional sub-problems to improve scalability while maintaining coordination across spatial areas. The approach integrates high-level planning with low-level local decision-making using both centralized and decentralized variants.},
why = {Emergency response systems must allocate resources across large geographic areas while responding to random incident arrivals and uncertain response requirements. This work is innovative because it provides hierarchical planning approaches that exploit spatial structure to achieve scalability without requiring complete centralization. The framework demonstrates how to balance coordination needs with the benefits of decentralization.},
results = {Evaluation on real emergency response data from major metropolitan areas demonstrated that the hierarchical approach scales significantly better than centralized approaches. The decentralized variant achieved comparable response times to centralized planning while requiring less global coordination. The framework successfully handled both static resource pre-positioning decisions and dynamic response to incident arrivals.},
project_tags = {emergency, planning, CPS, scalable AI, POMDP}
}
A classical problem in city-scale cyber-physical systems (CPS) is resource allocation under uncertainty. Spatial-temporal allocation of resources is optimized to allocate electric scooters across urban areas, place charging stations for vehicles, and design efficient on-demand transit. Typically, such problems are modeled as Markov (or semi-Markov) decision processes. While online, offline, and decentralized methodologies have been used to tackle such problems, none of the approaches scale well for large-scale decision problems. We create a general approach to hierarchical planning that leverages structure in city-level CPS problems to tackle resource allocation under uncertainty. We use emergency response as a case study and show how a large resource allocation problem can be split into smaller problems. We then create a principled framework for solving the smaller problems and tackling the interaction between them. Finally, we use real-world data from a major metropolitan area in the United States to validate our approach. Our experiments show that the proposed approach outperforms state-of-the-art approaches used in the field of emergency response.
@inproceedings{ICDM_2021,
author = {Senarath, Yasas and Mukhopadhyay, Ayan and Vazirizade, Sayyed and Purohit, Hemant and Nannapaneni, Saideep and Dubey, Abhishek},
booktitle = {21st IEEE International Conference on Data Mining (ICDM 2021)},
title = {Practitioner-Centric Approach for Early Incident Detection Using Crowdsourced Data for Emergency Services},
year = {2021},
acceptance = {20},
contribution = {colab},
tag = {ai4cps,incident},
what = {This paper presents CROME, a crowdsourced multi-objective event detection framework for early incident detection using crowdsourced data from Waze and traffic incident reports. The system balances the conflicting objectives of spatial-temporal accuracy and temporal responsiveness for incident detection. The approach uses convolutional neural networks and multi-objective optimization to find Pareto-optimal solutions.},
why = {Early incident detection from crowdsourced data is important for emergency response but must balance the competing objectives of detection accuracy and responsiveness. This work is significant because it formulates incident detection as a multi-objective optimization problem that enables explicit trade-off analysis. The approach demonstrates how to leverage noisy crowdsourced data while maintaining principled reasoning about accuracy-responsiveness trade-offs.},
results = {The CROME framework was evaluated on real traffic incident data from Nashville and demonstrated superior performance compared to single-objective baseline approaches. The multi-objective optimization identified Pareto-optimal solutions that practitioners can select based on their priorities. The system successfully detected incidents significantly earlier than traditional methods while maintaining acceptable spatial accuracy.},
keywords = {incident detection, crowdsourced data, multi-objective optimization, emergency response, traffic monitoring},
project_tags = {emergency, transit, ML for CPS, scalable AI}
}
Emergency response is highly dependent on the time of incident reporting. Unfortunately, the traditional approach to receiving incident reports (e.g., calling 911 in the USA) has time delays. Crowdsourcing platforms such as Waze provide an opportunity for early identification of incidents. However, detecting incidents from crowdsourced data streams is difficult due to the challenges of noise and uncertainty associated with such data. Further, simply optimizing over detection accuracy can compromise spatial-temporal localization of the inference, thereby making such approaches infeasible for real-world deployment. This paper presents a novel problem formulation and solution approach for practitioner-centered incident detection using crowdsourced data by using emergency response management as a case-study. The proposed approach CROME (Crowdsourced Multi-objective Event Detection) quantifies the relationship between the performance metrics of incident classification (e.g., F1 score) and the requirements of model practitioners (e.g., 1 km. radius for incident detection). First, we show how crowdsourced reports, ground-truth historical data, and other relevant determinants such as traffic and weather can be used together in a Convolutional Neural Network (CNN) architecture for early detection of emergency incidents. Then, we use a Pareto optimization-based approach to optimize the output of the CNN in tandem with practitioner-centric parameters to balance detection accuracy and spatial-temporal localization. Finally, we demonstrate the applicability of this approach using crowdsourced data from Waze and traffic accident reports from Nashville, TN, USA. Our experiments demonstrate that the proposed approach outperforms existing approaches in incident detection while simultaneously optimizing the needs for real-world deployment and usability.
@inproceedings{jp21,
author = {Tiausas, Francis and Talusan, Jose Paolo and Ishimaki, Yu and Yamana, Hayato and Yamaguchi, Hirozumi and Bhattacharjee, Shameek and Dubey, Abhishek and Yasumoto, Keiichi and Das, Sajal K.},
booktitle = {2021 IEEE International Conference on Smart Computing (SMARTCOMP)},
title = {User-centric Distributed Route Planning in Smart Cities based on Multi-objective Optimization},
year = {2021},
acceptance = {31.7},
pages = {77-82},
contribution = {lead},
doi = {10.1109/SMARTCOMP52413.2021.00031},
tag = {transit}
}
The realization of edge-based cyber-physical systems (CPS) poses important challenges in terms of performance, robustness, security, etc. This paper examines a novel approach to providing a user-centric adaptive route planning service over a network of Road Side Units (RSUs) in smart cities. The key idea is to adaptively select routing task parameters such as privacy-cloaked area sizes and number of retained intersections to balance processing time, privacy protection level, and route accuracy for privacy-augmented distributed route search while also handling per-query user preferences. This is formulated as an optimization problem with a set of parameters giving the best result for a set of queries given system constraints. Processing Throughput, Privacy Protection, and Travel Time Accuracy were developed as the objective functions to be balanced. A Multi-Objective Genetic Algorithm based technique (NSGA-II) is applied to recover a feasible solution. The performance of this approach was then evaluated using traffic data from Osaka, Japan. Results show good performance of the approach in balancing the aforementioned objectives based on user preferences.
@inproceedings{juan21,
author = {Martinez, Juan and Ayman, Afiya and Mukhopadhyay, Ayan and Wilbur, Michael and Pugliese, Philip and Freudberg, Dan and Laszka, Aron and Dubey, Abhishek},
booktitle = {Proceedings of the Workshop on AI for Urban Mobility at the 35th AAAI Conference on Artificial Intelligence (AAAI-21)},
title = {Predicting Public Transportation Load to Estimate the Probability of Social Distancing Violations},
year = {2021},
contribution = {minor},
tag = {transit},
what = {This paper develops statistical models to predict public transit ridership patterns and estimate the probability of social distancing violations during the COVID-19 pandemic. The researchers use automated passenger counting data and General Transit Feed Specification information from two major metropolitan areas to build predictive models including Poisson, negative binomial, zero-inflated Poisson, and zero-inflated negative binomial distributions. These models capture board and alight counts across different bus stops and times of day, accounting for temporal heterogeneity in transit demand.},
why = {During the COVID-19 pandemic, transit agencies faced unprecedented challenges in maintaining public health while preserving service accessibility. Accurate prediction of ridership demand is essential for optimizing transit schedules and ensuring that social distancing protocols can be maintained on buses. This work is innovative because it bridges data-driven ridership forecasting with public health constraints, providing transit planners with evidence-based tools to make operational decisions that protect both passengers and drivers.},
results = {The zero-inflated statistical models demonstrated superior performance in predicting both board and alight counts compared to standard Poisson models, with particularly good performance on test data from June 2020. The models enable transit agencies to estimate hourly ridership patterns and identify peak demand periods, allowing for dynamic capacity adjustments and social distancing compliance. Results show that the approach can provide actionable insights for transit operators planning safe operations during pandemic and post-pandemic periods.},
keywords = {transit ridership prediction, COVID-19, social distancing, public transportation, statistical modeling, automated passenger counting, zero-inflated models},
project_tags = {transit, planning, ML for CPS}
}
Public transit agencies struggle to maintain transit accessibility with reduced resources, unreliable ridership data, reduced vehicle capacities due to social distancing, and reduced services due to driver unavailability. In collaboration with transit agencies from two large metropolitan areas in the USA, we are designing novel approaches for addressing the aforementioned challenges by collecting accurate real-time ridership data, providing guidance to commuters, and performing operational optimization for public transit. We estimate ridership data using historical automated passenger counting data, conditional on a set of relevant determinants. Accurate ridership forecasting is essential to optimize the public transit schedule, which is necessary to improve current fixed lines with on-demand transit. Also, passenger crowding has been a problem for public transportation since it deteriorates passengers’ wellbeing and satisfaction. During the COVID-19 pandemic, passenger crowding has gained importance since it represents a risk for social distancing violations. Therefore, we are creating optimization models to ensure that social distancing norms can be adequately followed while ensuring that the total demand for transit is met. We will then use accurate forecasts for operational optimization that includes (a) proactive fixed-line schedule optimization based on predicted demand, (b) dispatch of on-demand micro-transit, prioritizing at-risk populations, and (c) allocation of vehicles to transit and cargo trips, considering exigent vehicle maintenance requirements (i.e., disinfection). Finally, this paper presents some initial results from our project regarding the estimation of ridership in public transit.
@inproceedings{matthew21,
author = {Burruss, Matthew and Ramakrishna, Shreyas and Dubey, Abhishek},
booktitle = {2021 IEEE International Conference on Smart Computing (SMARTCOMP)},
title = {Deep-RBF Networks for Anomaly Detection in Automotive Cyber-Physical Systems},
year = {2021},
pages = {55-60},
acceptance = {31.7},
contribution = {lead},
doi = {10.1109/SMARTCOMP52413.2021.00028},
tag = {ai4cps}
}
Deep Neural Networks (DNNs) are popularly used for implementing autonomy related tasks in automotive Cyber-Physical Systems (CPSs). However, these networks have been shown to make erroneous predictions to anomalous inputs, which manifests either due to Out-of-Distribution (OOD) data or adversarial attacks. To detect these anomalies, a separate DNN called assurance monitor is often trained and used in parallel to the controller DNN, increasing the resource burden and latency. We hypothesize that a single network that can perform controller predictions and anomaly detection is necessary to reduce the resource requirements. Deep-Radial Basis Function (RBF) networks provide a rejection class alongside the class predictions, which can be utilized for detecting anomalies at runtime. However, the use of RBF activation functions limits the applicability of these networks to only classification tasks. In this paper, we show how the deep-RBF network can be used for detecting anomalies in CPS regression tasks such as continuous steering predictions. Further, we design deep-RBF networks using popular DNNs such as NVIDIA DAVE-II, and ResNet20, and then use the resulting rejection class for detecting adversarial attacks such as a physical attack and data poison attack. Finally, we evaluate these attacks and the trained deep-RBF networks using a hardware CPS testbed called DeepNNCar and a real-world German Traffic Sign Benchmark (GTSB) dataset. Our results show that the deep-RBF networks can robustly detect these attacks in a short time without additional resource requirements.
@inproceedings{resonate2021,
author = {Hartsell, Charles and Ramakrishna, Shreyas and Dubey, Abhishek and Stojcsics, Daniel and Mahadevan, Nag and Karsai, Gabor},
booktitle = {16th {International} Symposium on Software Engineering for Adaptive and Self-Managing Systems, {SEAMS} 2021},
title = {ReSonAte: A Runtime Risk Assessment Framework for Autonomous Systems},
year = {2021},
category = {selectiveconference},
contribution = {colab},
acceptance = {30},
project = {cps-middleware,cps-reliability},
tag = {ai4cps},
what = {The ReSONAte framework presents a runtime risk assessment methodology for autonomous cyber-physical systems that handles dynamic uncertainties in operating environments. The framework uses design-time hazard analysis information combined with system state observations at runtime to dynamically estimate risk using Bow-Tie Diagram models. The approach extends traditional safety assurance techniques by incorporating runtime monitoring data and state-dependent risk calculations to maintain system safety during continuous operation.},
why = {Autonomous cyber-physical systems must operate in unpredictable real-world environments where design-time assumptions may not hold. Traditional static risk assessment approaches are insufficient for handling the dynamic hazards and state-dependent failure modes that emerge at runtime. This work is innovative because it bridges design-time safety analysis with runtime operations, enabling systems to dynamically adjust their risk assessments based on current conditions and maintain safe operation in the face of uncertainties.},
results = {The ReSONAte framework successfully demonstrates dynamic risk estimation on autonomous vehicle examples using Carla simulations with 600 executions. The approach shows that risk calculations can be performed with minimal computational overhead (0.3 milliseconds) while accurately tracking state-dependent hazard rates. The framework proves effective for handling uncertainty in system operations and provides practical mechanisms for autonomous systems to support self-adaptation based on dynamically computed risk values.},
keywords = {autonomous systems, runtime risk assessment, cyber-physical systems, safety assurance, hazard analysis, Bow-Tie Diagrams, dynamic uncertainty},
project_tags = {CPS, Explainable AI, scalable AI}
}
Autonomous Cyber-Physical Systems (CPSs) are often required to handle uncertainties and self-manage the system operation in response to problems and increasing risk in the operating paradigm. This risk may arise due to distribution shifts, environmental context, or failure of software or hardware components. Traditional techniques for risk assessment focus on design-time techniques such as hazard analysis, risk reduction, and assurance cases among others. However, these static, design time techniques do not consider the dynamic contexts and failures the systems face at runtime. We hypothesize that this requires a dynamic assurance approach that computes the likelihood of unsafe conditions or system failures considering the safety requirements, assumptions made at design time, past failures in a given operating context, and the likelihood of system component failures. We introduce the ReSonAte dynamic risk estimation framework for autonomous systems. ReSonAte reasons over Bow-Tie Diagrams (BTDs), which capture information about hazard propagation paths and control strategies. Our innovation is the extension of the BTD formalism with attributes for modeling the conditional relationships with the state of the system and environment. We also describe a technique for estimating these conditional relationships and equations for estimating risk-based on the state of the system and environment. To help with this process, we provide a scenario modeling procedure that can use the prior distributions of the scenes and threat conditions to generate the data required for estimating the conditional relationships. To improve scalability and reduce the amount of data required, this process considers each control strategy in isolation and composes several single-variate distributions into one complete multi-variate distribution for the control strategy in question. 
Lastly, we describe the effectiveness of our approach using two separate autonomous system simulations: CARLA and an unmanned underwater vehicle.
@inproceedings{rtmeter2021,
author = {Mustafa, Hussain M. and Bariya, Mohini and Sajan, K.S. and Chhokra, Ajay and Srivastava, Anurag and Dubey, Abhishek and von Meier, Alexandra and Biswas, Gautam},
booktitle = {9th Workshop on Modeling and Simulation of Cyber-Physical Energy Systems, MSCPES@CPSIoTWeek},
title = {RT-METER: A Real-Time, Multi-Layer Cyber–Power Testbed for Resiliency Analysis},
year = {2021},
category = {workshop},
contribution = {colab},
keywords = {smart grid, cyber-physical systems, power systems, resilience, testbed, cyber-power, monitoring and control, anomaly detection},
project = {cps-reliability},
tag = {platform,power},
what = {RT-METER is a real-time, multi-layer cyber-power testbed designed for resilience analysis of power systems. The testbed integrates three core layers: a physical power system simulator using HYPERSIM, a communication network layer with NS3 for simulating network behavior, and a control layer with advanced algorithmic tools. The architecture enables comprehensive simulation of cyber-power scenarios with realistic sensor emulation and provides tools for event detection, failure diagnosis, and resilience monitoring.},
why = {Modern power grids are increasingly vulnerable to cyber-attacks and natural disruptions, requiring advanced tools for validating resilience mechanisms. Existing testbeds often lack integration of all three critical layers (power, communication, and control) needed to understand system-wide impacts. RT-METER is innovative because it provides a unified platform for testing cyber-physical security mechanisms in power systems, enabling researchers to evaluate the effectiveness of protection strategies under realistic conditions with multiple interacting failure modes.},
results = {The testbed successfully demonstrates comprehensive cyber-power resilience analysis capabilities, supporting validation of algorithmic tools including resilience monitoring, event detection, and failure diagnosis systems. The architecture enables detailed evaluation of cyber-security mechanisms and their interaction with power system dynamics. Results show that the testbed can simulate complex cascading failures and evaluate mitigation strategies, providing valuable validation capabilities for power system protection and control algorithms.},
project_tags = {energy, CPS, middleware, ML for CPS}
}
In this work, we present a Real-Time, Multi-layer cybEr–power TestbEd for the Resiliency analysis (RT-METER) to support power grid operation and planning. The developed cyber-power testbed provides a mechanism for end-to-end validation of advanced tools for cyber-power grid monitoring, control, and planning. By integrating a host of features across three core layers—physical power system, communication network, and monitoring/control center with advanced tools—the testbed allows for the simulation of rich and varied cyber-power grid scenarios and the generation of realistic sensor, system, and network data. Developing advanced tools to assist operators during complex and challenging scenarios is essential for the successful operation of the future grid. We detail a suite of algorithmic tools validated using the developed testbed with realistic grid data.
@article{sandoval2021data,
author = {Sandoval, Ricardo and {Van Geffen}, Caleb and Wilbur, Michael and Hall, Brandon and Dubey, Abhishek and Barbour, William and Work, Daniel B.},
journal = {Transportation Research Interdisciplinary Perspectives},
title = {Data driven methods for effective micromobility parking},
year = {2021},
issn = {2590-1982},
pages = {100368},
volume = {10},
contribution = {minor},
doi = {10.1016/j.trip.2021.100368},
keywords = {micromobility, shared electric scooters, parking facility location, clustering, urban planning, accessibility, demand analysis},
tag = {transit},
url = {https://www.sciencedirect.com/science/article/pii/S2590198221000750},
what = {This work proposes a data-driven methodology for locating and prioritizing shared electric scooter (SES) parking facilities using clustering algorithms and demand analysis. The approach addresses the problem of finding optimal parking locations by maximizing trip capture while considering environmental factors such as sidewalk width for ADA compliance. Case studies in Nashville and on the Vanderbilt campus demonstrate how clustering methods can identify high-demand parking zones and how incorporating infrastructure constraints affects facility placement.},
why = {Shared electric scooter systems create significant urban mobility benefits but generate management challenges, particularly regarding parking infrastructure placement and accessibility. Simply maximizing trip capture without considering environmental constraints can create conflicts with infrastructure accessibility standards. This work is innovative because it demonstrates how to balance operational efficiency with urban equity concerns, showing that data-driven approaches can improve both the effectiveness and fairness of micromobility infrastructure planning.},
results = {The analysis demonstrates that demand-based clustering can effectively identify optimal parking locations, capturing 300% more problematic narrow-sidewalk trips when infrastructure constraints are incorporated, with only a 13% trade-off in overall trip capture. Empirical results provide city planners with quantitative guidance on how many parking facilities are needed to serve scooter demand and how capacity should be allocated across locations. The findings show that considering built environment factors significantly improves both demand coverage and ADA compliance.},
project_tags = {transit, planning}
}
In this work, we propose a data-driven method to use proven clustering algorithms for establishing shared electric scooter (SES) parking locations and assessing their anticipated utilization. We first address the problem of finding locations for a given number of parking facilities, based purely on demand, that maximize the number of trips that would likely be parked at these facilities. We then formulate an enhanced version of the SES parking facility problem in which exogenous environmental factors are considered, such as sidewalk width. Parking SESs on narrow sidewalks raises accessibility concerns for other users of this infrastructure and capturing these trips in dedicated parking facilities is a valid priority to trade off with pure demand maximization. These methods are demonstrated in two case studies, which use a large SES dataset from Nashville, Tennessee, USA. We provide empirical results on how many facilities are needed to serve demand of SESs and necessary capacity allocation of the facilities. When the methodology considers sidewalk width in facility placement, the refined parking locations can address 300% more problematic trips parked along narrow sidewalks, with only a nominal sacrifice, around 13%, in the overall number of trips served.
@article{SUN2021102637,
author = {Sun, Ruixiao and Chen, Yuche and Dubey, Abhishek and Pugliese, Philip},
journal = {Transportation Research Part D: Transport and Environment},
title = {Hybrid electric buses fuel consumption prediction based on real-world driving data},
year = {2021},
issn = {1361-9209},
pages = {102637},
volume = {91},
contribution = {colab},
doi = {10.1016/j.trd.2020.102637},
keywords = {Hybrid diesel transit bus, Artificial neural network, Fuel consumption prediction},
tag = {transit},
url = {https://www.sciencedirect.com/science/article/pii/S1361920920308221}
}
Estimating fuel consumption by hybrid diesel buses is challenging due to its diversified operations and driving cycles. In this study, long-term transit bus monitoring data were utilized to empirically compare fuel consumption of diesel and hybrid buses under various driving conditions. Artificial neural network (ANN) based high-fidelity microscopic (1 Hz) and mesoscopic (5–60 min) fuel consumption models were developed for hybrid buses. The microscopic model contained 1 Hz driving, grade, and environment variables. The mesoscopic model aggregated 1 Hz data into 5 to 60-minute traffic pattern factors and predicted average fuel consumption over its duration. The prediction results show mean absolute percentage errors of 1–2% for microscopic models and 5–8% for mesoscopic models. The data were partitioned by different driving speeds, vehicle engine demand, and road grade to investigate their impacts on prediction performance.
@inproceedings{wilbur21,
author = {Wilbur, Michael and Pugliese, Philip and Laszka, Aron and Dubey, Abhishek},
booktitle = {Proceedings of the Workshop on AI for Urban Mobility at the 35th AAAI Conference on Artificial Intelligence (AAAI-21)},
title = {Efficient Data Management for Intelligent Urban Mobility Systems},
year = {2021},
contribution = {colab},
tag = {ai4cps,transit},
what = {This paper develops efficient data management and processing frameworks for intelligent urban mobility systems using real-world transit data from Chattanooga. The work presents an integrated data architecture combining real-time vehicle telemetry, weather data, traffic information, and elevation maps to support machine learning applications. The framework addresses challenges in storing high-velocity data streams and enabling both offline model training and real-time inference for transit applications including energy prediction and ridership forecasting.},
why = {Transit agencies increasingly need to leverage large-scale multimodal sensor data to optimize operations and support decision-making, but lack standardized frameworks for collecting, storing, and processing such data efficiently. Traditional approaches often fail to handle the high-volume, heterogeneous nature of real-world transit data streams. This work is innovative because it provides a practical, cloud-based architecture that enables transit agencies to build machine learning applications using real-world data while maintaining the flexibility to adapt systems as new requirements emerge.},
results = {The framework successfully demonstrates integration of multiple data sources into a unified system supporting both offline training and real-time inference. Results show that the architecture enables effective machine learning-based energy prediction for electric vehicles and transit optimization applications. The system has been deployed with the Chattanooga Area Regional Transportation Authority and provides the foundation for multiple AI applications including occupancy prediction and route-level energy consumption estimation.},
keywords = {intelligent transportation systems, data management, machine learning, real-time systems, energy prediction, transit optimization, data architecture},
project_tags = {transit, energy, ML for CPS, middleware}
}
Modern intelligent urban mobility applications are underpinned by large-scale, multivariate, spatiotemporal data streams. Working with this data presents unique challenges of data management, processing and presentation that is often overlooked by researchers. Therefore, in this work we present an integrated data management and processing framework for intelligent urban mobility systems currently in use by our partner transit agencies. We discuss the available data sources and outline our cloud-centric data management and stream processing architecture built upon open-source publish-subscribe and NoSQL data stores. We then describe our data-integrity monitoring methods. We then present a set of visualization dashboards designed for our transit agency partners. Lastly, we discuss how these tools are currently being used for AI-driven urban mobility applications that use these tools.
@inproceedings{wildfiredb2021,
author = {Singla, Samriddhi and Mukhopadhyay, Ayan and Wilbur, Michael and Diao, Tina and Gajjewar, Vinayak and Eldawy, Ahmed and Kochenderfer, Mykel and Shachter, Ross and Dubey, Abhishek},
booktitle = {35th Conference on Neural Information Processing Systems (NeurIPS 2021) Track on Datasets and Benchmarks},
title = {{WildfireDB}: An Open-Source Dataset Connecting Wildfire Spread with Relevant Determinants},
year = {2021},
acceptance = {26},
contribution = {minor},
tag = {ai4cps,incident},
what = {WildFireDB is a comprehensive open-source dataset connecting wildfire spread with relevant environmental determinants including weather, vegetation, and topography. The dataset comprises over 17.8 million data points covering wildfire occurrences in the continental United States from 2012-2017, integrating fire detection data from satellite imagery with spatial vegetation and topographic information. The work presents algorithmic approaches for merging large-scale raster and vector data to create a spatially and temporally coherent dataset for modeling fire spread.},
why = {Modeling wildfire spread is critical for emergency management and fire risk assessment, but comprehensive datasets relating fires to relevant environmental covariates have been lacking. Traditional fire spread models rely on physics-based approaches that require detailed parameter specification, while data-driven approaches are limited by insufficient data. This work is innovative because it provides the research community with a large-scale, multi-source dataset that enables machine learning approaches to improve fire spread forecasting and supports more sophisticated understanding of fire dynamics.},
results = {The dataset successfully integrates fire occurrence data from satellite sensors with vegetation, topographic, and weather information across a standardized spatial grid. The publicly available dataset enables development of data-driven models for fire spread forecasting. The work demonstrates the feasibility of creating large-scale integrated datasets that combine multiple data types at different spatial resolutions, providing a foundation for advancing fire risk management through improved predictive models.},
keywords = {wildfire, fire spread modeling, dataset, geospatial analysis, machine learning, risk management, environmental data integration},
project_tags = {emergency, planning}
}
Modeling fire spread is critical in fire risk management. Creating data-driven models to forecast spread remains challenging due to the lack of comprehensive data sources that relate fires with relevant covariates. We present the first comprehensive and open-source dataset that relates historical fire data with relevant covariates such as weather, vegetation, and topography. Our dataset, named WildfireDB, contains over 17 million data points that capture how fires spread in the continental USA in the last decade. In this paper, we describe the algorithmic approach used to create and integrate the data, describe the dataset, and present benchmark results regarding data-driven models that can be learned to forecast the spread of wildfires.
@article{yuchesae2021,
author = {Chen, Yuche and Wu, Guoyuan and Sun, Ruixiao and Dubey, Abhishek and Laszka, Aron and Pugliese, Philip},
title = {A Review and Outlook of Energy Consumption Estimation Models for Electric Vehicles},
journal = {Society of Automotive Engineers (SAE) International Journal of Sustainable Transportation, Energy, Environment, \& Policy},
year = {2021},
contribution = {minor},
tag = {transit},
keywords = {electric vehicles, energy consumption, machine learning, transportation modeling, vehicle dynamics, environmental factors},
what = {This paper presents a comprehensive review of energy consumption estimation models for electric vehicles, examining approaches across different modeling scales (microscopic vs. macroscopic) and methodologies (data-driven vs. rule-based). The review analyzes influential variables in four categories: vehicle components, vehicle dynamics, traffic conditions, and environmental factors. The work classifies existing models and identifies research gaps including the need for models applicable to different vehicle types and approaches suitable for vehicle-to-grid integration applications.},
why = {Accurate energy consumption modeling is essential for the widespread adoption of electric vehicles, as it directly impacts user range anxiety and the viability of optimal fleet scheduling. However, existing models vary significantly in their approaches and applicability to different use cases. This work is innovative because it provides a systematic framework for understanding the landscape of energy consumption models, identifying key limitations in current approaches, and highlighting priorities for future research to support the transition to electric vehicle transportation.},
results = {The review identifies a trend toward increasingly macroscopic models that can be used at the trip level for energy prediction, combined with growing adoption of data-driven approaches that leverage machine learning. Key findings show that vehicle type, traffic conditions, and weather are critical factors in energy consumption, and that most existing models focus on personal vehicles rather than transit or commercial applications. The review provides guidance for practitioners on model selection based on application requirements.},
project_tags = {energy, transit, ML for CPS}
}
Electric vehicles (EVs) are critical to the transition to a low-carbon transportation system. The successful adoption of EVs heavily depends on energy consumption models that can accurately and reliably estimate electricity consumption. This paper reviews the state of the art of EV energy consumption models, aiming to provide guidance for future development of EV applications. We summarize influential variables of EV energy consumption in four categories: vehicle component, vehicle dynamics, traffic, and environment-related factors. We classify and discuss EV energy consumption models in terms of modeling scale (microscopic vs. macroscopic) and methodology (data-driven vs. rule-based). Our review shows trends of increasing macroscopic models that can be used to estimate trip-level EV energy consumption and increasing data-driven models that utilize machine learning technologies to estimate EV energy consumption based on a large volume of real-world data. We identify research gaps for EV energy consumption models, including the development of energy estimation models for modes other than personal vehicles (e.g., electric buses, trucks, and nonroad vehicles), energy estimation models that are suitable for applications related to vehicle-to-grid integration, and multiscale energy estimation models as a holistic modeling approach.
@inproceedings{zhuangweiconfiguration21,
author = {Kang, Zhuangwei and Barve, Yogesh D. and Bao, Shunxing and Dubey, Abhishek and Gokhale, Aniruddha},
booktitle = {Proceedings of the International Conference on Internet-of-Things Design and Implementation},
title = {Configuration Tuning for Distributed {IoT} Message Systems Using Deep Reinforcement Learning: Poster Abstract},
year = {2021},
address = {New York, NY, USA},
pages = {273--274},
publisher = {Association for Computing Machinery},
series = {IoTDI '21},
contribution = {minor},
note = {Poster},
doi = {10.1145/3450268.3453517},
isbn = {9781450383547},
keywords = {Policy-based RL Algorithm, Publish/Subscribe Middleware, System Configuration},
location = {Charlottesville, VA, USA},
numpages = {2},
url = {https://doi.org/10.1145/3450268.3453517}
}
Distributed messaging systems (DMSs) are often equipped with a large number of configurable parameters that enable users to define application run-time behaviors and information dissemination rules. However, the resulting high-dimensional configuration space makes it difficult for users to determine the best configuration that can maximize application QoS under a variety of operational conditions. This poster introduces a novel, automatic knob tuning framework called DMSConfig. DMSConfig explores the configuration space by interacting with a data-driven environment prediction model (a DMS simulator), which eliminates the prohibitive cost of conducting online interactions with the production environment. DMSConfig employs the deep deterministic policy gradient (DDPG) method and a custom reward mechanism to learn and make configuration decisions based on predicted DMS states and performance. Our initial experimental results, conducted on a single-broker Kafka cluster, show that DMSConfig significantly outperforms the default configuration and has better adaptability to CPU and bandwidth-limited environments. We also confirm that DMSConfig produces fewer violations of latency constraints than three prevalent parameter tuning tools.
@inproceedings{senarath_emergency_2020,
author = {Senarath, Yasas and Nannapaneni, Saideep and Purohit, Hemant and Dubey, Abhishek},
booktitle = {The 2020 {IEEE}/{WIC}/{ACM} International Joint Conference on Web Intelligence and Intelligent Agent Technology},
title = {Emergency Incident Detection from Crowdsourced {Waze} Data using {Bayesian} Information Fusion},
year = {2020},
month = nov,
eprint = {2011.05440},
eprinttype = {arXiv},
publisher = {IEEE},
acceptance = {30},
annote = {Comment: 8 pages, The 2020 IEEE/WIC/ACM International Joint Conference On Web Intelligence And Intelligent Agent Technology (WI-IAT '20)},
contribution = {colab},
copyright = {All rights reserved},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Social and Information Networks},
tag = {incident},
url = {http://arxiv.org/abs/2011.05440},
urldate = {2021-01-31}
}
The number of emergencies has increased over the years with the growth in urbanization. This pattern has overwhelmed the emergency services with limited resources and demands the optimization of response processes. It is partly due to traditional ‘reactive’ approach of emergency services to collect data about incidents, where a source initiates a call to the emergency number (e.g., 911 in U.S.), delaying and limiting the potentially optimal response. Crowdsourcing platforms such as Waze provide an opportunity to develop a rapid, ‘proactive’ approach to collect data about incidents through crowd-generated observational reports. However, the reliability of reporting sources and spatio-temporal uncertainty of the reported incidents challenge the design of such a proactive approach. Thus, this paper presents a novel method for emergency incident detection using noisy crowdsourced Waze data. We propose a principled computational framework based on Bayesian theory to model the uncertainty in the reliability of crowd-generated reports and their integration across space and time to detect incidents. Extensive experiments using data collected from Waze and the official reported incidents in Nashville, Tennessee in the U.S. show our method can outperform strong baselines for both F1-score and AUC. The application of this work provides an extensible framework to incorporate different noisy data sources for proactive incident detection to improve and optimize emergency response operations in our communities.
@inproceedings{chokraACC2020,
author = {Chhokra, Ajay and Hasan, Saqib and Dubey, Abhishek and Karsai, Gabor},
booktitle = {2020 American Control Conference (ACC)},
title = {A Binary Decision Diagram Based Cascade Prognostics Scheme For Power Systems},
year = {2020},
month = jul,
pages = {3011--3016},
contribution = {minor},
doi = {10.23919/ACC45564.2020.9147902},
issn = {2378-5861},
keywords = {power systems, cascading failures, binary decision diagrams, contingency analysis, resilience, blackout mitigation, grid stability},
tag = {platform,power},
what = {This paper presents a binary decision diagram-based cascade prognostics scheme for power systems that efficiently identifies critical component outages and their cascading effects. The approach uses reduced-order BDDs to encode blackout-causing states and transition relations, enabling fast look-up of cascading consequences. The methodology addresses the challenge of high-order contingency analysis by providing computationally efficient mechanisms to identify critical component combinations that could lead to widespread blackouts.},
why = {Power systems face cascading failure risks where failures in one component can trigger widespread blackouts affecting millions of customers. Identifying critical contingencies that could lead to cascading outages is computationally challenging as the number of possible combinations grows exponentially. This work is innovative because it provides a practical approach using reduced-order decision diagrams to efficiently identify worst-case cascading scenarios, enabling operators to proactively strengthen system resilience against critical contingencies.},
results = {The BDD-based approach successfully identifies critical N-K contingencies in IEEE 14-bus and IEEE 39-bus systems, demonstrating that the method can encode complex cascading behaviors in compact data structures. Results show that the approach finds critical vulnerabilities that could cause system blackouts and evaluates load curtailment actions that can mitigate cascading failures. The methodology enables practical contingency analysis for resilience assessment in power systems.},
project_tags = {energy, planning}
}
Cascading outages in power systems are a rare but important phenomenon with huge social and economic implications. Due to the inherent complexity and heterogeneity of components in power systems, analysis and prediction of the current and future states of the system is a challenging task. In this paper, we address prognosis of cascading outages in power systems by employing a novel approach based on reduced ordered binary decision diagrams. We present a systemic way of synthesizing these decision diagrams based on a simple cascade model. We also describe a workflow for finding the emergency load curtailment actions as a part of the mitigation strategy. In the end, we show the applicability of our approach using the standard IEEE 14 bus system.
@inproceedings{Lasz2006Data,
author = {Ayman, Afiya and Wilbur, Michael and Sivagnanam, Amutheezan and Pugliese, Philip and Dubey, Abhishek and Laszka, Aron},
booktitle = {2020 IEEE International Conference on Smart Computing (SMARTCOMP)},
title = {{Data-Driven} Prediction of {Route-Level} Energy Use for {Mixed-Vehicle} Transit Fleets},
year = {2020},
address = {Bologna, Italy},
month = jun,
acceptance = {32},
contribution = {colab},
days = {21},
keywords = {data-driven methods, intelligent systems, machine learning, data analytics, decision procedures},
tag = {ai4cps,transit},
what = {This paper presents a framework for data-driven prediction of route-level energy use for mixed fleets of internal-combustion and electric transit vehicles. The framework captures system-level data, including traffic and weather conditions, and high-frequency vehicle-level data, including location traces and fuel or electricity use, and provides domain-specific methods for integrating and cleansing data from various sources, including street and elevation maps. Machine learning models, including deep neural networks, decision trees, and linear regression, are trained and evaluated on the integrated dataset.},
why = {Because of the high upfront cost of electric vehicles, most transit agencies can afford only mixed fleets of internal-combustion and electric vehicles, and making the best use of such fleets requires accurate predictions of electricity and fuel use for tasks such as assigning vehicles to routes and scheduling charging. To the best of the authors' knowledge, no prior framework integrated all relevant data into a route-level energy prediction model for public transit, which this work addresses.},
results = {The framework is evaluated using data collected from the bus fleet of CARTA, the public transit authority of Chattanooga, TN. Results show that neural networks provide accurate energy estimates, while other models, such as decision trees and linear regression, help discover relations between energy use and factors such as road and weather conditions.},
project_tags = {ML for CPS}
}
Due to increasing concerns about environmental impact, operating costs, and energy security, public transit agencies are seeking to reduce their fuel use by employing electric vehicles (EVs). However, because of the high upfront cost of EVs, most agencies can afford only mixed fleets of internal-combustion and electric vehicles. Making the best use of these mixed fleets presents a challenge for agencies since optimizing the assignment of vehicles to transit routes, scheduling charging, etc. require accurate predictions of electricity and fuel use. Recent advances in sensor-based technologies, data analytics, and machine learning enable remedying this situation; however, to the best of our knowledge, there exists no framework that would integrate all relevant data into a route-level prediction model for public transit. In this paper, we present a novel framework for the data-driven prediction of route-level energy use for mixed-vehicle transit fleets, which we evaluate using data collected from the bus fleet of CARTA, the public transit authority of Chattanooga, TN. We present a data collection and storage framework, which we use to capture system-level data, including traffic and weather conditions, and high-frequency vehicle-level data, including location traces, fuel or electricity use, etc. We present domain-specific methods and algorithms for integrating and cleansing data from various sources, including street and elevation maps. Finally, we train and evaluate machine learning models, including deep neural networks, decision trees, and linear regression, on our integrated dataset. Our results show that neural networks provide accurate estimates, while other models can help us discover relations between energy use and factors such as road and weather conditions.
@inproceedings{Bhattacharjee_2020,
author = {Bhattacharjee, Anirban and Chhokra, Ajay Dev and Sun, Hongyang and Shekhar, Shashank and Gokhale, Aniruddha and Karsai, Gabor and Dubey, Abhishek},
booktitle = {2020 IEEE 4th International Conference on Fog and Edge Computing (ICFEC)},
title = {Deep-Edge: An Efficient Framework for Deep Learning Model Update on Heterogeneous Edge},
year = {2020},
month = may,
publisher = {IEEE},
contribution = {minor},
doi = {10.1109/ICFEC50348.2020.00016},
isbn = {9781728173054},
tag = {ai4cps},
url = {http://dx.doi.org/10.1109/ICFEC50348.2020.00016}
}
Deep Learning (DL) model-based AI services are increasingly offered in a variety of predictive analytics services such as computer vision, natural language processing, speech recognition. However, the quality of the DL models can degrade over time due to changes in the input data distribution, thereby requiring periodic model updates. Although cloud data-centers can meet the computational requirements of the resource-intensive and time-consuming model update task, transferring data from the edge devices to the cloud incurs a significant cost in terms of network bandwidth and are prone to data privacy issues. With the advent of GPU-enabled edge devices, the DL model update can be performed at the edge in a distributed manner using multiple connected edge devices. However, efficiently utilizing the edge resources for the model update is a hard problem due to the heterogeneity among the edge devices and the resource interference caused by the co-location of the DL model update task with latency-critical tasks running in the background. To overcome these challenges, we present Deep-Edge, a load- and interference-aware, fault-tolerant resource management framework for performing model update at the edge that uses distributed training. This paper makes the following contributions. First, it provides a unified framework for monitoring, profiling, and deploying the DL model update tasks on heterogeneous edge devices. Second, it presents a scheduler that reduces the total re-training time by appropriately selecting the edge devices and distributing data among them such that no latency-critical applications experience deadline violations. Finally, we present empirical results to validate the efficacy of the framework using a real-world DL model update case-study based on the Caltech dataset and an edge AI cluster testbed.
@inproceedings{sundar2020detecting,
author = {Sundar, V. and Ramakrishna, S. and Rahiminasab, Z. and Easwaran, A. and Dubey, A.},
booktitle = {2020 IEEE Security and Privacy Workshops (SPW)},
title = {Out-of-Distribution Detection in Multi-Label Datasets using Latent Space of {\beta}-VAE},
year = {2020},
address = {Los Alamitos, CA, USA},
month = may,
pages = {250--255},
publisher = {IEEE Computer Society},
contribution = {colab},
doi = {10.1109/SPW50608.2020.00057},
keywords = {out-of-distribution detection, autonomous systems, generative models, safety, machine learning, environmental factors},
tag = {ai4cps},
url = {https://doi.ieeecomputersociety.org/10.1109/SPW50608.2020.00057},
what = {This paper presents methodology for detecting out-of-distribution examples in multi-label datasets using the latent space of beta-VAE models. The approach trains a beta-VAE for each partition of image data with specific generative factor values and uses KL-divergence metrics to identify when test images have factor values not seen during training. The methodology enables detection of safety-critical out-of-distribution scenarios in autonomous systems operating with multiple environmental factors.},
why = {Learning-enabled components in autonomous systems are widely used for perception tasks but often fail on out-of-distribution inputs not seen during training, creating safety risks. Traditional out-of-distribution detection approaches struggle with multi-label datasets where multiple environmental factors vary simultaneously. This work is innovative because it provides a practical approach using generative models to detect out-of-distribution images in complex scenarios, supporting safe deployment of learning-enabled autonomous systems.},
results = {The methodology successfully detects out-of-distribution variations in the nuScenes dataset across multiple generative factors including time-of-day, traffic density, and pedestrian presence. Results demonstrate that the approach can identify safety-critical distribution shifts and that appropriately selected beta-VAE models achieve better detection performance than standard approaches. The work shows that generative models can effectively support safety-critical out-of-distribution detection in autonomous systems.},
project_tags = {CPS, ML for CPS, Explainable AI}
}
@inproceedings{Hartsell_2020,
author = {Hartsell, Charles and Mahadevan, Nagabhushan and Nine, Harmon and Bapty, Ted and Dubey, Abhishek and Karsai, Gabor},
booktitle = {2020 IEEE Workshop on Design Automation for CPS and IoT (DESTION)},
title = {Workflow Automation for Cyber Physical System Development Processes},
year = {2020},
month = apr,
publisher = {IEEE},
contribution = {colab},
doi = {10.1109/DESTION50928.2020.00007},
isbn = {9781728199948},
tag = {platform}
}
Development of Cyber Physical Systems (CPSs) requires close interaction between developers with expertise in many domains to achieve ever-increasing demands for improved performance, reduced cost, and more system autonomy. Each engineering discipline commonly relies on domain-specific modeling languages, and analysis and execution of these models is often automated with appropriate tooling. However, integration between these heterogeneous models and tools is often lacking, and most of the burden for inter-operation of these tools is placed on system developers. To address this problem, we introduce a workflow modeling language for the automation of complex CPS development processes and implement a platform for execution of these models in the Assurance-based Learning-enabled CPS (ALC) Toolchain. Several illustrative examples are provided which show how these workflow models are able to automate many time-consuming integration tasks previously performed manually by system developers.
@article{aymantoit2020,
author = {Ayman, Afiya and Sivagnanam, Amutheezan and Wilbur, Michael and Pugliese, Philip and Dubey, Abhishek and Laszka, Aron},
journal = {ACM Transactions on Internet Technology},
title = {Data-Driven Prediction and Optimization of Energy Use for Transit Fleets of Electric and {ICE} Vehicles},
year = {2020},
contribution = {colab},
tag = {ai4cps,transit},
what = {This paper presents a comprehensive framework for data-driven prediction and optimization of energy consumption in transit fleets. The work integrates vehicle telemetry data, elevation information, weather conditions, and traffic data to build machine learning models for predicting energy consumption at the trip level. The framework includes algorithms for data cleaning, feature engineering, and optimization of vehicle-to-route assignments to minimize energy costs while meeting service constraints.},
why = {Public transit agencies operating mixed fleets of electric and internal combustion vehicles face significant challenges in optimizing operations while reducing environmental impact and operating costs. Accurate prediction of energy consumption is essential for effective vehicle scheduling and fleet management. This work is innovative because it provides an end-to-end framework that integrates real-world data collection, processing, and optimization to support practical decisions about vehicle assignments and fleet operation.},
results = {The framework successfully demonstrates energy consumption prediction for mixed transit fleets using real data from the Chattanooga Area Regional Transportation Authority. Machine learning models including neural networks and decision trees achieve accurate energy predictions that outperform simpler baselines. Results show that the approach can support optimization of vehicle assignments to minimize energy costs while maintaining service levels, providing practical benefits for transit agencies operating electric vehicles.},
keywords = {electric vehicles, energy consumption prediction, transit optimization, machine learning, vehicle scheduling, data-driven optimization},
project_tags = {energy, transit, ML for CPS}
}
Due to the high upfront cost of electric vehicles, many public transit agencies can afford only mixed fleets of internal-combustion and electric vehicles. Optimizing the operation of such mixed fleets is challenging because it requires accurate trip-level predictions of electricity and fuel use as well as efficient algorithms for assigning vehicles to transit routes. We present a novel framework for the data-driven prediction of trip-level energy use for mixed-vehicle transit fleets and for the optimization of vehicle assignments, which we evaluate using data collected from the bus fleet of CARTA, the public transit agency of Chattanooga, TN. We first introduce a data collection, storage, and processing framework for system-level and high-frequency vehicle-level transit data, including domain-specific data cleansing methods. We train and evaluate machine learning models for energy prediction, demonstrating that deep neural networks attain the highest accuracy. Based on these predictions, we formulate the problem of minimizing energy use through assigning vehicles to fixed-route transit trips. We propose an optimal integer program as well as efficient heuristic and meta-heuristic algorithms, demonstrating the scalability and performance of these algorithms numerically using the transit network of CARTA.
@inbook{baptydubeyjanos2020,
author = {Bapty, Ted and Dubey, Abhishek and Sztipanovits, Janos},
chapter = {8},
pages = {161--184},
publisher = {John Wiley \& Sons, Ltd},
title = {Cyber-Physical Vulnerability Analysis of {IoT} Applications Using Multi-Modeling},
year = {2020},
isbn = {9781119593386},
booktitle = {Modeling and Design of Secure Internet of Things},
contribution = {colab},
doi = {10.1002/9781119593386.ch8},
eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/9781119593386.ch8},
keywords = {energy injection, home automation system, IoT-based cyber-physical systems, low-level physical vulnerabilities, multi-modeling approach, vulnerability analysis},
tag = {platform},
url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/9781119593386.ch8}
}
Using the Smart Home as a use case, we examine the vulnerabilities in the system across the technologies used in its implementation. A typical smart home will contain a variety of sensors, actuators (e.g. for opening doors), communication links, storage devices, video cameras, network interfaces, and control units. Each of these physical components and subsystems must be secure in order for the overall system to be secure. Typical security analysis focuses on the defined interfaces of the system: network security via firewalls, communications encryption, and authentication at terminals. Unfortunately, many of these devices in the Internet of Things (IoT) space are susceptible to physical attacks via electromagnetic energy, or other sound/heat energy. Properly designed electromagnetic (EM) waveforms can access a range of vulnerabilities, providing unanticipated entry points into the system. In this chapter, we discuss a multi-modeling methodology for analyzing cyber-physical vulnerabilities, assessing the system across geometry, electronic, and behavioral domains. A home automation system is used as an example, showing a methodology for assessing vulnerabilities in hardware. The example exploits the use of EM energy injection. A multi-modeling of the system captures the geometric structure of the hardware with links to behavioral models. Low-energy EM pathways are discovered that may impact system behavior. Computation is minimized by applying analysis of EM effects only at behavior-critical inputs and outputs. The chapter also discusses a methodology for system-level impact analysis. The final conclusion is that susceptibility to physical layer presents many attack surfaces, due to a large number of heterogeneous IoT devices, mandating consideration of the physical dimensions to vulnerability analysis and risk mitigation.
@inproceedings{barbour2020,
author = {Barbour, W. and Wilbur, M. and Sandoval, R. and Dubey, A. and Work, D. B.},
booktitle = {2020 IEEE Workshop on Design Automation for CPS and IoT (DESTION)},
title = {Streaming computation algorithms for spatiotemporal micromobility service availability},
year = {2020},
pages = {32--38},
contribution = {colab},
doi = {10.1109/DESTION50928.2020.00012},
tag = {transit}
}
Location-based services and fleet management are important components of modern smart cities. However, statistical analysis with large-scale spatiotemporal data in real-time is computationally challenging and can necessitate compromise in accuracy or problem simplification. The main contribution of this work is the presentation of a stream processing approach for real-time monitoring of resource equity in spatially-aware micromobility fleets. The approach makes localized updates to resource availability as needed, instead of batch computation of availability at regular update intervals. We find that the stream processing approach can compute, on average, 62 resource availability updates in the same execution time as a single batch computation. This advantage in processing time makes continuous real-time stream processing equivalent to a batch computation performed every 15 minutes, in terms of algorithm execution time. Since the stream processing approach considers every update to the fleet in real-time, resource availability is always up-to-date and there is no compromise in terms of accuracy.
@inproceedings{barretocyber2020,
author = {Barreto, Carlos and Eghtesad, Taha and Eisele, Scott and Laszka, Aron and Dubey, Abhishek and Koutsoukos, Xenofon},
booktitle = {3rd IEEE International Conference on Industrial Cyber-Physical Systems (ICPS 2020)},
title = {Cyber-Attacks and Mitigation in Blockchain Based Transactive Energy Systems},
year = {2020},
category = {selectiveconference},
contribution = {colab},
keywords = {blockchain, transactive energy systems, cybersecurity, smart contracts, distributed energy resources, market mechanisms, cyberattacks},
project = {cps-reliability},
tag = {decentralization,power},
what = {This paper addresses cyber-attacks and mitigation strategies in blockchain-based transactive energy systems where distributed energy resources participate in peer-to-peer electricity trading. The work identifies vulnerabilities in blockchain-based market mechanisms and proposes attack scenarios that could disrupt market operations. The analysis considers scenarios where attackers can compromise gateway nodes, delay trading bids, and disrupt the blockchain consensus mechanism.},
why = {Blockchain-based transactive energy systems enable decentralized energy trading but introduce new security vulnerabilities and operational risks. Understanding how attackers could exploit these systems is essential for designing robust mechanisms that maintain market integrity even under adversarial conditions. This work is innovative because it systematically analyzes attack vectors specific to blockchain-based energy markets and demonstrates the importance of considering cybersecurity alongside operational efficiency in decentralized energy systems.},
results = {The analysis identifies critical vulnerabilities in blockchain-based transactive energy systems and demonstrates how delays in trading bids and gateway compromise could affect market equilibrium. Results show that certain attack scenarios can increase generator profits while reducing overall market efficiency. The work highlights the need for additional security mechanisms and careful system design to ensure that blockchain-based energy markets remain robust against cyber threats.},
project_tags = {energy, CPS}
}
Power grids are undergoing major changes due to the rapid adoption of intermittent renewable energy resources and the increased availability of energy storage devices. These trends drive smart-grid operators to envision a future where peer-to-peer energy trading occurs within microgrids, leading to the development of Transactive Energy Systems. Blockchains have garnered significant interest from both academia and industry for their potential application in decentralized TES, in large part due to their high level of resilience. In this paper, we introduce a novel class of attacks against blockchain-based TES, which target the gateways that connect market participants to the system. We introduce a general model of blockchain-based TES and study multiple threat models and attack strategies. We also demonstrate the impact of these attacks using a testbed based on GridLAB-D and a private Ethereum network. Finally, we study how to mitigate these attacks.
@inproceedings{basak2020mscpes,
  author       = {Sajan, Kaduvettykunnal and Bariya, Mohini and Basak, Sanchita and Srivastava, Anurag K. and Dubey, Abhishek and von Meier, Alexandra and Biswas, Gautam},
  title        = {Realistic Synchrophasor Data Generation for Anomaly Detection and Event Classification},
  booktitle    = {8th Workshop on Modeling and Simulation of Cyber-Physical Energy Systems, MSCPES@CPSIoTWeek},
  year         = {2020},
  category     = {workshop},
  contribution = {lead},
  project      = {cps-reliability},
  tag          = {platform,power},
  keywords     = {phasor measurement units, anomaly detection, synthetic data, event classification, power systems, data generation},
  what         = {This paper presents methodology for realistic synthetic phasor measurement unit (PMU) data generation to support anomaly detection and event classification in power systems. The work develops techniques for adding realistic noise and bad data effects to simulated measurements, enabling development and testing of anomaly detection algorithms without requiring access to real operational data. The methodology includes procedures for detecting and classifying five categories of events relevant to power grid operations.},
  why          = {Power grid operators increasingly rely on machine learning and data analytics for monitoring and decision support, but validation of these systems requires access to realistic data that includes various anomaly and event scenarios. Access to real operational data is limited due to security and confidentiality constraints. This work is innovative because it enables development of realistic synthetic data that captures the complexity of real power grid measurements while maintaining data privacy, supporting the advancement of anomaly detection and classification capabilities.},
  results      = {The methodology successfully generates realistic synthetic PMU data with configurable noise characteristics and event signatures. Results demonstrate that anomaly detection and event classification algorithms can be validated on synthetic data and achieve performance consistent with expectations. The work shows that the approach enables development and testing of algorithms for detecting anomalies including bad data, missing data, and various power system events, supporting more robust power grid monitoring systems.},
  project_tags = {energy, CPS, ML for CPS}
}
The push to automate and digitize the electric grid has led to widespread installation of Phasor Measurement Units (PMUs) for improved real-time wide-area system monitoring and control. Nevertheless, transforming large volumes of high-resolution PMU measurements into actionable insights remains challenging. A central challenge is creating flexible and scalable online anomaly detection in PMU data streams. PMU data can hold multiple types of anomalies arising in the physical system or the cyber system (measurements and communication networks). Increasing the grid situational awareness for noisy measurement data and Bad Data (BD) anomalies has become more and more significant. A number of machine learning, data analytics and physics-based algorithms have been developed for anomaly detection, but they need to be validated with realistic synchrophasor data. Access to field data is very challenging due to confidentiality and security reasons. This paper presents a method for generating realistic synchrophasor data for the given synthetic network as well as event and bad data detection and classification algorithms. The developed algorithms include Bayesian and change-point techniques to identify anomalies, a statistical approach for event localization and a multi-step clustering approach for event classification. The developed algorithms have been validated with satisfactory results for multiple examples of power system events including faults and load/generator/capacitor variations/switching for an IEEE test system. A set of synchrophasor data will be made available publicly for other researchers.
@inproceedings{chhokrasam2020,
  author       = {Chhokra, Ajay and Mahadevan, Nagabhushan and Dubey, Abhishek and Karsai, Gabor},
  booktitle    = {12th System Analysis and Modelling Conference},
  title        = {Qualitative Fault Modeling in Safety Critical {Cyber Physical Systems}},
  year         = {2020},
  contribution = {minor},
  tag          = {platform},
  what         = {This paper introduces Temporal Causal Diagrams (TCDs), a graph-based qualitative fault modeling formalism for safety-critical cyber-physical systems. TCDs allow system designers to represent faults and their effects in both the physical and cyber sub-systems, including the nominal and faulty operation of protection devices, sensors, and actuators. The paper also defines the fault propagation and execution semantics of TCD models through a translation to timed automata, providing an efficient means to analyze, validate, and verify the fault model.},
  why          = {Protection devices in cyber-physical systems are intended to quickly detect and isolate faulty components, but these devices can themselves have internal faults that cause cascading failures and lead to system collapse. Guaranteeing resiliency therefore requires identifying the root causes of a given system disturbance, which in turn depends on an integrated fault model spanning both the physical and cyber sub-systems. This work is innovative because it provides a single qualitative modeling formalism that captures fault effects across both domains and supports formal analysis via a translation to timed automata.},
  results      = {The paper details the fault propagation and execution semantics of TCD models and shows that translating them to timed automata allows the fault model to be quickly analyzed, validated, and verified. The efficacy of the modeling approach is demonstrated with a case study drawn from the energy systems domain.},
  keywords     = {fault modeling, temporal causal diagrams, cyber-physical systems, fault diagnosis, timed automata, protection devices},
  project_tags = {energy, CPS}
}
One of the key requirements for designing safety critical cyber physical systems (CPS) is to ensure resiliency. Typically, the cyber sub-system in a CPS is empowered with protection devices that quickly detect and isolate faulty components to avoid failures. However, these protection devices can have internal faults that can cause cascading failures, leading to system collapse. Thus, to guarantee the resiliency of the system, it is necessary to identify the root cause(s) of a given system disturbance to take appropriate control actions. Correct failure diagnosis in such systems depends upon an integrated fault model of the system that captures the effect of faults in CPS as well as nominal and faulty operation of protection devices, sensors, and actuators. In this paper, we propose a novel graph-based qualitative fault modeling formalism for CPS, called Temporal Causal Diagrams (TCDs), that allows system designers to effectively represent faults and their effects in both physical and cyber sub-systems. The paper also discusses in detail the fault propagation and execution semantics of a TCD model by translating to timed automata and thus allowing an efficient means to quickly analyze, validate and verify the fault model. In the end, we show the efficacy of the modeling approach with the help of a case study from an energy system.
@article{eisele2020Blockchains,
  author       = {Eisele, Scott and Barreto, Carlos and Dubey, Abhishek and Koutsoukos, Xenofon and Eghtesad, Taha and Laszka, Aron and Mavridou, Anastasia},
  title        = {Blockchains for Transactive Energy Systems: Opportunities, Challenges, and Approaches},
  journal      = {IEEE Computer},
  year         = {2020},
  contribution = {lead},
  tag          = {platform,decentralization,power},
  keywords     = {blockchain, transactive energy systems, smart contracts, decentralized markets, energy trading, peer-to-peer energy systems},
  what         = {This paper examines blockchain technology and smart contracts for transactive energy systems, analyzing opportunities, challenges, and approaches for implementing blockchain-based electricity markets. The work discusses implementation of TRANSAX, a blockchain-based transactive energy system that provides efficient, safe, and privacy-preserving markets built on smart contracts. The analysis covers code complexity, privacy concerns, computational efficiency, integration challenges, and security vulnerabilities specific to blockchain-based energy systems.},
  why          = {Blockchain and distributed ledger technologies enable new possibilities for decentralized energy trading and peer-to-peer electricity markets, but their practical adoption faces significant challenges related to security, efficiency, and integration with existing infrastructure. Understanding these challenges is essential for designing systems that can leverage blockchain benefits while maintaining operational reliability and security. This work is innovative because it systematically analyzes the trade-offs inherent in blockchain-based energy systems and provides practical solutions for addressing key implementation challenges.},
  results      = {The paper presents TRANSAX, a working blockchain-based transactive energy system that demonstrates practical approaches for implementing markets with complex computational requirements. Results show that blockchain-based systems can support transactive energy operations while maintaining system efficiency through hybrid solver patterns and careful smart contract design. The work demonstrates that blockchain technology can enable decentralized energy markets while addressing practical challenges through appropriate architectural choices.},
  project_tags = {energy, CPS, middleware}
}
The emergence of blockchains and smart contracts have renewed interest in electrical cyber-physical systems, especially in the area of transactive energy systems. However, despite recent advances, there remain significant challenges that impede the practical adoption of blockchains in transactive energy systems, which include implementing complex market mechanisms in smart contracts, ensuring safety of the power system, and protecting residential consumers’ privacy. To address these challenges, we present TRANSAX, a blockchain-based transactive energy system that provides an efficient, safe, and privacy-preserving market built on smart contracts. Implementation and deployment of TRANSAX in a verifiably correct and efficient way is based on VeriSolid, a framework for the correct-by-construction development of smart contracts, and RIAPS, a middleware for resilient distributed power systems.
@inproceedings{eisele2020mechanisms,
  author       = {Eisele, Scott and Eghtesad, Taha and Troutman, Nicholas and Laszka, Aron and Dubey, Abhishek},
  booktitle    = {14th {ACM} International Conference on Distributed and Event-Based Systems},
  title        = {Mechanisms for Outsourcing Computation via a Decentralized Market},
  year         = {2020},
  acceptance   = {25.5},
  category     = {selectiveconference},
  contribution = {lead},
  keywords     = {blockchain, decentralized computation, smart contracts, job scheduling, distributed ledger, computation verification},
  tag          = {platform,decentralization},
  what         = {This paper introduces MODiCuM, a blockchain-based distributed ledger system for outsourcing computation with mechanisms to address misbehavior and verify results. The work presents a smart contract-based protocol for job outsourcing that enables verification of computation correctness while keeping verification costs low. The system addresses challenges in decentralized computation by implementing dispute resolution mechanisms and ensuring that resource providers receive payment only for correct results.},
  why          = {Outsourcing computation to distributed resources could enable efficient use of idle computing capacity, but requires mechanisms to verify that results are correct without trusting individual participants. Traditional centralized approaches suffer from single points of failure and may not capture all available computing resources. This work is innovative because it provides a decentralized approach to computation outsourcing using blockchain smart contracts, enabling trustless verification and fair compensation while maintaining computational efficiency.},
  results      = {MODiCuM successfully demonstrates a working decentralized computation marketplace where jobs are posted, matched with resource providers, and results are verified using smart contracts. The system enables efficient verification through random sampling of results and maintains fairness through mechanisms that prevent collusion. Results show that the approach can support computation outsourcing with reasonable efficiency while ensuring that dishonest participants face consequences.},
  project_tags = {middleware, CPS}
}
As the number of personal computing and IoT devices grows rapidly, so does the amount of computational power that is available at the edge. Since many of these devices are often idle, there is a vast amount of computational power that is currently untapped, and which could be used for outsourcing computation. Existing solutions for harnessing this power, such as volunteer computing (e.g., BOINC), are centralized platforms in which a single organization or company can control participation and pricing. By contrast, an open market of computational resources, where resource owners and resource users trade directly with each other, could lead to greater participation and more competitive pricing. To provide an open market, we introduce MODiCuM, a decentralized system for outsourcing computation. MODiCuM deters participants from misbehaving—which is a key problem in decentralized systems—by resolving disputes via dedicated mediators and by imposing enforceable fines. However, unlike other decentralized outsourcing solutions, MODiCuM minimizes computational overhead since it does not require global trust in mediation results. We provide analytical results proving that MODiCuM can deter misbehavior, and we evaluate the overhead of MODiCuM using experimental results based on an implementation of our platform.
@article{GHOSH2020101759,
  author       = {Ghosh, Purboday and Eisele, Scott and Dubey, Abhishek and Metelko, Mary and Madari, Istvan and Volgyesi, Peter and Karsai, Gabor},
  journal      = {Journal of Systems Architecture},
  title        = {Designing a decentralized fault-tolerant software framework for smart grids and its applications},
  year         = {2020},
  issn         = {1383-7621},
  pages        = {101759},
  volume       = {109},
  contribution = {minor},
  doi          = {10.1016/j.sysarc.2020.101759},
  keywords     = {Component, Fault tolerance, Distributed systems, Smart grid},
  tag          = {platform},
  url          = {http://www.sciencedirect.com/science/article/pii/S1383762120300539}
}
The vision of the ‘Smart Grid’ anticipates a distributed real-time embedded system that implements various monitoring and control functions. As the reliability of the power grid is critical to modern society, the software supporting the grid must support fault tolerance and resilience of the resulting cyber-physical system. This paper describes the fault-tolerance features of a software framework called Resilient Information Architecture Platform for Smart Grid (RIAPS). The framework supports various mechanisms for fault detection and mitigation and works in concert with the applications that implement the grid-specific functions. The paper discusses the design philosophy for and the implementation of the fault tolerance features and presents an application example to show how it can be used to build highly resilient systems.
@inproceedings{haophm2020,
  author       = {Tu, Hao and Lukic, Srdjan and Dubey, Abhishek and Karsai, Gabor},
  booktitle    = {Annual Conference of the PHM Society},
  title        = {An {LSTM}-Based Online Prediction Method for Building Electric Load During {COVID-19}},
  year         = {2020},
  contribution = {minor},
  tag          = {ai4cps,power},
  what         = {This paper presents an LSTM-based online prediction method for building electric load during COVID-19 that adapts to concept changes in energy consumption patterns. The approach uses online learning with adaptive learning rates to maintain model accuracy when building energy use patterns change fundamentally due to operational changes. The methodology includes ensemble approaches with multiple models at different learning rates to enable robust predictions despite changing consumption patterns.},
  why          = {Building energy consumption patterns change significantly in response to occupancy changes and operational modifications, making historical models ineffective when such changes occur. Traditional offline models cannot adapt to these concept changes, leading to poor predictions during transition periods. This work is innovative because it provides an online learning approach that enables energy prediction systems to adapt in real-time to concept changes, improving prediction accuracy and enabling better building management during periods of operational change.},
  results      = {The LSTM-based approach successfully predicts building electric load and adapts to changes in energy consumption patterns caused by COVID-19 building closures and reopenings. Results show that online learning with adaptive learning rates improves prediction accuracy compared to fixed-rate online learning, achieving lower prediction errors as the system adapts to new patterns. The approach enables building management systems to maintain accurate energy predictions despite changing operational conditions.},
  keywords     = {building energy prediction, LSTM, online learning, concept drift, adaptive learning, smart buildings},
  project_tags = {energy, ML for CPS}
}
Accurate prediction of electric load is critical to optimally controlling and operating buildings. It provides the opportunities to reduce building energy consumption and to implement advanced functionalities such as demand response in the context of smart grid. However, buildings are nonstationary and it is important to consider the underlying concept changes that will affect the load pattern. In this paper we present an online learning method for predicting building electric load during concept changes such as COVID-19. The proposed methods is based on online Long Short-Term Memory (LSTM) recurrent neural network. To speed up the learning process during concept changes and improve prediction accuracy, an ensemble of multiple models with different learning rates is used. The learning rates are updated in realtime to best adapt to the new concept while maintaining the learned information for the prediction.
@article{Hasan2020,
  author       = {Hasan, Saqib and Dubey, Abhishek and Karsai, Gabor and Koutsoukos, Xenofon},
  journal      = {International Journal of Electrical Power \& Energy Systems},
  title        = {A game-theoretic approach for power systems defense against dynamic cyber-attacks},
  year         = {2020},
  issn         = {0142-0615},
  volume       = {115},
  contribution = {colab},
  doi          = {10.1016/j.ijepes.2019.105432},
  file         = {:Hasan2020-A_Game_Theoretic_Approach_for_Power_Systems_Defense_against_Dynamic_Cyber_Attacks.pdf:PDF},
  keywords     = {power systems, cybersecurity, game theory, dynamic attacks, resilience, critical infrastructure protection},
  project      = {cps-reliability},
  tag          = {platform,power},
  url          = {http://www.sciencedirect.com/science/article/pii/S0142061519302807},
  what         = {This paper presents a game-theoretic approach for power system defense against dynamic cyber-attacks where attackers can target multiple substations at different times. The work develops both static and dynamic attack models and provides efficient algorithms for identifying worst-case attacks and optimal defense strategies. The methodology uses game theory to model the interaction between attackers and defenders, enabling strategic identification of critical substations to protect.},
  why          = {Power systems face emerging threats from dynamic cyber-attacks that can target multiple components at different times to maximize damage while remaining within resource constraints. Static defense approaches are insufficient for protecting against strategically sequenced attacks. This work is innovative because it applies game-theoretic analysis to power system cybersecurity, enabling defenders to anticipate worst-case attack scenarios and identify optimal protection strategies that minimize overall system damage.},
  results      = {The game-theoretic analysis identifies worst-case dynamic attacks and optimal defense strategies for IEEE test systems. Results show that dynamic attacks can cause greater damage than static attacks through strategic timing and component selection. The work provides algorithms for defenders to identify critical substations to protect based on attack budget constraints, enabling more strategic allocation of limited security resources.},
  project_tags = {energy, CPS, planning}
}
Technological advancements in today’s electrical grids give rise to new vulnerabilities and increase the potential attack surface for cyber-attacks that can severely affect the resilience of the grid. Cyber-attacks are increasing both in number as well as sophistication and these attacks can be strategically organized in chronological order (dynamic attacks), where they can be instantiated at different time instants. The chronological order of attacks enables us to uncover those attack combinations that can cause severe system damage but this concept remained unexplored due to the lack of dynamic attack models. Motivated by the idea, we consider a game-theoretic approach to design a new attacker-defender model for power systems. Here, the attacker can strategically identify the chronological order in which the critical substations and their protection assemblies can be attacked in order to maximize the overall system damage. However, the defender can intelligently identify the critical substations to protect such that the system damage can be minimized. We apply the developed algorithms to the IEEE-39 and 57 bus systems with finite attacker/defender budgets. Our results show the effectiveness of these models in improving the system resilience under dynamic attacks.
@inproceedings{m4iot2020,
  author       = {Kang, Zhuangwei and Canady, Robert and Dubey, Abhishek and Gokhale, Aniruddha and Shekhar, Shashank and Sedlacek, Matous},
  booktitle    = {Proceedings of the 7th Workshop on Middleware and Applications for the Internet of Things, M4IoT@Middleware},
  title        = {A Study of Publish/Subscribe Middleware Under Different {IoT} Traffic Conditions},
  year         = {2020},
  contribution = {minor},
  tag          = {platform},
  what         = {This paper presents a study of publish/subscribe middleware platforms including DDS, MQTT, and ZeroMQ under various IoT traffic conditions. The work defines QoS properties relevant to IoT applications and systematically evaluates middleware performance across three representative use cases including high-frequency data flows, periodic data flows, and sporadic data flows. The analysis provides practical guidance on middleware selection for different IoT applications.},
  why          = {IoT applications have diverse and demanding requirements for data dissemination, but middleware platforms are often selected based on general reputation rather than systematic evaluation for specific use cases. Understanding how different middleware platforms perform under realistic IoT traffic conditions is essential for making effective technology choices. This work is innovative because it provides systematic, quantitative comparison of middleware platforms under realistic IoT scenarios, enabling practitioners to make informed decisions about technology selection.},
  results      = {The evaluation shows that different middleware platforms excel under different traffic conditions, with DDS providing the most stable performance for most scenarios but higher latency than alternatives. Results demonstrate that middleware selection should consider specific QoS requirements and traffic patterns rather than assuming universal platform superiority. The work provides practical guidance for IoT system designers on matching middleware platforms to application requirements.},
  keywords     = {publish/subscribe middleware, IoT, quality of service, distributed systems, MQTT, DDS, ZeroMQ},
  project_tags = {middleware, CPS}
}
Publish/Subscribe (pub/sub) semantics are critical for IoT applications due to their loosely coupled nature. Although OMG DDS, MQTT, and ZeroMQ are mature pub/sub solutions used for IoT, prior studies show that their performance varies significantly under different load conditions and QoS configurations, which makes middleware selection and configuration decisions hard. Moreover, the load conditions and role of QoS settings in prior comparison studies are not comprehensive and well-documented. To address these limitations, we (1) propose a set of performance-related properties for pub/sub middleware and investigate their support in DDS, MQTT, and ZeroMQ; (2) perform systematic experiments under three representative, lab-based real-world IoT use cases; and (3) improve DDS performance by applying three of our proposed QoS properties. Empirical results show that DDS has the most thorough QoS support, and more reliable performance in most scenarios. In addition, its Multicast, TurboMode, and AutoThrottle QoS policies can effectively improve DDS performance in terms of throughput and latency.
@article{nannapaneni2020online,
  author       = {Nannapaneni, Saideep and Mahadevan, Sankaran and Dubey, Abhishek and Lee, Yung-Tsun Tina},
  journal      = {Journal of Intelligent Manufacturing},
  title        = {Online monitoring and control of a cyber-physical manufacturing process under uncertainty},
  year         = {2020},
  pages        = {1--16},
  contribution = {minor},
  doi          = {10.1007/s10845-020-01609-7},
  publisher    = {Springer},
  tag          = {platform}
}
Recent technological advancements in computing, sensing and communication have led to the development of cyber-physical manufacturing processes, where a computing subsystem monitors the manufacturing process performance in real-time by analyzing sensor data and implements the necessary control to improve the product quality. This paper develops a predictive control framework where control actions are implemented after predicting the state of the manufacturing process or product quality at a future time using process models. In a cyber-physical manufacturing process, the product quality predictions may be affected by uncertainty sources from the computing subsystem (resource and communication uncertainty), manufacturing process (input uncertainty, process variability and modeling errors), and sensors (measurement uncertainty). In addition, due to the continuous interactions between the computing subsystem and the manufacturing process, these uncertainty sources may aggregate and compound over time. In some cases, some process parameters needed for model predictions may not be precisely known and may need to be derived from real time sensor data. This paper develops a dynamic Bayesian network approach, which enables the aggregation of multiple uncertainty sources, parameter estimation and robust prediction for online control. As the number of process parameters increase, their estimation using sensor data in real-time can be computationally expensive. To facilitate real-time analysis, variance-based global sensitivity analysis is used for dimension reduction. The proposed methodology of online monitoring and control under uncertainty, and dimension reduction, are illustrated for a cyber-physical turning process.
@article{SHEKHAR2020101710,
  author       = {Shekhar, Shashank and Chhokra, Ajay and Sun, Hongyang and Gokhale, Aniruddha and Dubey, Abhishek and Koutsoukos, Xenofon and Karsai, Gabor},
  journal      = {Journal of Systems Architecture},
  title        = {{URMILA}: Dynamically Trading-off Fog and Edge Resources for Performance and Mobility-Aware {IoT} Services},
  year         = {2020},
  contribution = {colab},
  issn         = {1383-7621},
  doi          = {10.1016/j.sysarc.2020.101710},
  keywords     = {Fog/Edge Computing, User Mobility, Latency-sensitive IoT Services, Resource Management, middleware, performance},
  project      = {cps-middleware},
  tag          = {platform,transit},
  url          = {http://www.sciencedirect.com/science/article/pii/S1383762120300047}
}
The fog/edge computing paradigm is increasingly being adopted to support a range of latency-sensitive IoT services due to its ability to assure the latency requirements of these services while supporting the elastic properties of cloud computing. IoT services that cater to user mobility, however, face a number of challenges in this context. First, since user mobility can incur wireless connectivity issues, executing these services entirely on edge resources, such as smartphones, will result in a rapid drain in the battery charge. In contrast, executing these services entirely on fog resources, such as cloudlets or micro data centers, will incur higher communication costs and increased latencies in the face of fluctuating wireless connectivity and signal strength. Second, a high degree of multi-tenancy on fog resources involving different IoT services can lead to performance interference issues due to resource contention. In order to address these challenges, this paper describes URMILA, which makes dynamic resource management decisions to achieve effective trade-offs between using the fog and edge resources yet ensuring that the latency requirements of the IoT services are met. We evaluate URMILA’s capabilities in the context of a real-world use case on an emulated but realistic IoT testbed.
@inproceedings{Pettet2020,
  author       = {Pettet, Geoffrey and Mukhopadhyay, Ayan and Kochenderfer, Mykel and Vorobeychik, Yevgeniy and Dubey, Abhishek},
  title        = {On Algorithmic Decision Procedures in Emergency Response Systems in Smart and Connected Communities},
  booktitle    = {Proceedings of the 19th Conference on Autonomous Agents and MultiAgent Systems, {AAMAS} 2020, Auckland, New Zealand},
  year         = {2020},
  acceptance   = {23},
  category     = {selectiveconference},
  contribution = {lead},
  project      = {smart-emergency-response,smart-cities},
  tag          = {ai4cps, decentralization,incident},
  timestamp    = {Wed, 17 Jan 2020 07:24:00 +0200},
  keywords     = {emergency response, decision procedures, resource allocation, algorithmic planning, smart cities, multi-objective optimization},
  what         = {This paper presents algorithmic decision procedures for emergency response management in smart cities, addressing the problem of optimal incident response under constraints of limited resources and communication disruptions. The work develops both greedy and Monte Carlo Tree Search approaches for dynamically rebalancing emergency responders in response to changing incident patterns. The methodology addresses the tension between minimizing immediate response times and maintaining overall system efficiency.},
  why          = {Emergency response systems must make decisions about resource allocation in real-time with incomplete information about incident severity and responder availability. Traditional approaches often focus on minimizing response time for individual incidents without considering overall system efficiency or the need to dynamically rebalance resources. This work is innovative because it provides algorithmic approaches for emergency response that optimize over multiple objectives and adapt dynamically to changing incident patterns, enabling more efficient and effective emergency management.},
  results      = {The algorithmic approaches successfully identify optimal responder allocations and demonstrate that dynamic rebalancing strategies can significantly reduce average response times compared to greedy approaches. Monte Carlo Tree Search provides more sophisticated decision-making by considering future incident probabilities, while greedy approaches offer computational efficiency. Results show that the approach enables emergency response systems to balance immediate response needs with longer-term system efficiency.},
  project_tags = {emergency, planning, scalable AI}
}
Emergency Response Management (ERM) is a critical problem faced by communities across the globe. Despite its importance, it is common for ERM systems to follow myopic and straightforward decision policies in the real world. Principled approaches to aid decision-making under uncertainty have been explored in this context but have failed to be accepted into real systems. We identify a key issue impeding their adoption — algorithmic approaches to emergency response focus on reactive, post-incident dispatching actions, i.e. optimally dispatching a responder after incidents occur. However, the critical nature of emergency response dictates that when an incident occurs, first responders always dispatch the closest available responder to the incident. We argue that the crucial period of planning for ERM systems is not post-incident, but between incidents. However, this is not a trivial planning problem — a major challenge with dynamically balancing the spatial distribution of responders is the complexity of the problem. An orthogonal problem in ERM systems is to plan under limited communication, which is particularly important in disaster scenarios that affect communication networks. We address both the problems by proposing two partially decentralized multi-agent planning algorithms that utilize heuristics and the structure of the dispatch problem. We evaluate our proposed approach using real-world data, and find that in several contexts, dynamically re-balancing the spatial distribution of emergency responders reduces both the average response time as well as its variance.
@inproceedings{pettetisgt2020,
author = {Pettet, Geoffrey and Ghosal, Malini and Mahserejian, Shant and Davis, Sarah and Sridhar, Siddharth and Dubey, Abhishek and Meyer, Michael},
booktitle = {2020 IEEE Power \& Energy Society Innovative Smart Grid Technologies Conference (ISGT)},
title = {A Decision Support Framework for Grid-Aware Electric Bus Charge Scheduling},
year = {2020},
organization = {IEEE},
contribution = {colab},
tag = {ai4cps,power},
what = {This paper presents a decision support framework for electric bus charge scheduling that integrates traffic and power grid models to optimize charging decisions. The framework uses a Markov Decision Process to model the bus charging problem, considering both operational costs and power grid impacts. The system was evaluated on the Tri-Cities transit network in Washington using detailed traffic simulation and power grid models to assess real-world applicability. The approach provides both offline planning and online decision-making capabilities for fleet management.},
why = {Electric bus fleets face a critical challenge in coordinating charging decisions to minimize costs while maintaining grid reliability and schedule adherence. This work is innovative because it bridges transportation and energy systems by jointly optimizing for both domains, rather than treating them separately. The decision support framework enables transit authorities to make informed scheduling choices that balance multiple competing objectives in a unified manner.},
results = {The framework achieved \$860 in optimization cost, which is \$50 lower than a greedy charging policy for the test scenario. The power grid impact metric was 376, significantly better than the greedy approach's score of 362. When scaled to the full 75-bus Richland transit system, results suggest potential savings of over \$100k per year without requiring grid infrastructure upgrades. The framework demonstrated that considering grid constraints during charge scheduling can substantially reduce operational costs.},
keywords = {electric buses, charge scheduling, grid-aware decision support, Markov decision process, transportation-energy integration, traffic simulation},
project_tags = {transit, energy, planning, scalable AI}
}
While there are many advantages to electric public transit vehicles, they also pose new challenges for fleet operators. One key challenge is defining a charge scheduling policy that minimizes operating costs and power grid disruptions while maintaining schedule adherence. An uncoordinated policy could result in buses running out of charge before completing their trip, while a grid agnostic policy might incur higher energy costs or cause an adverse impact on the grid’s distribution system. We present a grid aware decision-theoretic framework for electric bus charge scheduling that accounts for energy price and grid load. The framework co-simulates models for traffic (Simulation of Urban Mobility) and the electric grid (GridLAB-D), which are used by a decision-theoretic planner to evaluate charging decisions with regard to their long-term effect on grid reliability and cost. We evaluated the framework on a simulation of Richland, WA’s bus and grid network, and found that it could save over $100k per year on operating costs for the city compared to greedy methods.
@inproceedings{Potteiger2020,
author = {{Potteiger}, B. and {Cai}, F. and {Dubey}, A. and {Koutsoukos}, X. and {Zhang}, Z.},
booktitle = {2020 IEEE 23rd International Symposium on Real-Time Distributed Computing (ISORC)},
title = {Security in Mixed Time and Event Triggered Cyber-Physical Systems using Moving Target Defense},
year = {2020},
pages = {89--97},
contribution = {minor},
doi = {10.1109/ISORC49007.2020.00022},
tag = {platform}
}
Memory corruption attacks such as code injection, code reuse, and non-control data attacks have become widely popular for compromising safety-critical Cyber-Physical Systems (CPS). Moving target defense (MTD) techniques such as instruction set randomization (ISR), address space randomization (ASR), and data space randomization (DSR) can be used to protect systems against such attacks. CPS often use time-triggered architectures to guarantee predictable and reliable operation. MTD techniques can cause time delays with unpredictable behavior. To protect CPS against memory corruption attacks, MTD techniques can be implemented in a mixed time and event-triggered architecture that provides capabilities for maintaining safety and availability during an attack. This paper presents a mixed time and event-triggered MTD security approach based on the ARINC 653 architecture that provides predictable and reliable operation during normal operation and rapid detection and reconfiguration upon detection of attacks. We leverage a hardware-in-the-loop testbed and an advanced emergency braking system (AEBS) case study to show the effectiveness of our approach.
@article{ramakrishna2020dynamic,
author = {Ramakrishna, Shreyas and Hartsell, Charles and Burruss, Matthew P. and Karsai, Gabor and Dubey, Abhishek},
journal = {Journal of Systems Architecture},
title = {Dynamic-weighted simplex strategy for learning enabled cyber physical systems},
year = {2020},
issn = {1383-7621},
pages = {101760},
volume = {111},
contribution = {lead},
doi = {10.1016/j.sysarc.2020.101760},
keywords = {learning-enabled components, simplex architecture, autonomous vehicles, reinforcement learning, safety assurance, control synthesis},
tag = {ai4cps},
url = {https://www.sciencedirect.com/science/article/pii/S1383762120300540},
what = {This paper introduces a dynamic-weighted simplex strategy for learning-enabled cyber-physical systems that use Learning Enabled Components for autonomous control. The approach extends the classical Simplex Architecture by incorporating reinforcement learning to dynamically weight controller outputs, allowing smooth transitions between an advanced high-performance controller and a safe baseline controller. The framework was demonstrated on a DeepNNCar autonomous vehicle platform with real-time performance monitoring and resource management.},
why = {Learning-enabled components offer impressive capabilities but present challenges in ensuring system safety and correctness under all circumstances. This work is innovative because it combines formal safety assurance from the Simplex Architecture with machine learning optimization, enabling systems to leverage high-performance learned components while maintaining provable safety guarantees. The dynamic weighting mechanism allows gradual transitions that avoid abrupt control switches.},
results = {The dynamic-weighted simplex strategy achieved 60% fewer out-of-track soft constraint violations compared to the original LEC-driven system while demonstrating higher optimized speed performance of 0.4 m/s during indoor driving. The approach successfully reduced computational overhead by avoiding the full complexity of the advanced controller when not needed. The framework proved effective at balancing safety and performance in real-world autonomous driving scenarios.},
project_tags = {CPS, ML for CPS, Explainable AI}
}
Cyber Physical Systems (CPS) have increasingly started using Learning Enabled Components (LECs) for performing perception-based control tasks. The simple design approach, and their capability to continuously learn has led to their widespread use in different autonomous applications. Despite their simplicity and impressive capabilities, these components are difficult to assure, which makes their use challenging. The problem of assuring CPS with untrusted controllers has been achieved using the Simplex Architecture. This architecture integrates the system to be assured with a safe controller and provides a decision logic to switch between the decisions of these controllers. However, the key challenges in using the Simplex Architecture are: (1) designing an effective decision logic, and (2) sudden transitions between controller decisions lead to inconsistent system performance. To address these research challenges, we make three key contributions: (1) dynamic-weighted simplex strategy – we introduce “weighted simplex strategy” as the weighted ensemble extension of the classical Simplex Architecture. We then provide a reinforcement learning based mechanism to find dynamic ensemble weights, (2) middleware framework – we design a framework that allows the use of the dynamic-weighted simplex strategy, and provides a resource manager to monitor the computational resources, and (3) hardware testbed – we design a remote-controlled car testbed called DeepNNCar to test and demonstrate the aforementioned key concepts. Using the hardware, we show that the dynamic-weighted simplex strategy has 60% fewer out-of-track occurrences (soft constraint violations), while demonstrating higher optimized speed (performance) of 0.4 m/s during indoor driving than the original LEC driven system.
@inproceedings{ramakrishna2020methodology,
  author        = {Ramakrishna, Shreyas and Hartsell, Charles and Dubey, Abhishek and Pal, Partha and Karsai, Gabor},
  title         = {A Methodology for Automating Assurance Case Generation},
  booktitle     = {Thirteenth International Tools and Methods of Competitive Engineering Symposium (TMCE 2020)},
  year          = {2020},
  eprint        = {2003.05388},
  archiveprefix = {arXiv},
  primaryclass  = {cs.RO},
  preprint      = {https://arxiv.org/abs/2003.05388},
  contribution  = {minor},
  tag           = {ai4cps}
}
Safety Case has become an integral component for safety-certification in various Cyber Physical System domains including automotive, aviation, medical devices, and military. The certification processes for these systems are stringent and require robust safety assurance arguments and substantial evidence backing. Despite the strict requirements, current practices still rely on manual methods that are brittle, do not have a systematic approach or thorough consideration of sound arguments. In addition, stringent certification requirements and ever-increasing system complexity make ad-hoc, manual assurance case generation (ACG) inefficient, time consuming, and expensive. To improve the current state of practice, we introduce a structured ACG tool which uses system design artifacts, accumulated evidence, and developer expertise to construct a safety case and evaluate it in an automated manner. We also illustrate the applicability of the ACG tool on a remote-control car testbed case study.
@article{riaps2020,
author = {{Tu}, H. and {Du}, Y. and {Yu}, H. and {Dubey}, Abhishek and {Lukic}, S. and {Karsai}, G.},
journal = {IEEE Transactions on Industrial Electronics},
title = {Resilient Information Architecture Platform for the Smart Grid: A Novel Open-Source Platform for Microgrid Control},
year = {2020},
number = {11},
pages = {9393--9404},
volume = {67},
contribution = {colab},
tag = {platform},
what = {This paper presents RIAPS, a novel open-source platform for implementing and deploying distributed microgrid control systems. RIAPS provides a component-based architecture with hardware-agnostic deployment capabilities, hard real-time features through Linux PREEMPT_RT, comprehensive communication frameworks, and high-precision time synchronization. The platform supports IEEE 2030.7 standard microgrid functions and includes tools for developing, testing, and deploying real-time distributed control algorithms.},
why = {Distributed microgrid control systems are critical for modern power distribution but face challenges in bridging the gap between simulation and real-world deployment. RIAPS is innovative because it provides a unified open-source platform that eliminates the need for proprietary solutions while supporting practical requirements such as real-time synchronization, fault tolerance, and standardized communication protocols. The platform enables rapid prototyping and deployment of complex distributed algorithms on real hardware.},
results = {RIAPS successfully demonstrated distributed secondary control capable of synchronizing and proportionally compensating system voltage unbalance caused by unbalanced loads in microgrid scenarios. The platform achieved high-precision time synchronization with minimal jitter and latency, enabling coordinated control across multiple distributed computing nodes. The system demonstrated the key features of the platform including node synchronization, implementation of distributed consensus algorithms, and fault-detection and recovery mechanisms.},
keywords = {microgrid control, distributed systems, real-time systems, IEEE 2030.7, platform architecture, deployment services},
project_tags = {energy, CPS, middleware, scalable AI}
}
Microgrids are seen as an effective way to achieve reliable, resilient, and efficient operation of the power distribution system. Core functions of the microgrid control system are defined by the IEEE Standard 2030.7; however, the algorithms that realize these functions are not standardized, and are a topic of research. Furthermore, the corresponding controller hardware, operating system, and communication system to implement these functions vary significantly from one implementation to the next. In this article, we introduce an open-source platform, resilient information architecture platform for the smart grid (RIAPS), ideally suited for implementing and deploying distributed microgrid control algorithms. RIAPS provides a design-time tool suite for development and deployment of distributed microgrid control algorithms. With support from a number of run-time platform services, developed algorithms can be easily implemented and deployed into real microgrids. To demonstrate the unique features of RIAPS, we propose and implement a distributed microgrid secondary control algorithm capable of synchronized and proportional compensation of voltage unbalance using distributed generators. Test results show the effectiveness of the proposed control and the salient features of the RIAPS platform.
@inproceedings{rsuicfc2020,
  author       = {Talusan, Jose Paolo and Wilbur, Michael and Dubey, Abhishek and Yasumoto, Keiichi},
  title        = {On Decentralized Route Planning Using the Road Side Units as Computing Resources},
  booktitle    = {2020 IEEE International Conference on Fog Computing (ICFC)},
  year         = {2020},
  organization = {IEEE},
  category     = {selectiveconference},
  contribution = {colab},
  keywords     = {decentralized routing, edge computing, road side units, privacy-preserving, smart cities, task allocation},
  tag          = {decentralization,transit},
  what         = {This paper proposes a decentralized route planning service using Road Side Units as computing resources to provide real-time route planning while preserving privacy in smart cities. The system divides the city into grids with RSUs responsible for route planning tasks in their geographic areas, avoiding centralized cloud dependencies. The approach includes algorithms for task allocation and decentralized route planning that account for communication latency and model accuracy when assigning queries to RSUs.},
  why          = {Centralized route planning services expose cities to data privacy risks and latency issues, particularly during emergencies when reliable low-latency service is critical. This work is innovative because it leverages distributed edge computing infrastructure to provide privacy-preserving, low-latency route planning by pushing computation to the network edge. The decentralized design improves resilience and enables timely service delivery without relying on distant cloud data centers.},
  results      = {The system demonstrated the ability to respond to 1000 queries with only 5-7.5% accuracy loss compared to optimal centralized grid allocation when varying neighbor levels. By using neighbor RSUs with controlled accuracy tradeoffs, the approach achieved 30% decrease in processing time while maintaining model accuracy of 99% or higher. The system showed effective task allocation optimization that balances response latency with model accuracy across distributed edge devices.},
  project_tags = {transit, planning, middleware, scalable AI}
}
Residents in cities typically use third-party platforms such as Google Maps for route planning services. While providing near real-time processing, these state of the art centralized deployments are limited to multiprocessing environments in data centers. This raises privacy concerns, increases risk for critical data and causes vulnerability to network failure. In this paper, we propose to use decentralized road side units (RSU) (owned by the city) to perform route planning. We divide the city road network into grids, each assigned an RSU where traffic data is kept locally, increasing security and resiliency such that the system can perform even if some RSUs fail. Route generation is done in two steps. First, an optimal grid sequence is generated, prioritizing shortest path calculation accuracy but not RSU load. Second, we assign route planning tasks to the grids in the sequence. Keeping in mind RSU load and constraints, tasks can be allocated and executed in any non-optimal grid but with lower accuracy. We evaluate this system using Metropolitan Nashville road traffic data. We divided the area into 500 grids, configuring load and neighborhood sizes to meet delay constraints while maximizing model accuracy. The results show that there is a 30 percent decrease in processing time with a decrease in model accuracy of 99 percent to 92.3 percent, by simply increasing the search area to the optimal grid’s immediate neighborhood.
@article{SHEKHAR2020101711,
author = {Shekhar, Shashank and Chhokra, Ajay and Sun, Hongyang and Gokhale, Aniruddha and Dubey, Abhishek and Koutsoukos, Xenofon and Karsai, Gabor},
journal = {Journal of Systems Architecture},
title = {URMILA: Dynamically Trading-off Fog and Edge Resources for Performance and Mobility-Aware IoT Services},
year = {2020},
issn = {1383-7621},
contribution = {colab},
doi = {10.1016/j.sysarc.2020.101710},
internal-note = {NOTE(review): DOI ends in 101710 but the citation key says 101711 -- verify which is correct; volume and article-number/pages fields are also missing},
keywords = {Fog/Edge Computing, User Mobility, Latency-sensitive IoT Services, Resource Management, middleware, performance},
project = {cps-middleware},
tag = {platform,transit},
url = {http://www.sciencedirect.com/science/article/pii/S1383762120300047}
}
The fog/edge computing paradigm is increasingly being adopted to support a range of latency-sensitive IoT services due to its ability to assure the latency requirements of these services while supporting the elastic properties of cloud computing. IoT services that cater to user mobility, however, face a number of challenges in this context. First, since user mobility can incur wireless connectivity issues, executing these services entirely on edge resources, such as smartphones, will result in a rapid drain in the battery charge. In contrast, executing these services entirely on fog resources, such as cloudlets or micro data centers, will incur higher communication costs and increased latencies in the face of fluctuating wireless connectivity and signal strength. Second, a high degree of multi-tenancy on fog resources involving different IoT services can lead to performance interference issues due to resource contention. In order to address these challenges, this paper describes URMILA, which makes dynamic resource management decisions to achieve effective trade-offs between using the fog and edge resources yet ensuring that the latency requirements of the IoT services are met. We evaluate URMILA’s capabilities in the context of a real-world use case on an emulated but realistic IoT testbed.
@inproceedings{wilbur2020decentralized,
author = {Wilbur, Michael and Samal, Chinmaya and Talusan, Jose Paolo and Yasumoto, Keiichi and Dubey, Abhishek},
booktitle = {2020 IEEE 23rd International Symposium on Real-Time Distributed Computing (ISORC)},
title = {Time-dependent Decentralized Routing using Federated Learning},
year = {2020},
organization = {IEEE},
contribution = {lead},
tag = {decentralization,transit},
what = {This paper presents a decentralized time-dependent routing approach using federated learning on private fog networks where RSUs collaboratively learn shared prediction models. The system enables route planning without relying on centralized cloud services by leveraging federated learning to collaboratively train models that predict travel times and select optimal routes. All training occurs locally on RSUs with only model weights shared to the central server, avoiding raw data transmission.},
why = {Centralized route planning services face latency and availability concerns in disaster scenarios, while federated learning enables distributed model training without centralizing sensitive data. This work is innovative because it combines decentralized routing with federated learning to create privacy-preserving, resilient route planning that relies on edge computing. The approach maintains data locality while enabling collaborative learning across the distributed network.},
results = {The federated learning approach achieved comparable latency and memory efficiency to centralized methods while reducing dependence on cloud infrastructure and protecting data privacy. The system successfully demonstrated time-dependent routing in a simulated Nashville metropolitan area with multiple RSUs. The framework proved effective at learning shared prediction models that improved query response time while maintaining acceptable accuracy loss due to distributed computation.},
keywords = {federated learning, decentralized routing, time-dependent networks, fog computing, privacy-preserving, collaborative learning},
project_tags = {transit, middleware, scalable AI}
}
Recent advancements in cloud computing have driven rapid development in data-intensive smart city applications by providing near real time processing and storage scalability. This has resulted in efficient centralized route planning services such as Google Maps, upon which millions of users rely. Route planning algorithms have progressed in line with the cloud environments in which they run. Current state of the art solutions assume a shared memory model, hence deployment is limited to multiprocessing environments in data centers. By centralizing these services, latency has become the limiting parameter in the technologies of the future, such as autonomous cars. Additionally, these services require access to outside networks, raising availability concerns in disaster scenarios. Therefore, this paper provides a decentralized route planning approach for private fog networks. We leverage recent advances in federated learning to collaboratively learn shared prediction models online and investigate our approach with a simulated case study from a mid-size U.S. city.
@article{wilburaccess2020,
author = {Talusan, Jose Paolo and Wilbur, Michael and Dubey, Abhishek and Yasumoto, Keiichi},
journal = {IEEE Access},
title = {Route Planning Through Distributed Computing by Road Side Units},
year = {2020},
pages = {176134--176148},
volume = {8},
contribution = {minor},
tag = {decentralization,transit}
}
Cities are embracing data-intensive applications to maximize their constrained transportation networks. Platforms such as Google offer route planning services to mitigate the effect of traffic congestion. These use remote servers that require an Internet connection, which exposes data to increased risk of network failures and latency issues. Edge computing, an alternative to centralized architectures, offers computational power at the edge that could be used for similar services. Road side units (RSU), Internet of Things (IoT) devices within a city, offer an opportunity to offload computation to the edge. To provide an environment for processing on RSUs, we introduce RSU-Edge, a distributed edge computing system for RSUs. We design and develop a decentralized route planning service over RSU-Edge. In the service, the city is divided into grids and assigned an RSU. Users send trip queries to the service and obtain routes. For maximum accuracy, tasks must be allocated to optimal RSUs. However, this overloads RSUs, increasing delay. To reduce delays, tasks may be reallocated from overloaded RSUs to its neighbors. The distance between the optimal and actual allocation causes accuracy loss due to stale data. The problem is identifying the most efficient allocation of tasks such that response constraints are met while maintaining acceptable accuracy. We created the system and present an analysis of a case study in Nashville, Tennessee that shows the effect of our algorithm on route accuracy and query response, given varying neighbor levels. We find that our system can respond to 1000 queries up to 57.17% faster, with only a model accuracy loss of 5.57% to 7.25% compared to using only optimal grid allocation.
@inproceedings{Basak2019b,
author = {Basak, Sanchita and Aman, Afiya and Laszka, Aron and Dubey, Abhishek and Leao, Bruno},
booktitle = {Proceedings of the 11th Annual Conference of the Prognostics and Health Management Society (PHM)},
title = {Data-Driven Detection of Anomalies and Cascading Failures in Traffic Networks},
year = {2019},
month = oct,
attachments = {https://www.isis.vanderbilt.edu/sites/default/files/PHM_traffic_cascades_paper.pdf},
category = {conference},
contribution = {lead},
doi = {10.36001/phmconf.2019.v11i1.861},
file = {:Basak2019b-Data_Driven_Detection_of_Anomalies_and_Cascading_Failures_in_Traffic_Networks.pdf:PDF},
keywords = {anomaly detection, cascading failures, traffic networks, LSTM networks, data-driven methods, congestion forecasting},
project = {smart-transit,smart-cities,cps-reliability},
tag = {ai4cps,transit},
what = {This paper develops methods for detecting anomalies and cascading failures in traffic networks using a combination of model-driven and data-driven approaches. The work builds LSTM-based traffic speed predictors for each road segment while using Gaussian Process Regression as a comparison baseline. A Timed Failure Propagation Graph is formulated to identify root causes of congestion and trace how failures cascade through the network.},
why = {Understanding cascading failures in traffic networks is critical for mitigating widespread disruptions and improving system resilience. This work is innovative because it combines multiple modeling approaches (physical models and data-driven learning) with specialized techniques for capturing spatial-temporal dependencies in traffic data. The Timed Failure Propagation Graph provides a novel mechanism to identify congestion sources and understand how local incidents affect larger network regions.},
results = {The LSTM-based speed predictors achieved better performance than Gaussian Process Regression with average precision of 0.8507 on the precision-recall curve. The system successfully identified cascading effects of traffic congestion with average precision of 0.9269 and recall of 0.9118 when tested on ten congestion events in Nashville. The approach demonstrated the ability to isolate root causes of network-wide congestion from complex spatial-temporal data.},
project_tags = {transit, emergency, ML for CPS}
}
Traffic networks are one of the most critical infrastructures for any community. The increasing integration of smart and connected sensors in traffic networks provides researchers with unique opportunities to study the dynamics of this critical community infrastructure. Our focus in this paper is on the failure dynamics of traffic networks. By failure, we mean in this domain the hindrance of the normal operation of a traffic network due to cyber anomalies or physical incidents that cause cascaded congestion throughout the network. We are specifically interested in analyzing the cascade effects of traffic congestion caused by physical incidents, focusing on developing mechanisms to isolate and identify the source of a congestion. To analyze failure propagation, it is crucial to develop (a) monitors that can identify an anomaly and (b) a model to capture the dynamics of anomaly propagation. In this paper, we use real traffic data from Nashville, TN to demonstrate a novel anomaly detector and a Timed Failure Propagation Graph based diagnostics mechanism. Our novelty lies in the ability to capture the spatial information and the interconnections of the traffic network as well as the use of recurrent neural network architectures to learn and predict the operation of a graph edge as a function of its immediate peers, including both incoming and outgoing branches. Our results show that our LSTM-based traffic-speed predictors attain an average mean squared error of $6.55 \times 10^{-4}$ on predicting normalized traffic speed, while Gaussian Process Regression based predictors attain a much higher average mean squared error of $1.78 \times 10^{-2}$. We are also able to detect anomalies with high precision and recall, resulting in an AUC (Area Under Curve) of 0.8507 for the precision-recall curve. To study physical traffic incidents, we augment the real data with simulated data generated using SUMO, a traffic simulator.
Finally, we analyzed the cascading effect of the congestion propagation by formulating the problem as a Timed Failure Propagation Graph, which led us in identifying the source of a failure/congestion accurately.
@inproceedings{LaszkaVerisolid2019,
author = {Laszka, Aron and Mavridou, Anastasia and Eisele, Scott and Stachtiari, Emmanouela and Dubey, Abhishek},
booktitle = {First International Summer School on Security and Privacy for Blockchains and Distributed Ledger Technologies, BDLT 2019, Vienna, Austria},
title = {VeriSolid for TRANSAX: Correct-by-Design Ethereum Smart Contracts for Energy Trading},
year = {2019},
month = sep,
category = {workshop},
contribution = {colab},
file = {:LaszkaVerisolid2019Poster.pdf:PDF},
keywords = {smart contracts, formal verification, energy trading, blockchain, correct-by-design, Ethereum},
project = {cps-blockchains,transactive-energy},
tag = {platform,decentralization,power},
what = {This paper presents VeriSolid, a framework for formal verification of Ethereum smart contracts used in energy trading platforms. The framework provides a design and verification workflow combining transition-system based models with a Solidifier code generator for correct-by-design smart contracts. The approach enables automated verification of safety properties while providing high-level feedback to developers about contract correctness.},
why = {Smart contract vulnerabilities present serious security risks, particularly in energy trading platforms where financial consequences of errors are severe. VeriSolid is innovative because it provides a correct-by-design approach that integrates formal verification at specification time rather than attempting to patch vulnerabilities post-deployment. The framework makes formal methods accessible to developers through automated code generation and natural-language templates.},
results = {VeriSolid successfully detected critical bugs in the TRANSAX energy trading smart contract, identifying violations of specified safety properties and generating meaningful developer feedback. The framework demonstrated the ability to formally verify desired contract behavior and ensure deadlock freedom. The approach proved effective at identifying vulnerabilities that could otherwise persist in deployed contracts.},
project_tags = {energy, Explainable AI}
}
The adoption of blockchain based platforms is rising rapidly. Their popularity is explained by their ability to maintain a distributed public ledger, providing reliability, integrity, and auditability without a trusted entity. Recent platforms, e.g., Ethereum, also act as distributed computing platforms and enable the creation of smart contracts, i.e., software code that runs on the platform and automatically executes and enforces the terms of a contract. Since smart contracts can perform any computation, they allow the development of decentralized applications, whose execution is safeguarded by the security properties of the underlying platform. Due to their unique advantages, blockchain based platforms are envisioned to have a wide range of applications, ranging from financial to the Internet-of-Things. However, the trustworthiness of the platform guarantees only that a smart contract is executed correctly, not that the code of the contract is correct. In fact, a large number of contracts deployed in practice suffer from software vulnerabilities, which are often introduced due to the semantic gap between the assumptions that contract writers make about the underlying execution semantics and the actual semantics of smart contracts. A recent automated analysis of 19,336 smart contracts deployed in practice found that 8,333 of them suffered from at least one security issue. Although this study was based on smart contracts deployed on the public Ethereum blockchain, the analyzed security issues were largely platform agnostic. Security vulnerabilities in smart contracts present a serious issue for two main reasons. Firstly, smart-contract bugs cannot be patched. By design, once a contract is deployed, its functionality cannot be altered even by its creator. Secondly, once a faulty or malicious transaction is recorded, it cannot be removed from the blockchain (“code is law” principle). 
The only way to roll back a transaction is by performing a hard fork of the blockchain, which requires consensus among the stakeholders and undermines the trustworthiness of the platform. In light of this, it is crucial to ensure that a smart contract is secure before deploying it and trusting it with significant amounts of cryptocurrency. To this end, we present the VeriSolid framework for the formal verification and generation of contracts that are specified using a transition-system based model with rigorous operational semantics. VeriSolid provides an end-to-end design framework, which combined with a Solidity code generator, allows the correct-by-design development of Ethereum smart contracts. To the best of our knowledge, VeriSolid is the first framework to promote a model-based, correctness-by-design approach for blockchain-based smart contracts. Properties established at any step of the VeriSolid design flow are preserved in the resulting smart contracts, guaranteeing their correctness. VeriSolid fully automates the process of verification and code generation, while enhancing usability by providing easy-to-use graphical editors for the specification of transition systems and natural-like language templates for the specification of formal properties. By performing verification early at design time, VeriSolid provides a cost-effective approach since fixing bugs later in the development process can be very expensive. Our verification approach can detect typical vulnerabilities, but it may also detect any violation of required properties. Since our tool applies verification at a high-level, it can provide meaningful feedback to the developer when a property is not satisfied, which would be much harder to do at bytecode level. We present the application of VeriSolid on smart contracts used in Smart Energy Systems such as transactive energy platforms. 
In particular, we used VeriSolid to design and generate the smart contract that serves as the core of the TRANSAX blockchain-based platform for trading energy futures. The designed smart contract allows energy producers and consumers to post offers for selling and buying energy. Since optimally matching selling offers with buying offers can be very expensive computationally, the contract relies on external solvers to compute and submit solutions to the matching problem, which are then checked by the contract. Using VeriSolid, we defined a set of safety properties and we were able to detect bugs after performing analysis with the NuSMV model checker.
@article{Dubey2019c,
author = {Dubey, Abhishek and {Emfinger}, W. and {Gokhale}, A. and {Kumar}, P. and {McDermet}, D. and {Bapty}, T. and {Karsai}, G.},
journal = {IEEE Aerospace and Electronic Systems Magazine},
title = {Enabling Strong Isolation for Distributed Real-Time Applications in Edge Computing Scenarios},
year = {2019},
issn = {1557-959X},
month = jul,
number = {7},
pages = {32--45},
volume = {34},
contribution = {lead},
doi = {10.1109/MAES.2019.2905921},
file = {:Dubey2019c-Enabling_Strong_Isolation_for_Distributed_Real-Time_Applications_in_Edge_Computing_Scenarios.pdf:PDF},
keywords = {mixed-criticality systems, temporal partitioning, distributed real-time systems, resource management, operating systems, safety-critical applications},
project = {cps-middleware,cps-reliability},
tag = {platform},
what = {This paper describes COSMOS, a distributed real-time managed systems operating system layer that enables temporal and spatial partitioning for mixed-criticality cyber-physical systems. The platform supports multiple applications with different criticality levels running on shared computing infrastructure while maintaining strong isolation and temporal predictability. COSMOS provides specialized scheduler concepts including CPU resource caps and priority-based scheduling to manage computing resources efficiently.},
why = {Mixed-criticality applications require strong isolation to prevent failures in non-critical functions from affecting critical operations, while maximizing resource utilization in embedded systems. COSMOS is innovative because it provides operating system-level support for spatial and temporal partitioning with minimal overhead, enabling multiple applications to safely share computing resources. The architecture enables higher resource efficiency without sacrificing safety guarantees.},
results = {COSMOS successfully supported execution of multiple mixed-criticality applications with CPU resource caps preventing higher priority tasks from starving lower priority ones. The system demonstrated effective implementation of temporal partitioning with minor frame scheduling enabling guaranteed execution of critical tasks. The framework proved capable of supporting various application configurations while maintaining isolation and predictability requirements.},
project_tags = {CPS, middleware, scalable AI}
}
Distributed coexisting applications found in the military and space domains, which operate over managed but shared computing resources at the edge, require strong isolation from each other. The state of the art for computation sharing at the edge is traditionally based on Docker and similar pseudovirtualization features. Our team has been working on an end-to-end architecture that provides strong spatial and temporal isolation similar to what has become standard in avionics communities. In this paper, we describe an open-source extension to Linux that we have designed and implemented for our distributed real-time embedded managed systems (DREMS) architecture. The key concepts are the partitioning scheduler, strong security design, and a health management interface.
@inproceedings{Basak2019a,
author = {Basak, Sanchita and Sengupta, Saptarshi and Dubey, Abhishek},
booktitle = {{IEEE} International Conference on Smart Computing, {SMARTCOMP} 2019, Washington, DC, USA},
title = {Mechanisms for Integrated Feature Normalization and Remaining Useful Life Estimation Using LSTMs Applied to Hard-Disks},
year = {2019},
month = jun,
acceptance = {29},
note = {Best Paper Award},
pages = {208--216},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/smartcomp/BasakSD19},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/SMARTCOMP.2019.00055},
file = {:Basak2019a-Mechanisms_for_Integrated_Feature_Normalization_and_Remaining_Useful_Life_Estimation_Using_LSTMs_Applied_to_Hard-Disks.pdf:PDF},
keywords = {remaining useful life, hard disk drives, LSTM networks, predictive maintenance, feature normalization, data-driven methods},
project = {cps-reliability},
tag = {ai4cps},
timestamp = {Wed, 16 Oct 2019 14:14:54 +0200},
url = {https://doi.org/10.1109/SMARTCOMP.2019.00055},
what = {This paper develops data-driven methods for remaining useful life estimation of hard disk drives using LSTM networks with feature normalization techniques. The approach addresses the challenge that different devices have varying failure characteristics and minimum/maximum feature values. The work implements two LSTM network layers and employs careful feature selection through correlation analysis and Fisher scores.},
why = {Hard disk failures are critical concerns in computing infrastructure, and accurate remaining useful life prediction enables proactive maintenance planning. This work is innovative because it addresses device-specific variations in failure patterns through careful feature normalization and selection, enabling models trained on one device to generalize to other devices from the same manufacturer. The approach improves prediction accuracy through integration of multiple machine learning techniques.},
results = {The LSTM-based approach achieved excellent prediction performance with an average precision of 0.8435 and F1 score of 0.72 when predicting whether a disk would fail in the next ten days. The system demonstrated the ability to predict RUL near the critical point of device approach with an average precision of 0.8435. The normalized features successfully enabled generalization across different hard disk models from the same manufacturer.},
project_tags = {CPS, ML for CPS}
}
In this paper we focus on application of data-driven methods for remaining useful life estimation in components where past failure data is not uniform across devices, i.e. there is a high variance in the minimum and maximum value of the key parameters. The system under study is the hard disks used in a computing cluster. The data used for analysis is provided by Backblaze as discussed later. In the article, we discuss the architecture of the long short term neural network used and describe the mechanisms to choose the various hyper-parameters. Further, we describe the challenges faced in extracting effective training sets from highly unorganized and class-imbalanced big data and establish methods for online predictions with extensive data pre-processing, feature extraction and validation through online simulation sets with unknown remaining useful lives of the hard disks. Our algorithm performs especially well in predicting RUL near the critical zone of a device approaching failure. With the proposed approach we are able to predict whether a disk is going to fail in the next ten days with an average precision of 0.8435. We also show that the architecture trained on a particular model is generalizable and transferable as it can be used to predict RUL for devices in other models from the same manufacturer.
@inproceedings{Samal2019,
author = {Samal, Chinmaya and Dubey, Abhishek and Ratliff, Lillian J.},
booktitle = {{IEEE} International Conference on Smart Computing, {SMARTCOMP} 2019, Washington, DC, USA},
title = {Mobilytics-Gym: {A} Simulation Framework for Analyzing Urban Mobility Decision Strategies},
year = {2019},
acceptance = {29},
month = jun,
pages = {283--291},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/smartcomp/SamalDR19},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/SMARTCOMP.2019.00064},
file = {:Samal2019-Mobilytics-Gym_A_Simulation_Framework_for_Analyzing_Urban_Mobility_Decision_Strategies.pdf:PDF},
keywords = {urban mobility, agent-based simulation, incentive policies, commuter behavior, multi-modal transportation, simulation framework},
project = {smart-transit,smart-cities},
tag = {transit},
timestamp = {Wed, 16 Oct 2019 14:14:54 +0200},
url = {https://doi.org/10.1109/SMARTCOMP.2019.00064},
what = {This paper introduces Mobilytics-Gym, a simulation framework for evaluating urban mobility decision strategies using agent-based models that integrate learning and preference dynamics. The framework combines traffic simulation with models for commuter decision-making that learn from experiences and adapt their route choices. The system enables evaluation of incentive policies and their effects on congestion, emissions, and user satisfaction.},
why = {Traditional transportation simulation tools often use fixed agent behaviors that cannot adapt to policy changes or incentives in realistic ways. Mobilytics-Gym is innovative because it integrates data-driven learning models that allow agents to adapt their behavior dynamically, enabling more realistic evaluation of policy impacts. The framework bridges simulation and real-world dynamics by incorporating observed commuter preferences and decision patterns.},
results = {When provided higher incentives for using transit, the simulation showed that 23% of agents changed their modes from cars to bus and walking when incentives were offered. The system demonstrated effective integration of agent learning models with traffic simulation, achieving a system-level cost reduction of approximately $200,000 due to incentives. The framework proved capable of evaluating sensitivity of system-level outcomes to policy parameters.},
project_tags = {transit, planning, scalable AI}
}
The rise in deep learning models in recent years has led to various innovative solutions for intelligent transportation technologies. Use of personal and on-demand mobility services puts a strain on the existing road network in a city. To mitigate this problem, city planners need a simulation framework to evaluate the effect of any incentive policy in nudging commuters towards alternate modes of travel, such as bike and car-share options. In this paper, we leverage MATSim, an agent-based simulation framework, to integrate agent preference models that capture the altruistic behavior of an agent in addition to their disutility proportional to the travel time and cost. These models are learned in a data-driven approach and can be used to evaluate the sensitivity of an agent to system-level disutility and monetary incentives given, e.g., by the transportation authority. This framework provides a standardized environment to evaluate the effectiveness of any particular incentive policy of a city, in nudging its residents towards alternate modes of transportation. We show the effectiveness of the approach and provide analysis using a case study from the Metropolitan Nashville area.
@inproceedings{Wilbur2019,
author = {Wilbur, Michael and Dubey, Abhishek and Le{\~{a}}o, Bruno and Bhattacharjee, Shameek},
booktitle = {{IEEE} International Conference on Smart Computing, {SMARTCOMP} 2019, Washington, DC, USA},
title = {A Decentralized Approach for Real Time Anomaly Detection in Transportation Networks},
year = {2019},
month = jun,
acceptance = {29},
pages = {274--282},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/smartcomp/WilburDLB19},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/SMARTCOMP.2019.00063},
file = {:Wilbur2019-A_Decentralized_Approach_for_Real_Time_Anomaly_Detection_in_Transportation_Networks.pdf:PDF},
keywords = {anomaly detection, data integrity attacks, road side units, smart transportation, decentralized systems, clustering algorithms},
project = {cps-reliability,smart-transit,smart-cities},
tag = {ai4cps,platform,decentralization,incident,transit},
timestamp = {Wed, 16 Oct 2019 14:14:54 +0200},
url = {https://doi.org/10.1109/SMARTCOMP.2019.00063},
what = {This paper presents a decentralized anomaly detection framework for smart transportation networks that distributes detection across road side units while identifying orchestrated data integrity attacks. The system employs zone-level detection at RSUs combined with sensor-level detection to handle both deductive and camouflage attacks where adversaries manipulate speed readings. The approach uses hierarchical clustering algorithms for RSU placement optimization.},
why = {Data integrity attacks in transportation systems can have serious consequences, and centralized detection approaches create single points of failure. This work is innovative because it distributes anomaly detection to network edge while maintaining ability to identify complex, multi-sensor attacks that try to evade detection. The zone-level and sensor-level detection hierarchy enables efficient resource utilization at constrained edge devices.},
results = {The RSU clustering approach successfully optimized RSU placement to concentrate resources where sensors were dense while minimizing communication overhead. The system demonstrated ability to detect deductive attacks where individual sensor readings are altered and camouflage attacks where multiple sensors are manipulated to evade detection. The decentralized approach improved computational efficiency while maintaining detection accuracy.},
project_tags = {transit, emergency, CPS, middleware}
}
@inproceedings{Hartsell2019,
author = {Hartsell, Charles and Mahadevan, Nagabhushan and Ramakrishna, Shreyas and Dubey, Abhishek and Bapty, Theodore and Johnson, Taylor T. and Koutsoukos, Xenofon D. and Sztipanovits, Janos and Karsai, Gabor},
booktitle = {Proceedings of the Workshop on Design Automation for {CPS} and IoT, DESTION@CPSIoTWeek 2019, Montreal, QC, Canada},
title = {Model-based design for {CPS} with learning-enabled components},
year = {2019},
month = apr,
pages = {1--9},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/cpsweek/HartsellMRDBJKS19},
category = {workshop},
contribution = {colab},
doi = {10.1145/3313151.3313166},
file = {:Hartsell2019-Model-based_design_for_CPS_with_learning-enabled_components.pdf:PDF},
keywords = {model-based design, learning-enabled components, safety assurance, domain-specific modeling, architectural design, CPS development},
project = {cps-autonomy},
tag = {ai4cps},
timestamp = {Wed, 20 Nov 2019 00:00:00 +0100},
url = {https://doi.org/10.1145/3313151.3313166},
what = {This paper presents a model-based design methodology for assurance-based learning-enabled cyber-physical systems that supports architectural modeling, LEC training, and safety assurance. The approach uses Domain Specific Modeling Languages to specify system architectures and integrates them with learning-enabled component development. The methodology includes support for multiple development workflows including supervised and reinforcement learning approaches.},
why = {Learning-enabled CPS face unique challenges in demonstrating safety and correctness properties during design phases before deployment. This work is innovative because it integrates model-based design with learning component development, providing systematic approaches for architectural modeling, LEC training, and safety verification. The methodology enables developers to systematically address challenges in mixing formal assurance with empirical machine learning.},
results = {The methodology successfully supported end-to-end development of learning-enabled systems from architectural specification through training and deployment. The framework demonstrated integration of multiple assurance techniques including formal verification, static analysis, and runtime monitoring. The approach proved effective at documenting design artifacts that support safety assurance arguments for complex systems.},
project_tags = {CPS, ML for CPS, Explainable AI}
}
Recent advances in machine learning led to the appearance of Learning-Enabled Components (LECs) in Cyber-Physical Systems. LECs are being evaluated and used for various, complex functions including perception and control. However, very little tool support is available for design automation in such systems. This paper introduces an integrated toolchain that supports the architectural modeling of CPS with LECs, but also has extensive support for the engineering and integration of LECs, including support for training data collection, LEC training, LEC evaluation and verification, and system software deployment. Additionally, the toolsuite supports the modeling and analysis of safety cases - a critical part of the engineering process for mission and safety critical systems.
@article{Sun2019,
author = {Sun, Fangzhou and Dubey, Abhishek and White, Jules and Gokhale, Aniruddha},
journal = {Cluster Computing},
title = {Transit-hub: a smart public transportation decision support system with multi-timescale analytical services},
year = {2019},
month = jan,
number = {Suppl 1},
pages = {2239--2254},
volume = {22},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/cluster/SunDWG19},
contribution = {lead},
doi = {10.1007/s10586-018-1708-z},
file = {:Sun2019-Transit-hub_a_smart_public_transportation_decision_support_system_with_multi-timescale_analytical_services.pdf:PDF},
keywords = {public transit, decision support, real-time prediction, schedule optimization, data integration, Kalman filtering},
project = {smart-cities,smart-transit},
tag = {transit},
timestamp = {Wed, 21 Aug 2019 01:00:00 +0200},
url = {https://doi.org/10.1007/s10586-018-1708-z},
what = {This paper presents Transit-Hub, a decision support system for public transportation that integrates multi-timescale analytical services for real-time bus arrival prediction and schedule optimization. The system combines historical and real-time transit data from multiple sources including GTFS data feeds and live vehicle location tracking. Advanced analytics including SVM-Kalman models enable both short-term and long-term delay prediction.},
why = {Public transit agencies struggle with providing accurate real-time information and optimizing schedules based on actual demand patterns, particularly with heterogeneous data quality. Transit-Hub is innovative because it integrates data cleaning and management with advanced analytical models that address data quality issues while providing decision support to transit authorities. The multi-timescale approach enables both immediate customer-facing predictions and longer-term operational optimization.},
results = {The system reduced root-mean-square deviation in travel time prediction by 30-65% when predicting 15 minutes ahead using the integrated short-term prediction model. The approach demonstrated ability to integrate and manage heterogeneous data sources from multiple transit systems while providing reliable transit information. The system achieved significant improvements over basic average models in delay prediction accuracy.},
project_tags = {transit, planning}
}
Public transit is a critical component of a smart and connected community. As such, citizens expect and require accurate information about real-time arrival/departures of transportation assets. As transit agencies enable large-scale integration of real-time sensors and support back-end data-driven decision support systems, the dynamic data-driven applications systems (DDDAS) paradigm becomes a promising approach to make the system smarter by providing online model learning and multi-time scale analytics as part of the decision support system that is used in the DDDAS feedback loop. In this paper, we describe a system in use in Nashville and illustrate the analytic methods developed by our team. These methods use both historical as well as real-time streaming data for online bus arrival prediction. The historical data is used to build classifiers that enable us to create expected performance models as well as identify anomalies. These classifiers can be used to provide schedule adjustment feedback to the metro transit authority. We also show how these analytics services can be packaged into modular, distributed and resilient micro-services that can be deployed on both cloud back ends as well as edge computing resources.
@inproceedings{Basak2019,
author = {Basak, Sanchita and Sun, Fangzhou and Sengupta, Saptarshi and Dubey, Abhishek},
booktitle = {Big Data Analytics - 7th International Conference, {BDA} 2019, Ahmedabad, India},
title = {Data-Driven Optimization of Public Transit Schedule},
year = {2019},
pages = {265--284},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/bigda/BasakSSD19},
category = {selectiveconference},
contribution = {lead},
doi = {10.1007/978-3-030-37188-3_16},
file = {:Basak2019-Data_Driven_Optimization_of_Public_Transit_Schedule.pdf:PDF},
keywords = {transit scheduling, optimization, genetic algorithms, particle swarm optimization, schedule timetabling, seasonal variation},
project = {smart-cities,smart-transit},
tag = {ai4cps,transit},
timestamp = {Fri, 13 Dec 2019 12:44:00 +0100},
url = {https://doi.org/10.1007/978-3-030-37188-3_16},
what = {This paper presents data-driven optimization methods for public transit schedule optimization using genetic algorithms and particle swarm optimization. The approach addresses the challenge of optimizing bus timetables to maximize probability of on-time arrivals at timepoints within desired delay ranges. The work includes unsupervised clustering mechanisms for grouping months with similar seasonal delay patterns.},
why = {On-time performance is critical for public transit system reliability and user satisfaction, yet most transit agencies use manual approaches rather than systematic optimization. This work is innovative because it applies evolutionary and swarm-based optimization approaches to transit scheduling while accounting for seasonal and monthly variations in delay patterns. The clustering mechanism enables efficient optimization for multiple seasonal scenarios.},
results = {The genetic algorithm and particle swarm optimization approaches outperformed greedy baseline methods in improving on-time performance across Nashville bus routes. The particle swarm approach demonstrated faster convergence with lower variance compared to genetic algorithms. The system successfully optimized bus schedules for different seasonal patterns, improving overall system on-time performance.},
project_tags = {transit, planning, scalable AI}
}
Bus transit systems are the backbone of public transportation in the United States. An important indicator of the quality of service in such infrastructures is on-time performance at stops, with published transit schedules playing an integral role governing the level of success of the service. However, there are relatively few optimization architectures leveraging stochastic search that focus on optimizing bus timetables with the objective of maximizing probability of bus arrivals at timepoints with delays within desired on-time ranges. In addition to this, there is a lack of substantial research considering monthly and seasonal variations of delay patterns integrated with such optimization strategies. To address these, this paper makes the following contributions to the corpus of studies on transit on-time performance optimization: (a) an unsupervised clustering mechanism is presented which groups months with similar seasonal delay patterns, (b) the problem is formulated as a single-objective optimization task and a greedy algorithm, a genetic algorithm (GA) as well as a particle swarm optimization (PSO) algorithm are employed to solve it, (c) a detailed discussion on empirical results comparing the algorithms is provided and sensitivity analysis on hyper-parameters of the heuristics are presented along with execution times, which will help practitioners looking at similar problems. The analyses conducted are insightful in the local context of improving public transit scheduling in the Nashville metro region as well as informative from a global perspective as an elaborate case study which builds upon the growing corpus of empirical studies using nature-inspired approaches to transit schedule optimization.
@inproceedings{basak2019bigdata,
author = {Basak, Sanchita and Dubey, Abhishek and Leao, Bruno P.},
booktitle = {IEEE Big Data},
title = {Analyzing the Cascading Effect of Traffic Congestion Using LSTM Networks},
year = {2019},
address = {Los Angeles, CA},
category = {selectiveconference},
contribution = {lead},
keywords = {traffic congestion, cascading failures, LSTM networks, spatial-temporal modeling, congestion prediction, network analysis},
tag = {ai4cps,incident,transit},
what = {This paper analyzes cascading effects of traffic congestion using LSTM networks to predict traffic propagation patterns in city-wide networks. The approach models the transportation network as a directed graph and develops connected LSTM fabric architectures that capture spatial-temporal dependencies in traffic flow. The framework handles congestion forecasting at multiple timescales.},
why = {Understanding how congestion cascades through networks is essential for effective traffic management and congestion mitigation strategies. This work is innovative because it uses specialized neural network architectures that explicitly capture neighborhood information and spatial dependencies in traffic data. The connected LSTM fabric enables multi-timestep ahead predictions that account for how congestion propagates through interconnected road segments.},
results = {The approach achieved average precision of 0.9269 and average recall of 0.9118 in identifying congestion events when tested over ten congestion scenarios in Nashville. The system successfully predicted traffic speed with high accuracy by leveraging neighborhood information from connected LSTM architectures. The framework demonstrated ability to predict multiple timesteps ahead with degrading accuracy proportional to prediction horizon.},
project_tags = {transit, emergency, ML for CPS}
}
This paper presents a data-driven approach for predicting the propagation of traffic congestion at road segments as a function of the congestion in their neighboring segments. In the past, this problem has mostly been addressed by modelling the traffic congestion over some standard physical phenomenon through which it is difficult to capture all the modalities of such a dynamic and complex system. While other recent works have focused on applying a generalized data-driven technique on the whole network at once, they often ignore intersection characteristics. On the contrary, we propose a city-wide ensemble of intersection level connected LSTM models and propose mechanisms for identifying congestion events using the predictions from the networks. To reduce the search space of likely congestion sinks we use the likelihood of congestion propagation in neighboring road segments of a congestion source that we learn from the past historical data. We validated our congestion forecasting framework on the real world traffic data of Nashville, USA and identified the onset of congestion in each of the neighboring segments of any congestion source with an average precision of 0.9269 and an average recall of 0.9118 tested over ten congestion events.
@inproceedings{Burruss2019,
author = {Burruss, Matthew P. and Ramakrishna, Shreyas and Karsai, Gabor and Dubey, Abhishek},
booktitle = {{IEEE} 22nd International Symposium on Real-Time Distributed Computing, {ISORC} 2019, Valencia, Spain, May 7-9, 2019},
title = {DeepNNCar: {A} Testbed for Deploying and Testing Middleware Frameworks for Autonomous Robots},
year = {2019},
pages = {87--88},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/isorc/BurrussRKD19},
category = {poster},
contribution = {lead},
doi = {10.1109/ISORC.2019.00025},
file = {:Burruss2019-DeepNNCar_Testbed_for_Deploying_and_Testing_Middleware_Frameworks_for_Autonomous_Robots.pdf:PDF},
keywords = {autonomous vehicles, middleware frameworks, learning-enabled components, safety assurance, simplex architecture, testbed platform},
project = {cps-autonomy},
tag = {ai4cps},
timestamp = {Wed, 16 Oct 2019 14:14:53 +0200},
url = {https://doi.org/10.1109/ISORC.2019.00025},
what = {This paper demonstrates DeepNNCar, a testbed platform for deploying and testing middleware frameworks for autonomous robots using learning-enabled components. The platform integrates CNN-based steering angle prediction with safety supervisors using Simplex Architecture and weighted simplex strategies for adaptive controller switching. The system demonstrates real-time performance monitoring and resource management capabilities.},
why = {Autonomous robot systems using learning-enabled components require testing frameworks that can evaluate safety guarantees and performance tradeoffs in realistic scenarios. DeepNNCar is innovative because it provides a physical testbed platform with integrated middleware framework support, enabling systematic evaluation of different control strategies and safety architectures. The platform demonstrates practical implementation of formal safety assurance with learning-enabled systems.},
results = {The platform successfully demonstrated weighted simplex strategy capabilities by adaptively switching between controllers based on current speed and safety performance. The system showed ability to maintain safe operation while optimizing for performance through dynamic strategy selection. The testbed demonstrated effectiveness of middleware frameworks in managing multiple controllers and ensuring safe system operation.},
project_tags = {CPS, ML for CPS, middleware}
}
This demo showcases the features of an adaptive middleware framework for resource constrained autonomous robots like DeepNNCar (Figure 1). These robots use Learning Enabled Components (LECs), trained with deep learning models to perform control actions. However, these LECs do not provide any safety guarantees and testing them is challenging. To overcome these challenges, we have developed an adaptive middleware framework that (1) augments the LEC with safety controllers that can use different weighted simplex strategies to improve the system's safety guarantees, and (2) includes a resource manager to monitor the resource parameters (temperature, CPU Utilization), and offload tasks at runtime. Using DeepNNCar we will demonstrate the framework and its capability to adaptively switch between the controllers and strategies based on its safety and speed performance.
@article{Dubey2019,
author = {Dubey, Abhishek and Karsai, Gabor and V{\"{o}}lgyesi, P{\'{e}}ter and Metelko, Mary and Madari, Istv{\'{a}}n and Tu, Hao and Du, Yuhua and Lukic, Srdjan},
journal = {Embedded Systems Letters},
title = {Device Access Abstractions for Resilient Information Architecture Platform for Smart Grid},
year = {2019},
number = {2},
pages = {34--37},
volume = {11},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/esl/DubeyKVMMTDL19},
contribution = {lead},
doi = {10.1109/LES.2018.2845854},
file = {:Dubey2019-Device_Access_Abstractions_for_Resilient_Information_Architecture_Platform_for_Smart_Grid.pdf:PDF},
keywords = {device abstraction, smart grid, middleware, component-based systems, hardware integration, communication protocols},
project = {cps-middleware},
tag = {platform,power},
timestamp = {Fri, 05 Jul 2019 01:00:00 +0200},
url = {https://doi.org/10.1109/LES.2018.2845854},
what = {This paper introduces Device Access Abstractions for the Resilient Information Architecture Platform for Smart Grid (RIAPS), enabling component-based systems to interact with hardware devices through unified interfaces. The framework provides middleware mechanisms for abstracting device communication protocols including serial, I2C, Modbus, and C37.118 synchronous sampling. The design enables clean separation between application logic and hardware-specific implementation details.},
why = {Smart grid systems require interaction with diverse hardware devices using different communication protocols, making application development complex and hardware-dependent. This work is innovative because it provides abstraction mechanisms that allow application components to operate independently of underlying device protocols. The framework enables rapid prototyping and deployment of smart grid applications on heterogeneous hardware platforms.},
results = {The framework successfully demonstrated support for multiple device types including PMUs, relays, and inverters through unified RIAPS interfaces. The system achieved efficient communication with devices while maintaining application-level abstraction from protocol details. The approach proved effective at enabling clean component-based design for complex smart grid systems.},
project_tags = {energy, CPS, middleware, Explainable AI}
}
This letter presents an overview of design mechanisms to abstract device access protocols in the resilient information architecture platform for smart grid, a middleware for developing distributed smart grid applications. These mechanisms are required to decouple the application functionality from the specifics of the device mechanisms built by the device vendors.
@article{Dubey2019a,
  author       = {Dubey, Abhishek and Garc{\'{\i}}a{-}Valls, Marisol},
  journal      = {Journal of Systems Architecture - Embedded Systems Design},
  title        = {Introduction to the special issue of the 16th {ACM} workshop on Adaptive and Reflective Middleware {(ARM)}},
  year         = {2019},
  pages        = {8},
  volume       = {97},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/journals/jsa/DubeyG19},
  contribution = {lead},
  doi          = {10.1016/j.sysarc.2019.03.006},
  file         = {:Dubey2019a-Introduction_to_the_special_issue_of_the_16th_ACM_workshop_on_Adaptive_and_Reflective_Middleware_ARM.pdf:PDF},
  keywords     = {adaptive middleware, reflective systems, cyber-physical systems, Internet of Things, distributed computing},
  project      = {cps-middleware},
  timestamp    = {Tue, 25 Jun 2019 01:00:00 +0200},
  url          = {https://doi.org/10.1016/j.sysarc.2019.03.006},
  what         = {This paper provides an overview of the 16th ACM workshop on Adaptive and Reflective Middleware, highlighting research on middleware technologies for cyber-physical systems and Internet of Things integration. The introduction discusses challenges in developing adaptive middleware that can respond to changing system conditions and environmental factors.},
  why          = {Middleware technologies are critical for managing complexity in distributed CPS and IoT systems, yet developing adaptive middleware that responds to changing conditions remains an open challenge. This workshop overview highlights emerging work in reflective middleware approaches that enable systems to dynamically reconfigure based on runtime conditions.},
  results      = {The workshop brought together researchers working on adaptive middleware technologies for diverse application domains. Discussions highlighted emerging approaches for addressing middleware challenges in distributed systems with dynamic environmental conditions.},
  project_tags = {CPS, middleware, scalable AI},
}
@inproceedings{Eisele2019,
  author       = {Eisele, Scott and Ghosh, Purboday and Campanelli, Keegan and Dubey, Abhishek and Karsai, Gabor},
  booktitle    = {{IEEE} 22nd International Symposium on Real-Time Distributed Computing, {ISORC} 2019, Valencia, Spain, May 7-9, 2019},
  title        = {Demo: Transactive Energy Application with {RIAPS}},
  year         = {2019},
  pages        = {85--86},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/isorc/EiseleGCDK19},
  category     = {poster},
  contribution = {lead},
  doi          = {10.1109/ISORC.2019.00024},
  file         = {:Eisele2019-Demo_Transactive_Energy_Application_with_RIAPS.pdf:PDF},
  keywords     = {transactive energy, microgrids, smart contracts, distributed resources, energy trading, RIAPS platform},
  project      = {transactive-energy},
  tag          = {decentralization,power},
  timestamp    = {Wed, 16 Oct 2019 14:14:53 +0200},
  url          = {https://doi.org/10.1109/ISORC.2019.00024},
  what         = {This paper demonstrates a transactive energy application using the RIAPS platform for coordinating distributed energy resources in a microgrid. The system implements energy trading functionality through smart contracts while managing distributed battery and solar resources. The demonstration shows RIAPS capabilities for handling complex multi-agent coordination in real-time energy systems.},
  why          = {Microgrids with distributed renewable resources require sophisticated coordination mechanisms to balance supply and demand while respecting grid constraints. This work is innovative because it demonstrates practical implementation of transactive energy concepts using formal distributed computing platforms. The integration of smart contracts with RIAPS middleware enables decentralized energy trading with safety guarantees.},
  results      = {The system successfully coordinated energy trading between multiple prosumers and consumers in simulated microgrid scenarios. The demonstration showed effective use of RIAPS messaging and device components for managing complex energy transactions. The approach proved effective at implementing safe decentralized energy coordination.},
  project_tags = {energy, CPS, middleware},
}
The modern electric grid is a complex, decentralized cyber-physical system requiring higher-level control techniques to balance the demand and supply of energy to optimize the overall energy usage. The concept of Transactive Energy utilizes distributed system principles to address this challenge. In this demonstration we show the usage of the distributed application management platform RIAPS in the implementation of one such Transactive Energy approach to control elements of a power system, which runs as a simulation using the Gridlab-d simulation solver.
@article{FTWShonan2019,
  author       = {Borromeo, Ria Mae and Chen, Lei and Dubey, Abhishek and Roy, Sudeepa and Thirumuruganathan, Saravanan},
  journal      = {{IEEE} Data Eng. Bull.},
  title        = {On Benchmarking for Crowdsourcing and Future of Work Platforms},
  year         = {2019},
  number       = {4},
  pages        = {46--54},
  volume       = {42},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/journals/debu/Borromeo0DRT19.bib},
  contribution = {minor},
  timestamp    = {Tue, 21 Jul 2020 00:40:32 +0200},
  url          = {http://sites.computer.org/debull/A19dec/p46.pdf},
}
Online crowdsourcing platforms have proliferated over the last few years and cover a number of important domains; these platforms range from worker-task platforms such as Amazon Mechanical Turk and worker-for-hire platforms such as TaskRabbit to specialized platforms with specific tasks such as ridesharing like Uber, Lyft, Ola, etc. An increasing proportion of the human workforce will be employed by these platforms in the near future. The crowdsourcing community has done yeoman’s work in designing effective algorithms for various key components, such as incentive design, task assignment, and quality control. Given the increasing importance of these crowdsourcing platforms, it is now time to design mechanisms so that it is easier to evaluate the effectiveness of these platforms. Specifically, we advocate developing benchmarks for crowdsourcing research. Benchmarks often identify important issues for the community to focus on and improve upon. This has played a key role in the development of research domains as diverse as databases and deep learning. We believe that developing appropriate benchmarks for crowdsourcing will ignite further innovations. However, crowdsourcing – and future of work, in general – is a very diverse field that makes developing benchmarks much more challenging. Substantial effort is needed that spans across developing benchmarks for datasets, metrics, algorithms, platforms, and so on. In this article, we initiate some discussion into this important problem and issue a call-to-arms for the community to work on this important initiative.
@inproceedings{Ghosh2019,
  author       = {Ghosh, Purboday and Eisele, Scott and Dubey, Abhishek and Metelko, Mary and Madari, Istv{\'{a}}n and V{\"{o}}lgyesi, P{\'{e}}ter and Karsai, Gabor},
  booktitle    = {{IEEE} 22nd International Symposium on Real-Time Distributed Computing, {ISORC} 2019, Valencia, Spain},
  title        = {On the Design of Fault-Tolerance in a Decentralized Software Platform for Power Systems},
  year         = {2019},
  pages        = {52--60},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/isorc/GhoshEDMMVK19},
  category     = {selectiveconference},
  contribution = {minor},
  doi          = {10.1109/ISORC.2019.00018},
  file         = {:Ghosh2019-On_the_Design_of_Fault-Tolerance_in_a_Decentralized_Software_Platform_for_Power_Systems.pdf:PDF},
  keywords     = {RIAPS, fault tolerance, distributed systems, smart grid, middleware, resilience, component-based architecture, real-time systems},
  project      = {cps-middleware,cps-reliability},
  tag          = {platform,decentralization,power},
  timestamp    = {Wed, 16 Oct 2019 14:14:53 +0200},
  url          = {https://doi.org/10.1109/ISORC.2019.00018},
  what         = {This paper presents RIAPS (Resilient Information Architecture Platform for Smart Grid), an open-source distributed platform for implementing fault-tolerant software in power systems. The work describes the architectural design and implementation of RIAPS, which distributes monitoring and control functions across networked computing nodes. The platform supports various communication patterns including pub-sub and request-reply messaging, enabling resilient and reliable operation across multiple physical layers.},
  why          = {Smart Grid systems require distributed, real-time embedded computing to handle complex and dynamic environments. Traditional centralized approaches lack the scalability and resilience needed for modern power grid operations. RIAPS addresses this gap by providing a layered fault management architecture that separates application logic from platform services, enabling developers to build highly resilient systems without managing low-level failure detection and recovery mechanisms.},
  results      = {The paper demonstrates RIAPS capabilities through detailed architectural documentation and a complex energy application case study. The platform successfully implements fault detection and mitigation at multiple system layers (physical device, platform services, and application levels), with resource monitoring capabilities for CPU, memory, network, and disk usage. The evaluation shows effective fault tolerance implementation across various failure scenarios.},
  project_tags = {energy, middleware, CPS, scalable AI},
}
The vision of the ‘Smart Grid’ assumes a distributed real-time embedded system that implements various monitoring and control functions. As the reliability of the power grid is critical to modern society, the software supporting the grid must support fault tolerance and resilience in the resulting cyber-physical system. This paper describes the fault-tolerance features of a software framework called Resilient Information Architecture Platform for Smart Grid (RIAPS). The framework supports various mechanisms for fault detection and mitigation and works in concert with the applications that implement the grid-specific functions. The paper discusses the design philosophy for and the implementation of the fault tolerance features and presents an application example to show how it can be used to build highly resilient systems.
@inproceedings{Hartsell2019a,
  author       = {Hartsell, Charles and Mahadevan, Nagabhushan and Ramakrishna, Shreyas and Dubey, Abhishek and Bapty, Theodore and Karsai, Gabor},
  booktitle    = {Proceedings of the 10th {ACM/IEEE} International Conference on Cyber-Physical Systems, {ICCPS} 2019, Montreal, QC, Canada},
  title        = {A {CPS} toolchain for learning-based systems: demo abstract},
  year         = {2019},
  pages        = {342--343},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/iccps/HartsellMRDBK19},
  category     = {poster},
  contribution = {colab},
  doi          = {10.1145/3302509.3313332},
  file         = {:Hartsell2019a-A_CPS_Toolchain_for_Learning_Based_Systems_Demo_Abstract.pdf:PDF},
  keywords     = {cyber-physical systems, machine learning, model-based design, toolchain, learning-enabled components},
  project      = {cps-autonomy},
  tag          = {ai4cps},
  timestamp    = {Sun, 07 Apr 2019 16:25:36 +0200},
  url          = {https://doi.org/10.1145/3302509.3313332},
  what         = {This demonstration paper presents an integrated toolchain for developing Cyber-Physical Systems with Learning-Enabled Components (LECs). The work showcases a comprehensive workflow built on the WebGME platform that supports architectural modeling, component library management, automated data collection, and performance evaluation for machine learning-based CPS components.},
  why          = {Learning-enabled components are increasingly used in CPS applications but present unique challenges for development, testing, and verification. Existing tools do not provide integrated support for the entire development lifecycle of CPS with LECs, from initial design through safety assurance. This toolchain addresses that gap by enabling structured, reproducible development processes and comprehensive documentation of ML component provenance.},
  results      = {The paper demonstrates the ALC (Assurance-based Learning-enabled CPS) toolchain through an autonomous underwater vehicle control example. The system supports design of multiple implementation alternatives, comparison of CNN architectures for control tasks, and integration of both supervised and reinforcement learning approaches within a unified model-based development environment.},
  project_tags = {ML for CPS, CPS, Explainable AI},
}
Cyber-Physical Systems (CPS) are expected to perform tasks with ever-increasing levels of autonomy, often in highly uncertain environments. Traditional design techniques based on domain knowledge and analytical models are often unable to cope with epistemic uncertainties present in these systems. This challenge, combined with recent advances in machine learning, has led to the emergence of Learning-Enabled Components (LECs) in CPS. However, very little tool support is available for design automation of these systems. In this demonstration, we introduce an integrated toolchain for the development of CPS with LECs with support for architectural modeling, data collection, system software deployment, and LEC training, evaluation, and verification. Additionally, the toolchain supports the modeling and analysis of safety cases - a critical part of the engineering process for mission and safety critical systems.
@inproceedings{Hartsell2019b,
  author       = {Hartsell, Charles and Mahadevan, Nagabhushan and Ramakrishna, Shreyas and Dubey, Abhishek and Bapty, Theodore and Johnson, Taylor T. and Koutsoukos, Xenofon D. and Sztipanovits, Janos and Karsai, Gabor},
  booktitle    = {Proceedings of the 30th International Workshop on Rapid System Prototyping, {RSP} 2019, New York, NY, USA, October 17-18, 2019},
  title        = {{CPS} Design with Learning-Enabled Components: {A} Case Study},
  year         = {2019},
  pages        = {57--63},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/rsp/HartsellMRDBJKS19},
  category     = {selectiveconference},
  contribution = {colab},
  doi          = {10.1145/3339985.3358491},
  file         = {:Hartsell2019b-CPS_Design_with_Learning-Enabled_Components_A_Case_Study.pdf:PDF},
  keywords     = {autonomous underwater vehicles, learning-enabled components, reinforcement learning, model-based design, verification},
  project      = {cps-autonomy},
  tag          = {ai4cps},
  timestamp    = {Thu, 28 Nov 2019 12:43:50 +0100},
  url          = {https://doi.org/10.1145/3339985.3358491},
  what         = {This case study paper presents the development of an autonomous Unmanned Underwater Vehicle (UUV) using Learning-Enabled Components (LECs) within the ALC (Assurance-based Learning-enabled CPS) toolchain. The work integrates system modeling, machine learning training, and verification to create a complete CPS with LEC for autonomous navigation and obstacle avoidance.},
  why          = {CPS development with learning components faces significant challenges in integrating machine learning with formal safety assurance. This case study demonstrates how structured model-based development processes and the ALC toolchain enable developers to maintain safety guarantees while leveraging the benefits of machine learning for adaptive control. The work bridges the gap between traditional CPS design and modern learning-based approaches.},
  results      = {The paper demonstrates successful development of an AUV controller combining reinforcement learning and conventional control algorithms. The system achieves effective navigation through simulated underwater environments with path planning based on CNN-processed camera images. Multiple implementation alternatives are compared and evaluated using the toolchain's training and verification capabilities.},
  project_tags = {ML for CPS, CPS},
}
Cyber-Physical Systems (CPS) are used in many applications where they must perform complex tasks with a high degree of autonomy in uncertain environments. Traditional design flows based on domain knowledge and analytical models are often impractical for tasks such as perception, planning in uncertain environments, control with ill-defined objectives, etc. Machine learning based techniques have demonstrated good performance for such difficult tasks, leading to the introduction of Learning-Enabled Components (LEC) in CPS. Model based design techniques have been successful in the development of traditional CPS, and toolchains which apply these techniques to CPS with LECs are being actively developed. As LECs are critically dependent on training and data, one of the key challenges is to build design automation for them. In this paper, we examine the development of an autonomous Unmanned Underwater Vehicle (UUV) using the Assurance-based Learning-enabled Cyber-physical systems (ALC) Toolchain. Each stage of the development cycle is described including architectural modeling, data collection, LEC training, LEC evaluation and verification, and system-level assurance.
@inproceedings{Krentz2019,
  author       = {Krentz, Timothy and Dubey, Abhishek and Karsai, Gabor},
  booktitle    = {{IEEE} 22nd International Symposium on Real-Time Distributed Computing, {ISORC} 2019, Valencia, Spain, May 7-9, 2019},
  title        = {Short Paper: Towards An Edge-Located Time-Series Database},
  year         = {2019},
  pages        = {151--154},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/isorc/KrentzDK19},
  category     = {selectiveconference},
  contribution = {minor},
  doi          = {10.1109/ISORC.2019.00037},
  file         = {:Krentz2019-Towards_An_Edge-Located_Time-Series_Database.pdf:PDF},
  keywords     = {time-series database, distributed hash table, edge computing, smart grid, key-value storage},
  project      = {cps-middleware},
  tag          = {platform},
  timestamp    = {Wed, 16 Oct 2019 14:14:53 +0200},
  url          = {https://doi.org/10.1109/ISORC.2019.00037},
  what         = {This paper proposes a distributed hash table (DHT) based approach for storing and accessing time-series data at the edge of computing infrastructure. The work introduces time-factored DHT keys that enable efficient time-indexed reads and writes while maintaining distributed storage properties suitable for edge computing applications.},
  why          = {Time-series databases for edge computing face unique challenges: cloud-based solutions are prohibitively expensive and latency-sensitive, while traditional distributed approaches don't support the temporal queries required by real-time applications like power grid monitoring. This work provides a novel key structure that enables efficient time-indexed access patterns in distributed edge storage systems.},
  results      = {The paper presents two DHT key formats (Quanta-First and Key-First IDs) and evaluates their performance for time-indexed access. Results show that time-factored keys enable efficient localized data retrieval with low latency, supporting practical edge computing applications. The approach is demonstrated using a DHT implementation in Go deployed on Raspberry Pi clusters.},
  project_tags = {energy, scalable AI, middleware},
}
Smart infrastructure demands resilient data storage, and emerging applications execute queries on this data over time. Typically, time-series databases serve these queries; however, cloud-based time-series storage can be prohibitively expensive. As smart devices proliferate, the amount of computing power and memory available in our connected infrastructure provides the opportunity to move resilient time-series data storage and analytics to the edge. This paper proposes time-series storage in a Distributed Hash Table (DHT), and a novel key-generation technique that provides time-indexed reads and writes for key-value pairs. Experimental results show this technique meets demands for smart infrastructure situations.
@inproceedings{Mavridou2019,
  author       = {Mavridou, Anastasia and Laszka, Aron and Stachtiari, Emmanouela and Dubey, Abhishek},
  booktitle    = {Financial Cryptography and Data Security - 23rd International Conference, {FC} 2019, Frigate Bay, St. Kitts and Nevis, Revised Selected Papers},
  title        = {VeriSolid: Correct-by-Design Smart Contracts for Ethereum},
  year         = {2019},
  pages        = {446--465},
  acceptance   = {21.9},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/fc/MavridouLSD19},
  category     = {selectiveconference},
  contribution = {colab},
  doi          = {10.1007/978-3-030-32101-7_27},
  file         = {:Mavridou2019-VeriSolid_Correct_by_Design_Smart_Contracts_for_Ethereum.pdf:PDF},
  keywords     = {smart contracts, formal verification, Ethereum, blockchain, correct-by-design, model-driven development},
  project      = {cps-blockchains},
  tag          = {platform,decentralization},
  timestamp    = {Mon, 14 Oct 2019 14:51:20 +0200},
  url          = {https://doi.org/10.1007/978-3-030-32101-7_27},
  what         = {This paper introduces VeriSolid, a framework for correct-by-design development of Ethereum smart contracts. The work combines model-driven development with formal verification to enable developers to specify smart contract behavior as transition systems and automatically generate verified Solidity code with formal guarantees.},
  why          = {Smart contract vulnerabilities represent a critical security challenge in blockchain systems, with numerous high-profile exploits resulting in significant financial losses. Traditional approaches like code reviews and automated vulnerability detection have limitations. VeriSolid addresses this by enabling developers to specify contract behavior at a high level of abstraction and automatically generate correct implementations verified against formal properties.},
  results      = {The paper presents VeriSolid's design and verification workflow, supporting specification of safety, liveness, and deadlock-freedom properties. The framework is evaluated on multiple smart contract models including blind auction and resource allocation contracts, demonstrating automatic generation of correct Solidity code with formal guarantees against specified vulnerabilities.},
  project_tags = {Explainable AI},
}
The adoption of blockchain based distributed ledgers is growing fast due to their ability to provide reliability, integrity, and auditability without trusted entities. One of the key capabilities of these emerging platforms is the ability to create self-enforcing smart contracts. However, the development of smart contracts has proven to be error-prone in practice, and as a result, contracts deployed on public platforms are often riddled with security vulnerabilities. This issue is exacerbated by the design of these platforms, which forbids updating contract code and rolling back malicious transactions. In light of this, it is crucial to ensure that a smart contract is secure before deploying it and trusting it with significant amounts of cryptocurrency. To this end, we introduce the VeriSolid framework for the formal verification of contracts that are specified using a transition-system based model with rigorous operational semantics. Our model-based approach allows developers to reason about and verify contract behavior at a high level of abstraction. VeriSolid allows the generation of Solidity code from the verified models, which enables the correct-by-design development of smart contracts.
@inproceedings{Mukhopadhyay2019,
  author       = {Mukhopadhyay, Ayan and Pettet, Geoffrey and Samal, Chinmaya and Dubey, Abhishek and Vorobeychik, Yevgeniy},
  booktitle    = {Proceedings of the 10th {ACM/IEEE} International Conference on Cyber-Physical Systems, {ICCPS} 2019, Montreal, QC, Canada},
  title        = {An online decision-theoretic pipeline for responder dispatch},
  year         = {2019},
  pages        = {185--196},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/iccps/MukhopadhyayPSD19},
  category     = {selectiveconference},
  contribution = {lead},
  acceptance   = {27},
  doi          = {10.1145/3302509.3311055},
  file         = {:Mukhopadhyay2019-An_Online_Decision_Theoretic_Pipeline_for_Responder_Dispatch.pdf:PDF},
  keywords     = {emergency response, responder dispatch, decision-theoretic planning, SMDP, incident prediction, survival analysis},
  project      = {smart-cities,smart-emergency-response},
  tag          = {ai4cps,incident},
  timestamp    = {Sun, 07 Apr 2019 16:25:36 +0200},
  url          = {https://doi.org/10.1145/3302509.3311055},
  what         = {This paper presents an online decision-theoretic pipeline for responder dispatch in emergency management systems. The work formulates the responder dispatch problem as a Semi-Markov Decision Process and develops an online incident prediction model based on survival analysis to enable real-time, data-driven dispatch decisions.},
  why          = {Emergency response systems face complex challenges in routing limited resources to incidents in dynamic urban environments. Traditional systems dispatch the nearest responder, which ignores future incident probabilities and environmental factors. This paper addresses these limitations through a principled decision-theoretic approach that integrates incident prediction with dynamic dispatch optimization.},
  results      = {The paper demonstrates the effectiveness of the approach through evaluation on real emergency services data from Nashville, Tennessee. The online prediction and dispatch pipeline reduces response times compared to baseline approaches while accounting for incident cascading effects and changing environmental dynamics. The work successfully bridges incident prediction and optimal dispatch decisions.},
  project_tags = {emergency, POMDP, planning, scalable AI},
}
The problem of dispatching emergency responders to service traffic accidents, fire, distress calls and crimes plagues urban areas across the globe. While such problems have been extensively looked at, most approaches are offline. Such methodologies fail to capture the dynamically changing environments under which critical emergency response occurs, and therefore, fail to be implemented in practice. Any holistic approach towards creating a pipeline for effective emergency response must also look at other challenges that it subsumes - predicting when and where incidents happen and understanding the changing environmental dynamics. We describe a system that collectively deals with all these problems in an online manner, meaning that the models get updated with streaming data sources. We highlight why such an approach is crucial to the effectiveness of emergency response, and present an algorithmic framework that can compute promising actions for a given decision-theoretic model for responder dispatch. We argue that carefully crafted heuristic measures can balance the trade-off between computational time and the quality of solutions achieved and highlight why such an approach is more scalable and tractable than traditional approaches. We also present an online mechanism for incident prediction, as well as an approach based on recurrent neural networks for learning and predicting environmental features that affect responder dispatch. We compare our methodology with prior state-of-the-art and existing dispatch strategies in the field, which show that our approach results in a reduction in response time with a drastic reduction in computational time.
@inproceedings{Nannapaneni2019,
  author       = {Nannapaneni, Saideep and Dubey, Abhishek},
  booktitle    = {Proceedings of the Fourth Workshop on International Science of Smart City Operations and Platforms Engineering, SCOPE@CPSIoTWeek 2019, Montreal, QC, Canada},
  title        = {Towards demand-oriented flexible rerouting of public transit under uncertainty},
  year         = {2019},
  pages        = {35--40},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/cpsweek/NannapaneniD19},
  category     = {workshop},
  contribution = {lead},
  doi          = {10.1145/3313237.3313302},
  file         = {:Nannapaneni2019-Towards_demand-oriented_flexible_rerouting_of_public_transit_under_uncertainty.pdf:PDF},
  keywords     = {public transit, demand-responsive transportation, flexible routing, clustering, optimization, travel demand},
  project      = {smart-transit,smart-cities},
  tag          = {transit},
  timestamp    = {Tue, 10 Sep 2019 13:47:28 +0200},
  url          = {https://doi.org/10.1145/3313237.3313302},
  what         = {This paper proposes a flexible rerouting strategy for public transit systems to accommodate spatio-temporal variations in travel demand. The work employs clustering algorithms to identify flexible stops based on travel demand patterns and develops optimization methods to determine cost-effective rerouting while maintaining service quality.},
  why          = {Traditional fixed-route transit systems are inefficient for serving spatially and temporally variable demand, particularly in low-density areas. While demand-responsive transit offers better coverage, it lacks systematic approaches for identifying optimal flexible stops and routes. This work addresses the gap by combining data-driven clustering with discrete optimization to enable practical flexible transit services.},
  results      = {The paper demonstrates the rerouting methodology on data from Nashville's transit authority. Results show that flexible routes can significantly improve service coverage while reducing operational costs. The approach identifies critical bus stops for flexible service and generates optimized rerouting strategies that balance people served and travel delay.},
  project_tags = {transit, planning, scalable AI},
}
This paper proposes a flexible rerouting strategy for the public transit to accommodate the spatio-temporal variation in the travel demand. Transit routes are typically static in nature, i.e., the buses serve well-defined routes; this results in people living away from the bus routes choosing alternate transit modes such as private automotive vehicles resulting in ever-increasing traffic congestion. In the flex-transit mode, we reroute the buses to accommodate high travel demand areas away from the static routes considering its spatio-temporal variation. We perform clustering to identify several flex stops; these are stops not on the static routes, but with high travel demand around them. We divide the bus stops on the static routes into critical and non-critical bus stops; critical bus stops refer to transfer points, where people change bus routes to reach their destinations. In the existing static scheduling process, some slack time is provided at the end of each trip to account for any travel delays. Thus, the additional travel time incurred due to taking flexible routes is constrained to be less than the available slack time. We use the percent increase in travel demand to analyze the effectiveness of the rerouting process. The proposed methodology is demonstrated using real-world travel data for Route 7 operated by the Nashville Metropolitan Transit Authority (MTA).
@inproceedings{Oruganti2019,
  author       = {Oruganti, Aparna and Basak, Sanchita and Sun, Fangzhou and Baroud, Hiba and Dubey, Abhishek},
  booktitle    = {Transportation Research Board Annual Meeting},
  title        = {Modeling and Predicting the Cascading Effects of Delay in Transit Systems},
  year         = {2019},
  attachments  = {https://www.isis.vanderbilt.edu/sites/default/files/final%20poster.pdf},
  category     = {selectiveconference},
  contribution = {minor},
  file         = {:Oruganti2019-Modeling_and_Predicting_the_Cascading_Effects_of_Delay_in_Transit_Systems.pdf:PDF},
  keywords     = {transit},
  project      = {smart-transit,smart-cities},
  tag          = {transit},
}
An effective real-time estimation of the travel time for vehicles, using AVL (Automatic Vehicle Locators), has added a new dimension to the smart city planning. In this paper, we used data collected over several months from a transit agency and show how this data can be potentially used to learn patterns of travel time during specially planned events like NFL (National Football League) games and music award ceremonies. The impact of NFL games along with consideration of other factors like weather, traffic condition, distance is discussed with their relative importance to the prediction of travel time. Statistical learning models are used to predict travel time and subsequently assess the cascading effects of delay. The model performance is determined based on its predictive accuracy according to the out-of-sample error. In addition, the models help identify the most significant variables that influence the delay in the transit system. In order to compare the actual and predicted travel time for days having special events, heat maps are generated showing the delay impacts in different time windows between two timepoint-segments in comparison to a non-game day. This work focuses on the prediction and visualization of the delay in the public transit system and the analysis of its cascading effects on the entire transportation network. According to the study results, we are able to explain more than 80% of the variance in the bus travel time at each segment and can make future travel predictions during planned events with an out-of-sample error of 2.0 minutes using information on the bus schedule, traffic, weather, and scheduled events. According to the variable importance analysis, traffic information is most significant in predicting the delay in the transit system.
@inproceedings{Pettet2019,
  author       = {Pettet, Geoffrey and Mukhopadhyay, Ayan and Samal, Chinmaya and Dubey, Abhishek and Vorobeychik, Yevgeniy},
  title        = {Incident management and analysis dashboard for fire departments: {ICCPS} demo},
  booktitle    = {Proceedings of the 10th {ACM/IEEE} International Conference on Cyber-Physical Systems, {ICCPS} 2019, Montreal, QC, Canada},
  year         = {2019},
  pages        = {336--337},
  doi          = {10.1145/3302509.3313329},
  url          = {https://doi.org/10.1145/3302509.3313329},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/iccps/PettetMSDV19},
  category     = {poster},
  contribution = {lead},
  file         = {:Pettet2019-Incident_management_and_analysis_dashboard_for_fire_departments_ICCPS_demo.pdf:PDF},
  keywords     = {emergency management, incident analysis, prediction, dashboard, visualization, spatial-temporal data},
  project      = {smart-cities,smart-emergency-response},
  tag          = {incident},
  timestamp    = {Sun, 07 Apr 2019 16:25:36 +0200},
  what         = {This paper presents a dashboard tool for analyzing and managing spatial-temporal incidents in emergency response systems. The work integrates incident prediction models with interactive visualization capabilities to help emergency managers analyze historical incident distributions and plan resource deployment.},
  why          = {Emergency response systems require decision support tools that integrate data analytics with situational awareness. Existing tools often separate historical analysis from predictive modeling and dispatch planning. This dashboard integrates survival analysis-based incident prediction with interactive maps and statistical visualizations to enable comprehensive incident analysis and planning.},
  results      = {The paper demonstrates the dashboard through a case study analyzing incidents from Nashville's emergency services. The system displays historical incident density, predicted future incident distributions, and enables exploration of depot effects on response times. The interactive tool successfully integrates incident prediction with spatial planning capabilities.},
  project_tags = {emergency, planning, scalable AI}
}
This work presents a dashboard tool that helps emergency responders analyze and manage spatial-temporal incidents like crime and traffic accidents. It uses state-of-the-art statistical models to learn incident probabilities based on factors such as prior incidents, time and weather. The dashboard can then present historic and predicted incident distributions. It also allows responders to analyze how moving or adding depots (stations for emergency responders) affects average response times, and can make dispatching recommendations based on heuristics. Broadly, it is a one-stop tool that helps responders visualize historical data as well as plan for and respond to incidents.
@inproceedings{Pettet2019a,
  author       = {Pettet, Geoffrey and Sahoo, Saroj and Dubey, Abhishek},
  title        = {Towards an Adaptive Multi-Modal Traffic Analytics Framework at the Edge},
  booktitle    = {{IEEE} International Conference on Pervasive Computing and Communications Workshops, PerCom Workshops 2019, Kyoto, Japan, March 11-15, 2019},
  year         = {2019},
  pages        = {511--516},
  doi          = {10.1109/PERCOMW.2019.8730577},
  url          = {https://doi.org/10.1109/PERCOMW.2019.8730577},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/percom/PettetSD19},
  category     = {workshop},
  contribution = {lead},
  file         = {:Pettet2019a-Towards_an_Adaptive_Multi-Modal_Traffic_Analytics_Framework_at_the_Edge.pdf:PDF},
  keywords     = {traffic analysis, edge computing, object detection, vehicle tracking, IoT, real-time processing},
  project      = {cps-middleware,smart-transit,smart-cities},
  tag          = {platform,incident,transit},
  timestamp    = {Wed, 16 Oct 2019 14:14:54 +0200},
  what         = {This paper describes a multi-modal traffic analytics framework deployed at the edge for detecting and analyzing non-recurring congestion. The system integrates multiple object detection algorithms (YOLO, Faster-RCNN, SSD) with vehicle tracking to provide real-time traffic analysis with tradeoffs between accuracy and computational resource consumption.},
  why          = {Traffic congestion detection typically relies on loop detectors or centralized cloud processing, both of which have limitations. Edge-based multi-modal analytics enable real-time analysis close to data sources while reducing bandwidth consumption. The work addresses the challenge of selecting appropriate algorithms based on available resources and required accuracy.},
  results      = {The paper demonstrates a hierarchical traffic analytics workflow deployed on Raspberry Pi edge devices at intersections in Nashville. The system achieves different accuracy levels with various object detection algorithms and implements dynamic mode selection based on traffic conditions and available resources. Results show that edge-based analysis effectively detects non-recurring congestion.},
  project_tags = {transit, middleware, scalable AI, CPS}
}
The Internet of Things (IoT) requires distributed, large scale data collection via geographically distributed devices. While IoT devices typically send data to the cloud for processing, this is problematic for bandwidth constrained applications. Fog and edge computing (processing data near where it is gathered, and sending only results to the cloud) has become more popular, as it lowers network overhead and latency. Edge computing often uses devices with low computational capacity, therefore service frameworks and middleware are needed to efficiently compose services. While many frameworks use a top-down perspective, quality of service is an emergent property of the entire system and often requires a bottom up approach. We define services as multi-modal, allowing resource and performance tradeoffs. Different modes can be composed to meet an application’s high level goal, which is modeled as a function. We examine a case study for counting vehicle traffic through intersections in Nashville. We apply object detection and tracking to video of the intersection, which must be performed at the edge due to privacy and bandwidth constraints. We explore the hardware and software architectures, and identify the various modes. This paper lays the foundation to formulate the online optimization problem presented by the system which makes tradeoffs between the quantity of services and their quality constrained by available resources.
@inproceedings{Ramakrishna2019,
author = {Ramakrishna, Shreyas and Dubey, Abhishek and Burruss, Matthew P. and Hartsell, Charles and Mahadevan, Nagabhushan and Nannapaneni, Saideep and Laszka, Aron and Karsai, Gabor},
booktitle = {{IEEE} 22nd International Symposium on Real-Time Distributed Computing, {ISORC} 2019, Valencia, Spain, May 7-9, 2019},
title = {Augmenting Learning Components for Safety in Resource Constrained Autonomous Robots},
year = {2019},
pages = {108--117},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/isorc/RamakrishnaDBHM19},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/ISORC.2019.00032},
file = {:Ramakrishna2019-Augmenting_Learning_Components_for_Safety_in_Resource_Constrained_Autonomous_Robots.pdf:PDF},
keywords = {learning-enabled components, safety, autonomous robots, neural networks, simplex architecture, resource constraints},
project = {cps-autonomy},
tag = {ai4cps},
timestamp = {Wed, 16 Oct 2019 14:14:53 +0200},
url = {https://doi.org/10.1109/ISORC.2019.00032},
what = {This paper presents a framework for augmenting learning-enabled components with safety guarantees in resource-constrained autonomous robots. The work introduces weighted simplex strategies and context-sensitive weighted simplex approaches that enable integration of high-performance neural network controllers with safety supervisors.},
why = {Learning-enabled components in autonomous systems offer improved performance but lack formal safety guarantees, while traditional control systems are predictable but less adaptive. Resource-constrained robots cannot run complex learning models continuously. This work enables safe autonomous operation by combining neural network controllers with safety supervisors using weighted simplex strategies that account for system state and resource constraints.},
results = {The paper demonstrates the approach on an autonomous driving platform (DeepNNCar). Results show that the SW-Simplex and CSW-Simplex weighted strategies reduce safety violations by 40% and 60% respectively compared to using LECs alone, while maintaining higher optimized driving speeds. The framework successfully balances safety and performance through context-aware controller selection.},
project_tags = {ML for CPS, CPS, Explainable AI}
}
Learning enabled components (LECs) trained using data-driven algorithms are increasingly being used in autonomous robots commonly found in factories, hospitals, and educational laboratories. However, these LECs do not provide any safety guarantees, and testing them is challenging. In this paper, we introduce a framework that performs weighted simplex strategy based supervised safety control, resource management and confidence estimation of autonomous robots. Specifically, we describe two weighted simplex strategies: (a) simple weighted simplex strategy (SW-Simplex) that computes a weighted controller output by comparing the decisions between a safety supervisor and an LEC, and (b) a context-sensitive weighted simplex strategy (CSW-Simplex) that computes a context-aware weighted controller output. We use reinforcement learning to learn the contextual weights. We also introduce a system monitor that uses the current state information and a Bayesian network model learned from past data to estimate the probability of the robotic system staying in the safe working region. To aid resource constrained robots in performing complex computations of these weighted simplex strategies, we describe a resource manager that offloads tasks to available fog nodes. The paper also describes a hardware testbed called DeepNNCar, which is a low cost resource-constrained RC car, built to perform autonomous driving. Using the hardware, we show that both SW-Simplex and CSW-Simplex have 40% and 60% fewer safety violations, respectively, while demonstrating a higher optimized speed of around 0.40 m/s during indoor driving than the original system (using only LECs).
@inproceedings{Shekhar2019,
  author       = {Shekhar, Shashank and Chhokra, Ajay and Sun, Hongyang and Gokhale, Aniruddha and Dubey, Abhishek and Koutsoukos, Xenofon D.},
  title        = {Supporting fog/edge-based cognitive assistance IoT services for the visually impaired: poster abstract},
  booktitle    = {Proceedings of the International Conference on Internet of Things Design and Implementation, IoTDI 2019, Montreal, QC, Canada},
  year         = {2019},
  pages        = {275--276},
  doi          = {10.1145/3302505.3312592},
  url          = {https://doi.org/10.1145/3302505.3312592},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/iotdi/ShekharCSGDK19},
  category     = {poster},
  contribution = {minor},
  file         = {:Shekhar2019-Supporting_fog_edge-based_cognitive_assistance_IoT_services_for_the_visually_impaired_poster_abstract.pdf:PDF},
  keywords     = {fog computing, edge computing, resource management, IoT, cognitive assistance, latency-aware services},
  project      = {cps-middleware,smart-cities},
  tag          = {platform,transit},
  timestamp    = {Fri, 29 Mar 2019 00:00:00 +0100},
  what         = {This poster abstract presents ongoing work on Fog/Edge-based cognitive assistance IoT services for visually impaired users. The work proposes dynamic resource management middleware (URMILA) to enable reliable service execution while managing edge resource constraints and user mobility.},
  why          = {Cognitive assistance services for visually impaired users require low-latency processing of camera frames and audio feedback. Cloud-based solutions introduce unacceptable latency, while pure edge processing is limited by device resources and wireless connectivity. This work addresses the challenge of dynamically managing resources across fog and edge devices to maintain service quality under varying conditions.},
  results      = {The paper describes URMILA's architecture for dynamic resource management in IoT applications. The system enables service execution on appropriate edge/fog resources while accounting for user mobility, network latency, and device resource constraints. The work demonstrates practical service deployment considerations for latency-sensitive edge applications.},
  project_tags = {middleware, CPS, scalable AI}
}
The fog/edge computing paradigm is increasingly being adopted to support a variety of latency-sensitive IoT services, such as cognitive assistance to the visually impaired, due to its ability to assure the latency requirements of these services while continuing to benefit from the elastic properties of cloud computing. However, user mobility in such applications imposes a new set of challenges that must be addressed before such applications can be deployed and benefit the society. This paper presents ongoing work on a dynamic resource management middleware called URMILA that addresses these concerns. URMILA ensures that the service remains available despite user mobility and ensuing wireless connectivity issues by opportunistically leveraging both fog and edge resources in such a way that the latency requirements of the service are met while ensuring longevity of the battery life on the edge devices. We present the design principles of URMILA’s capabilities and a real-world cognitive assistance application that we have built and are testing on an emulated but realistic IoT testbed.
@inproceedings{Shekhar2019a,
  author       = {Shekhar, Shashank and Chhokra, Ajay and Sun, Hongyang and Gokhale, Aniruddha and Dubey, Abhishek and Koutsoukos, Xenofon D.},
  title        = {{URMILA:} {A} Performance and Mobility-Aware Fog/Edge Resource Management Middleware},
  booktitle    = {{IEEE} 22nd International Symposium on Real-Time Distributed Computing, {ISORC} 2019, Valencia, Spain, May 7-9, 2019},
  year         = {2019},
  pages        = {118--125},
  doi          = {10.1109/ISORC.2019.00033},
  url          = {https://doi.org/10.1109/ISORC.2019.00033},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/isorc/ShekharCSGDK19},
  category     = {selectiveconference},
  contribution = {minor},
  file         = {:Shekhar2019a-URMILA_A_Performance_and_Mobility-Aware_Fog_Edge_Resource_Management_Middleware.pdf:PDF},
  keywords     = {fog computing, edge computing, resource management, middleware, IoT, service latency, user mobility},
  project      = {cps-middleware},
  timestamp    = {Wed, 16 Oct 2019 14:14:53 +0200},
  what         = {This paper presents URMILA (Ubiquitous Resource Management for Interference and Latency-Aware services), a resource management middleware for IoT applications deployed across fog and edge infrastructure. The work addresses the challenge of maintaining service quality while managing resources across distributed edge devices with varying capabilities.},
  why          = {IoT applications increasingly span edge devices, fog resources, and cloud infrastructure, requiring dynamic resource management that accounts for latency, mobility, and energy constraints. Traditional cloud-centric approaches cannot handle the latency requirements of interactive IoT services, while pure edge solutions lack the flexibility to handle resource constraints and mobility. URMILA addresses these challenges through intelligent fog server selection and latency estimation.},
  results      = {The paper demonstrates URMILA's design and implementation through experimental evaluation of cognitive navigation and smart mobility services. Results show effective latency estimation and fog server selection based on user mobility and available resources. The middleware successfully maintains service quality while minimizing bandwidth consumption and energy usage.},
  project_tags = {middleware, CPS, scalable AI}
}
Fog/Edge computing is increasingly used to support a wide range of latency-sensitive Internet of Things (IoT) applications due to its elastic computing capabilities that are offered closer to the users. Despite this promise, IoT applications with user mobility face many challenges since offloading the application functionality from the edge to the fog may not always be feasible due to the intermittent connectivity to the fog, and could require application migration among fog nodes due to user mobility. Likewise, executing the applications exclusively on the edge may not be feasible due to resource constraints and battery drain. To address these challenges, this paper describes URMILA, a resource management middleware that makes effective tradeoffs between using fog and edge resources while ensuring that the latency requirements of the IoT applications are met. We evaluate URMILA in the context of a real-world use case on an emulated but realistic IoT testbed.
@inproceedings{Talusan2019,
  author       = {Talusan, Jose Paolo and Tiausas, Francis and Yasumoto, Keiichi and Wilbur, Michael and Pettet, Geoffrey and Dubey, Abhishek and Bhattacharjee, Shameek},
  title        = {Smart Transportation Delay and Resiliency Testbed Based on Information Flow of Things Middleware},
  booktitle    = {{IEEE} International Conference on Smart Computing, {SMARTCOMP} 2019, Washington, DC, USA, June 12-15, 2019},
  year         = {2019},
  pages        = {13--18},
  doi          = {10.1109/SMARTCOMP.2019.00022},
  url          = {https://doi.org/10.1109/SMARTCOMP.2019.00022},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/smartcomp/TalusanTYWPDB19},
  category     = {workshop},
  contribution = {colab},
  acceptance   = {29},
  file         = {:Talusan2019-Smart_Transportation_Delay_and_Resiliency_Testbed_Based_on_Information_Flow_of_Things_Middleware.pdf:PDF},
  keywords     = {smart transportation, testbed, distributed processing, blockchain, IFoT, middleware, delay emulation},
  project      = {cps-middleware,smart-transit},
  tag          = {platform,incident,transit},
  timestamp    = {Wed, 16 Oct 2019 14:14:54 +0200},
  what         = {This paper presents a testbed platform for evaluating smart transportation systems using the Information Flow of Things (IFoT) middleware. The work develops a distributed processing platform built on the IFoT framework to support testing of smart mobility services in realistic network conditions with emulated communication delays and failures.},
  why          = {Smart transportation systems require testing at scale with realistic network conditions and failure scenarios. Public blockchain networks introduce delays that make traditional development difficult, while public test networks rely on expensive mining. IFoT provides a distributed middleware approach that enables practical testing of smart transportation systems without these limitations.},
  results      = {The paper demonstrates a testbed implementing smart mobility services using the IFoT middleware and Ethereum blockchain. Results show effective handling of communication delays and network failures through distributed task processing. The testbed successfully emulates large-scale transportation networks with realistic delay characteristics.},
  project_tags = {transit, middleware, scalable AI}
}
Edge and Fog computing paradigms are used to process big data generated by the increasing number of IoT devices. These paradigms have enabled cities to become smarter in various aspects via real-time data-driven applications. While these have addressed some flaws of cloud computing some challenges remain particularly in terms of privacy and security. We create a testbed based on a distributed processing platform called the Information flow of Things (IFoT) middleware. We briefly describe a decentralized traffic speed query and routing service implemented on this framework testbed. We configure the testbed to test countermeasure systems that aim to address the security challenges faced by prior paradigms. Using this testbed, we investigate a novel decentralized anomaly detection approach for time-sensitive distributed smart transportation systems.
@article{Tu2019,
author = {Tu, H. and Du, Y. and Yu, H. and Dubey, Abhishek and Lukic, S. and Karsai, G.},
journal = {IEEE Transactions on Industrial Electronics},
title = {Resilient Information Architecture Platform for the Smart Grid ({RIAPS}): A Novel Open-Source Platform for Microgrid Control},
year = {2019},
issn = {1557-9948},
pages = {1--1},
contribution = {colab},
doi = {10.1109/TIE.2019.2952803},
file = {:Tu2019-Resilient_Information_Architecture_Platform_for_the_Smart_Grid(RIAPS)_A_Novel_Open-Source_Platform_for_Microgrid_Control.pdf:PDF},
keywords = {RIAPS, microGrid control, distributed control, smart grid, real-time systems, middleware},
project = {cps-middleware,cps-reliability,smart-energy},
tag = {decentralization,power},
what = {This paper introduces RIAPS (Resilient Information Architecture Platform for Smart Grid), an open-source platform for microGrid distributed control applications. The work provides a comprehensive architecture for implementing distributed microGrid control algorithms on heterogeneous hardware using a component-based development model with built-in real-time capabilities.},
why = {Microgrids require distributed control systems that coordinate voltage and frequency regulation across multiple distributed generators. Existing platforms either focus on simulation or lack the real-time capabilities and fault tolerance needed for practical deployment. RIAPS provides a complete platform with design-time tools and runtime services for distributed control while maintaining hard real-time deadlines.},
results = {The paper demonstrates RIAPS capabilities through implementation of distributed frequency and voltage regulation for microgrids. Results show synchronization and proportional power sharing among distributed generators with millisecond-level control action execution. The platform successfully demonstrates integrated simulation and hardware-in-the-loop testing of distributed microGrid control.},
project_tags = {energy, middleware, CPS}
}
Microgrids are seen as an effective way to achieve reliable, resilient, and efficient operation of the power distribution system. Core functions of the microgrid control system are defined by the IEEE standard 2030.7; however, the algorithms that realize these functions are not standardized, and are a topic of research. Furthermore, the corresponding controller hardware, operating system, and communication system to implement these functions vary significantly from one implementation to the next. In this paper, we introduce an open-source platform, Resilient Information Architecture Platform for the Smart Grid (RIAPS), ideally suited for implementing and deploying distributed microgrid control algorithms. RIAPS provides a design-time tool suite for development and deployment of distributed microgrid control algorithms. With support from a number of run-time platform services, developed algorithms can be easily implemented and deployed into real microgrids. To demonstrate the unique features of RIAPS, we propose and implement a distributed microgrid secondary control algorithm capable of synchronized and proportional compensation of voltage unbalance using distributed generators. Test results show the effectiveness of the proposed control and the salient features of the RIAPS platform.
@incollection{Walker2019,
author = {Walker, Michael A. and Schmidt, Douglas C. and Dubey, Abhishek},
pages = {155--179},
publisher = {Elsevier},
title = {Chapter Six - Testing at scale of {IoT} blockchain applications},
year = {2019},
volume = {115},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/ac/WalkerSD19},
booktitle = {Advances in Computers},
contribution = {colab},
doi = {10.1016/bs.adcom.2019.07.008},
file = {:Walker2019-Chapter_Six_Testing_at_Scale_of_IoT_Blockchain_Applications.pdf:PDF},
keywords = {blockchain testing, IoT applications, fault tolerance, testing at scale, domain-specific languages},
project = {cps-blockchains},
tag = {decentralization},
timestamp = {Tue, 12 Nov 2019 00:00:00 +0100},
url = {https://doi.org/10.1016/bs.adcom.2019.07.008},
what = {This book chapter discusses best practices for testing blockchain-based IoT applications at scale. The work covers PlatTIBART, a platform for transactive IoT blockchain applications, and describes design patterns, domain-specific languages, and tools for developing, deploying, and testing blockchain systems with fault tolerance.},
why = {Blockchain-based IoT applications face unique testing challenges: inability to repeat tests identically, reliance on public mining with delays, and difficulty testing network failure scenarios. Existing approaches lack systematic methodologies for testing these systems. The chapter addresses these gaps through domain-specific languages and testing frameworks that enable reproducible testing of blockchain IoT applications.},
results = {The paper presents PlatTIBART platform with a DSL for defining blockchain network deployments and an Observer pattern-based framework for fault tolerance. The work demonstrates practical testing methodologies for blockchain systems and IoT applications, enabling developers to validate applications before production deployment.},
project_tags = {energy, scalable AI}
}
Abstract Due to the ever-increasing adaptation of Blockchain technologies in the private, public, and business domains, both the use of Distributed Systems and the increased demand for their reliability has exploded recently, especially with their desired integration with Internet-of-Things devices. This has resulted in a lot of work being done in the fields of distributed system analysis and design, specifically in the areas of blockchain smart contract design and formal verification. However, the focus on formal verification methodologies has meant that less attention has been given toward more traditional testing methodologies, such as unit testing and integration testing. This includes a lack of full support by most, if not all, the major blockchain implementations for testing at scale, except on fully public test networks. This has several drawbacks, such as: (1) The inability to do repeatable testing under identical scenarios, (2) reliance upon public mining of blocks, which introduces unreasonable amounts of delay for a test driven development scenario that a private network could reduce or eliminate, and (3) the inability to design scenarios where parts of the network go down. In this chapter we discuss design, testing methodologies, and tools to allow Testing at Scale of IoT Blockchain Applications.
@incollection{Zhang2019,
author = {Zhang, Peng and Schmidt, Douglas C. and White, Jules and Dubey, Abhishek},
pages = {181--209},
publisher = {Elsevier},
title = {Chapter Seven - Consensus mechanisms and information security technologies},
year = {2019},
volume = {115},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/ac/0034SWD19},
booktitle = {Advances in Computers},
contribution = {colab},
doi = {10.1016/bs.adcom.2019.05.001},
file = {:Zhang2019-Chapter_Seven-Consensus_mechanisms_and_information_security_technologies.pdf:PDF},
keywords = {consensus mechanisms, blockchain, distributed ledgers, security protocols, proof-of-work, proof-of-stake},
project = {cps-blockchains},
tag = {decentralization},
timestamp = {Tue, 12 Nov 2019 00:00:00 +0100},
url = {https://doi.org/10.1016/bs.adcom.2019.05.001},
what = {This paper presents a chapter on consensus mechanisms and information security technologies used in distributed ledger technology (DLT). The work surveys Byzantine consensus, non-Byzantine consensus mechanisms, and cryptographic security protocols including proof-of-work, proof-of-stake, and various other consensus approaches used in public and permissioned blockchains.},
why = {Distributed ledger systems require both consensus mechanisms to achieve agreement on shared state and security protocols to protect against attacks. Understanding the range of available mechanisms is critical for selecting appropriate approaches for different application domains. This comprehensive survey enables informed decision-making about consensus and security technologies for DLT systems.},
results = {The paper provides detailed analysis of consensus mechanisms across public blockchains (Bitcoin, Ethereum, Litecoin) and permissioned systems. Results characterize tradeoffs in decentralization, scalability, randomness, and resilience to attacks. The work systematically evaluates consensus approaches for Byzantine and non-Byzantine failure models.},
project_tags = {Explainable AI}
}
Distributed Ledger Technology (DLT) helps maintain and distribute predefined types of information and data in a decentralized manner. It removes the reliance on a third-party intermediary, while securing information exchange and creating shared truth via transaction records that are hard to tamper with. The successful operation of DLT stems largely from two computer science technologies: consensus mechanisms and information security protocols. Consensus mechanisms, such as Proof of Work (PoW) and Raft, ensure that the DLT network collectively agrees on contents stored in the ledger. Information security protocols, such as encryption and hashing, protect data integrity and safeguard data against unauthorized access. The most popular incarnation of DLT has been used in cryptocurrencies, such as Bitcoin and Ethereum, through public blockchains, which requires the application of more robust consensus protocols across the entire network. An example is PoW, which has been employed by Bitcoin, but which is also highly energy inefficient. Other forms of DLT include consortium and private blockchains where networks are configured within federated entities or a single organization, in which case less energy intensive consensus protocols (such as Raft) would suffice. This chapter surveys existing consensus mechanisms and information security technologies used in DLT.
@inproceedings{Zhang2019a,
author = {Zhang, Yue and Eisele, Scott and Dubey, Abhishek and Laszka, Aron and Srivastava, Anurag K.},
booktitle = {7th Workshop on Modeling and Simulation of Cyber-Physical Energy Systems, MSCPES@CPSIoTWeek 2019, Montreal, QC, Canada},
title = {Cyber-Physical Simulation Platform for Security Assessment of Transactive Energy Systems},
year = {2019},
pages = {1--6},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/cpsweek/ZhangEDLS19},
category = {workshop},
contribution = {colab},
doi = {10.1109/MSCPES.2019.8738802},
file = {:Zhang2019a-Cyber_Physical_Simulation_Platform_for_Security_Assessment_of_Transactive_Energy_Systems.pdf:PDF},
keywords = {transactive energy systems, blockchain, market simulation, cyber security, smart grid, testbed},
project = {transactive-energy,cps-reliability},
tag = {platform,decentralization,power},
timestamp = {Wed, 16 Oct 2019 14:14:56 +0200},
url = {https://doi.org/10.1109/MSCPES.2019.8738802},
what = {This paper presents a cyber-physical simulation platform for security assessment of transactive energy systems. The work develops TESST (Transactive Energy Security Simulation Testbed) to evaluate security concerns in energy trading with blockchain-based market mechanisms and demonstrates simulation of cyber attacks on transactive systems.},
why = {Transactive energy systems using blockchain for decentralized trading face security challenges not present in traditional grid operations. Cyber attacks targeting market participants, price manipulation, or denial-of-service could have serious consequences. The paper addresses this gap by providing a simulation testbed that enables evaluation of security measures and attack scenarios.},
results = {The paper demonstrates TESST with centralized and decentralized market clearing mechanisms. Results show how cyber attacks targeting bid price manipulation or system disruption affect energy trading outcomes. The simulation platform successfully evaluates security countermeasures for blockchain-enabled transactive energy systems.},
project_tags = {energy, CPS, Explainable AI}
}
Transactive energy systems (TES) are emerging as a transformative solution for the problems that distribution system operators face due to an increase in the use of distributed energy resources and rapid growth in scalability of managing active distribution system (ADS). On the one hand, these changes pose a decentralized power system control problem, requiring strategic control to maintain reliability and resiliency for the community and for the utility. On the other hand, they require robust financial markets while allowing participation from diverse prosumers. To support the computing and flexibility requirements of TES while preserving privacy and security, distributed software platforms are required. In this paper, we enable the study and analysis of security concerns by developing Transactive Energy Security Simulation Testbed (TESST), a TES testbed for simulating various cyber attacks. In this work, the testbed is used for TES simulation with centralized clearing market, highlighting weaknesses in a centralized system. Additionally, we present a blockchain enabled decentralized market solution supported by distributed computing for TES, which on one hand can alleviate some of the problems that we identify, but on the other hand, may introduce newer issues. Future study of these differing paradigms is necessary and will continue as we develop our security simulation testbed.
@inproceedings{DuTu2018,
author = {Du, Y. and Tu, H. and Lukic, S. and Lubkeman, D. and Dubey, Abhishek and Karsai, G.},
booktitle = {2018 IEEE Electronic Power Grid (eGrid)},
title = {Development of a Controller Hardware-in-the-Loop Platform for Microgrid Distributed Control Applications},
year = {2018},
month = nov,
pages = {1--6},
category = {selectiveconference},
contribution = {minor},
doi = {10.1109/eGRID.2018.8598696},
file = {:DuTu2018-Development_of_a_Controller_Hardware-in-the-Loop_Platform_for_Microgrid_Distributed_Control_Applications.pdf:PDF},
keywords = {microgrid control, hardware-in-the-loop, distributed control, real-time simulation, validation},
tag = {power},
what = {This paper presents the CHIL (Controller Hardware-in-the-Loop) platform for testing distributed microGrid control algorithms. The work demonstrates integration of real-time simulators, hardware-in-the-loop devices, and controller-in-the-loop systems to validate distributed control strategies with realistic network delays and hardware interactions.},
why = {MicroGrid distributed control algorithms are difficult to validate through pure simulation due to complexity of hardware interactions and communication delays. Hardware-in-the-loop testing enables realistic evaluation but requires careful coordination of multiple components. This paper addresses these challenges through an integrated CHIL platform that combines simulation and real hardware.},
results = {The paper demonstrates CHIL platform testing of distributed frequency and voltage regulation for microgrids. Results show successful synchronization and power sharing with measured communication delays through Modbus protocol. The platform effectively validates distributed control algorithms with realistic hardware and communication constraints.},
project_tags = {energy, CPS}
}
Microgrids (MGs) are ideally suited for distributed control solutions. However, implementation and validation of the developed distributed control algorithms are quite challenging. In this paper we propose a Controller Hardware-in-the-Loop (CHIL) platform for MG distributed control applications that satisfies the requirements of IEEE Std. 2030.7 for MG control systems. We describe two main features of the proposed platform: 1) a software platform that enables the implementation of control algorithms that have been developed analytically and 2) a real-time MG testbed that replicates a practical MG operation environment by using real-time communication network and grid solutions. Implementation and validation of a distributed MG synchronization operation control strategy are used to demonstrate the performance of the proposed CHIL platform.
@inproceedings{Du2018,
author = {Du, Y. and Tu, H. and Lukic, S. and Dubey, Abhishek and Karsai, G.},
booktitle = {2018 IEEE Energy Conversion Congress and Exposition (ECCE)},
title = {Distributed Microgrid Synchronization Strategy Using a Novel Information Architecture Platform},
year = {2018},
month = sep,
pages = {2060--2066},
category = {conference},
contribution = {minor},
doi = {10.1109/ECCE.2018.8557695},
file = {:Du2018-Distributed_Microgrid_Synchronization_Strategy_Using_a_Novel_Information_Architecture_Platform.pdf:PDF},
issn = {2329-3721},
keywords = {microgrid synchronization, distributed control, frequency regulation, voltage regulation, RIAPS, real-time systems},
project = {cps-middleware,cps-reliability,smart-energy},
tag = {power},
what = {This paper presents a distributed microGrid synchronization control strategy using the RIAPS platform. The work proposes frequency and voltage regulation controllers that coordinate distributed generators through sparse communication networks and demonstrates implementation on practical hardware with real-time execution.},
why = {Islanded microgrids require distributed control to maintain frequency and voltage without a centralized authority. Traditional centralized control is infeasible when the grid separates from the main system. This paper addresses the challenge by developing distributed control strategies that coordinate generators using local measurements and sparse communication.},
results = {The paper demonstrates successful implementation of distributed frequency/voltage regulation on RIAPS platform with real-time execution. Results show synchronization and proportional active power sharing among generators with millisecond-level control actions. The work validates distributed control through both simulation and hardware-in-the-loop testing.},
project_tags = {energy, middleware, CPS}
}
To seamlessly reconnect an islanded microgrid to the main grid, voltage phasors on both sides of the point of common coupling need to be synchronized before the main relay closes. In this paper, a distributed control strategy is proposed for microgrid synchronization operation. The proposed controller design utilizes pinning-based consensus algorithm to avoid system single point of failure. It is able to actively track the main grid frequency, provide a good coordination between frequency and phase regulation and ensure all distributed generations in the system proportionally share the load. Implementation of such a distributed algorithm in practice is difficult because it requires mitigation of both distributed computing and power system engineering challenges. In this paper, a novel software platform called RIAPS platform is presented that helps implement the proposed distributed synchronization strategy in practical hardware controllers. The performance of the controllers is validated using a real-time controller hardware-in-the-loop microgrid testbed.
@inproceedings{Tu2018,
author = {Tu, H. and Du, Y. and Yu, H. and Lukic, S. and Metelko, M. and Volgyesi, P. and Dubey, Abhishek and Karsai, G.},
booktitle = {2018 IEEE Energy Conversion Congress and Exposition (ECCE)},
title = {A Hardware-in-the-Loop Real-Time Testbed for Microgrid Hierarchical Control},
year = {2018},
month = sep,
pages = {2053--2059},
category = {conference},
contribution = {minor},
doi = {10.1109/ECCE.2018.8557737},
file = {:Tu2018-A_Hardware-in-the-Loop_Real-Time_Testbed_for_Microgrid_Hierarchical_Control.pdf:PDF},
issn = {2329-3721},
keywords = {microgrid control, hardware-in-the-loop testing, real-time simulation, distributed control, power electronics, FPGA, inverter control},
project = {cps-middleware,smart-energy},
tag = {platform,power},
what = {This paper presents a hardware-in-the-loop real-time testbed for microgrid hierarchical control that integrates FPGA-based solvers, CPU-based solvers, and real-time simulators. The testbed enables comprehensive validation of primary and secondary control algorithms in microgrids by bridging simulation accuracy with practical hardware constraints. The system uses Opal-RT simulators, RIAPS middleware, and Beaglebone Black boards to test both grid-tied and islanded microgrid operations under realistic communication delays and synchronization scenarios.},
why = {This work is significant because it addresses the gap between high-fidelity simulation and real-world microgrid implementation by providing a practical validation platform that accounts for measurement noise, communication delays, and hardware constraints. Traditional simulations cannot capture real communication latencies and synchronization issues, while pure hardware testing is expensive. The testbed enables researchers to rapidly prototype and validate microgrid controllers before deployment, accelerating the development of resilient distributed energy systems.},
results = {The testbed successfully demonstrated voltage and frequency regulation in both current-mode and voltage-mode inverter control scenarios. Testing showed that unintentional islanding events were properly detected and handled through the secondary control layer. The system validated communication delays ranging from 50 to 200 milliseconds and demonstrated stable operation of distributed secondary control algorithms. Results confirmed the effectiveness of the hierarchical control architecture for managing power balance during grid transitions.},
project_tags = {middleware, CPS, scalable AI}
}
To maintain a stable, flexible and economic operation of a microgrid, a hierarchical control architecture consisting of primary, secondary and tertiary control is proposed. However, the differences in dynamics of microgrid, bandwidths of control levels and speed of communication channels make it difficult to comprehensively validate the performance of the hierarchical control schemes. In this paper we propose a hardware-in-the-loop real-time testbed for microgrid hierarchical control. The proposed testbed can be used to validate control performance under different microgrid operating modes (grid-tied or islanded), different primary control schemes (current or voltage mode) and different secondary control approaches (centralized or distributed). The integration of industry-grade hardware that runs primary and secondary control into the testbed allows for complete emulation of microgrid operation, and facilitates the study of the effects of measurement noise, sampling and communication delays.
@inproceedings{Nannapaneni2018,
author = {Nannapaneni, Saideep and Mahadevan, Sankaran and Dubey, Abhishek},
booktitle = {Proceedings of ASME 2018 13th International Manufacturing Science and Engineering Conference},
title = {Real-Time Control of Cyber-Physical Manufacturing Process Under Uncertainty},
year = {2018},
month = jun,
note = {Volume 3: Manufacturing Equipment and Systems; paper no. V003T02A001},
series = {International Manufacturing Science and Engineering Conference},
volume = {3},
category = {conference},
contribution = {minor},
doi = {10.1115/MSEC2018-6460},
eprint = {https://asmedigitalcollection.asme.org/MSEC/proceedings-pdf/MSEC2018/51371/V003T02A001/2520174/v003t02a001-msec2018-6460.pdf},
keywords = {reliability},
project = {cps-reliability},
tag = {platform}
}
Modern manufacturing processes are increasingly becoming cyber-physical in nature, where a computational system monitors the system performance, provides real-time process control by analyzing sensor data collected regarding process and product characteristics, in order to increase the quality of the manufactured product. Such real-time process monitoring and control techniques are useful in precision and ultra-precision machining processes. However, the output product quality is affected by several uncertainty sources in various stages of the manufacturing process such as the sensor uncertainty, computational system uncertainty, control input uncertainty, and the variability in the manufacturing process. The computational system may be a single computing node or a distributed computing network; the latter scenario introduces additional uncertainty due to the communication between several computing nodes. Due to the continuous monitoring process, these uncertainty sources aggregate and compound over time, resulting in variations of product quality. Therefore, characterization of the various uncertainty sources and their impact on the product quality are necessary to increase the efficiency and productivity of the overall manufacturing process. To this end, this paper develops a two-level dynamic Bayesian network methodology, where the higher level captures the uncertainty in the sensors, control inputs, and the manufacturing process while the lower level captures the uncertainty in the communication between several computing nodes. In addition, we illustrate the use of a variance-based global sensitivity analysis approach for dimension reduction in a high-dimensional manufacturing process, in order to enable real-time analysis for process control. The proposed methodologies of process control under uncertainty and dimension reduction are illustrated for a cyber-physical turning process.
@article{Pradhan2018,
author = {Pradhan, Subhav and Dubey, Abhishek and Khare, Shweta and Nannapaneni, Saideep and Gokhale, Aniruddha and Mahadevan, Sankaran and Schmidt, Douglas C. and Lehofer, Martin},
journal = {ACM Transactions on Cyber-Physical Systems},
title = {CHARIOT: Goal-Driven Orchestration Middleware for Resilient IoT Systems},
year = {2018},
issn = {2378-962X},
month = jun,
number = {3},
volume = {2},
address = {New York, NY, USA},
articleno = {16},
contribution = {lead},
doi = {10.1145/3134844},
issue_date = {July 2018},
keywords = {IoT systems, edge computing, middleware, autonomous management, constraint solving, service reliability},
numpages = {37},
project = {cps-middleware,cps-reliability},
publisher = {Association for Computing Machinery},
tag = {ai4cps,platform},
what = {This paper introduces CHARIOT, a goal-driven orchestration middleware designed for resilient IoT systems managing edge computing resources. CHARIOT provides a three-layer architecture comprising a design layer with system description language, a data layer with persistent storage, and a management layer with autonomous reconfiguration engines. The system uses Satisfiability Modulo Theories solvers to dynamically compute optimal system configurations that satisfy both functional requirements and resource constraints.},
why = {CHARIOT addresses critical challenges in edge computing where resource availability is dynamic and unpredictable, requiring autonomous management without centralized control. Traditional cloud-based approaches fail for IoT systems with resource constraints and distributed nature. This work is innovative because it combines formal constraint solving with practical middleware to enable self-adaptive systems that maintain service availability despite component failures and changing conditions.},
results = {The paper demonstrates CHARIOT's effectiveness through a smart parking system case study that automatically manages edge node deployments and handles resource-constrained conditions. The system successfully allocated virtual machines to edge nodes based on predicted workload demands, adapted to node failures through dynamic reconfiguration, and maintained service availability despite resource fluctuations. Experiments showed the middleware could efficiently compute optimal configurations within required time constraints.},
project_tags = {middleware, CPS, emergency}
}
An emerging trend in Internet of Things (IoT) applications is to move the computation (cyber) closer to the source of the data (physical). This paradigm is often referred to as edge computing. If edge resources are pooled together, they can be used as decentralized shared resources for IoT applications, providing increased capacity to scale up computations and minimize end-to-end latency. Managing applications on these edge resources is hard, however, due to their remote, distributed, and (possibly) dynamic nature, which necessitates autonomous management mechanisms that facilitate application deployment, failure avoidance, failure management, and incremental updates. To address these needs, we present CHARIOT, which is orchestration middleware capable of autonomously managing IoT systems consisting of edge resources and applications. CHARIOT implements a three-layer architecture. The topmost layer comprises a system description language, the middle layer comprises a persistent data storage layer and the corresponding schema to store system information, and the bottom layer comprises a management engine that uses information stored persistently to formulate constraints that encode system properties and requirements, thereby enabling the use of satisfiability modulo theory solvers to compute optimal system (re)configurations dynamically at runtime. This article describes the structure and functionality of CHARIOT and evaluates its efficacy as the basis for a smart parking system case study that uses sensors to manage parking spaces.
@inproceedings{Tu2018a,
author = {Tu, H. and Du, Y. and Yu, H. and Lukic, S. and Volgyesi, P. and Metelko, M. and Dubey, Abhishek and Karsai, G.},
booktitle = {2018 9th IEEE International Symposium on Power Electronics for Distributed Generation Systems (PEDG)},
title = {An Adaptive Interleaving Algorithm for Multi-Converter Systems},
year = {2018},
month = jun,
pages = {1--7},
category = {conference},
contribution = {minor},
doi = {10.1109/PEDG.2018.8447801},
file = {:Tu2018a-An_Adaptive_Interleaving_Algorithm_for_Multi-Converter_Systems.pdf:PDF},
issn = {2329-5767},
keywords = {DC converters, interleaving, harmonic reduction, time synchronization, distributed control, power electronics},
project = {cps-middleware,cps-reliability,smart-energy},
tag = {power},
what = {This paper proposes an adaptive interleaving algorithm for multi-converter DC systems that minimizes current harmonics at the switching frequency. The algorithm dynamically adjusts phase shifts between converter outputs based on measured current magnitudes to eliminate fundamental frequency components. Implementation on the RIAPS platform demonstrates end-to-end time synchronization across distributed nodes using both hardware-based PTP and software-based Precision Time Protocol.},
why = {Multi-converter systems increasingly replace traditional single converters in renewable energy integration, but managing harmonic distortion across multiple converters is challenging, especially in distributed architectures where converters lack direct communication links. The work is innovative because it provides a decentralized algorithm requiring only local measurements that achieves millisecond-level synchronization accuracy, enabling safe parallel operation of converter units without centralized control.},
results = {The proposed interleaving algorithm successfully reduced current harmonics by properly phase-shifting converter outputs based on current magnitudes. Real-time synchronization tests demonstrated sub-microsecond timing accuracy across BeagleBone Black boards using synchronized pulse generators. The RIAPS implementation proved capable of coordinating multiple converter controllers through distributed messaging, achieving stable multi-converter operation with synchronized switching patterns.},
project_tags = {middleware, energy}
}
To integrate DC distributed generation (DG) with micro-source into the existing AC grid, a DC distribution bus can be used to couple on-site photovoltaics (PV), battery energy storage systems (BESS), and DC loads. If the converters connected to the DC bus are interleaved, the DC bus capacitor size could be minimized. In this paper, we propose an interleaving algorithm for multi-converter systems to minimize the current harmonics at switching frequency on the DC bus. The proposed algorithm is implemented using Resilient Information Architecture Platform for Smart Grid (RIAPS) platform. Hardware-in-the-Loop (HIL) simulation results based on Opal-RT are presented to validate its performance. The influence of synchronization frequency on the proposed algorithm is also considered.
@inproceedings{DuTu2018a,
author = {Du, Y. and Tu, H. and Lukic, S. and Lubkeman, D. and Dubey, Abhishek and Karsai, G.},
booktitle = {2018 IEEE/PES Transmission and Distribution Conference and Exposition (T\&D)},
title = {Resilient Information Architecture Platform for Smart Systems (RIAPS): Case Study for Distributed Apparent Power Control},
year = {2018},
month = apr,
pages = {1--5},
category = {selectiveconference},
contribution = {minor},
doi = {10.1109/TDC.2018.8440324},
file = {:DuTu2018a-Resilient_Information_Architecture_Platform_for_Smart_Systems_Case_Study_Distributed_Apparent_Power_Control.pdf:PDF},
issn = {2160-8563},
keywords = {microgrid control, apparent power utilization, droop control, distributed algorithm, power sharing},
tag = {platform},
what = {This paper presents a distributed apparent power control algorithm implemented on the RIAPS platform for maintaining voltage and frequency stability in islanded microgrids. The algorithm shifts the droop curve parameters based on proportional power sharing principles to ensure all distributed generators contribute equally to stabilizing the system. The work includes comprehensive implementation details on RIAPS nodes with secondary control and resynchronization mechanisms.},
why = {Islanded microgrids present significant control challenges because generators must automatically coordinate without centralized controllers, and traditional droop control can lead to unequal power sharing and stability issues. This work is innovative because it combines optimal apparent power utilization with practical distributed implementation on a middleware platform, enabling microgrids to maximize efficiency while maintaining voltage and frequency stability.},
results = {The distributed control algorithm successfully maintained system stability during islanded operation while ensuring proportional power sharing among distributed generators. Simulation and hardware testing demonstrated that active and reactive power generation was optimally redistributed to utilize available capacity. The resynchronization mechanism successfully reconnected the microgrid to the main grid without instability, and secondary control effectively compensated for frequency and voltage deviations.},
project_tags = {energy, middleware, CPS}
}
Maintaining voltage and frequency stability in an islanded microgrid is challenging, due to the low system inertia. In addition, islanded microgrids have limited generation capability, requiring that all DGs contribute proportionally to meet the system power consumption. This paper proposes a distributed control algorithm for optimal apparent power utilization in islanded microgrids. The developed algorithm improves system apparent power utilization by maintaining proportional power sharing among DGs. A decentralized platform called Resilient Information Architecture Platform for Smart Systems (RIAPS) is introduced that runs on processors embedded within the DGs. The proposed algorithm is fully implemented in RIAPS platform and validated on a real-time microgrid testbed.
@inproceedings{Purohit2018,
author = {Purohit, H. and Nannapaneni, S. and Dubey, Abhishek and Karuna, P. and Biswas, G.},
booktitle = {2018 IEEE International Science of Smart City Operations and Platforms Engineering in Partnership with Global City Teams Challenge (SCOPE-GCTC)},
title = {Structured Summarization of Social Web for Smart Emergency Services by Uncertain Concept Graph},
year = {2018},
month = apr,
pages = {30--35},
category = {workshop},
contribution = {colab},
doi = {10.1109/SCOPE-GCTC.2018.00012},
file = {:Purohit2018-Structured_Summarization_of_Social_Web_for_Smart_Emergency_Services_by_Uncertain_Concept_Graph.pdf:PDF},
keywords = {emergency response, social media analysis, situational awareness, resource allocation, information extraction},
tag = {decentralization,incident},
what = {This paper develops a structured summarization framework for extracting critical information from social media streams during emergency events. The work introduces Uncertain Concept Graphs to model spatial-temporal relationships between information sources, resources, and incidents by capturing uncertainties in data extraction. The system uses Natural Language Processing to infer resource demands and optimize emergency service dispatch across affected regions.},
why = {Emergency response systems critically depend on timely and accurate situational awareness from heterogeneous data sources including social media, but information extraction from tweets and posts introduces significant uncertainty about reliability and accuracy. This work is innovative because it explicitly models extraction uncertainties using probabilistic graph structures, enabling emergency managers to make better resource allocation decisions despite noisy and incomplete information.},
results = {The framework successfully modeled disaster scenarios using Uncertain Concept Graphs that captured incident locations, resource requirements, and service availability with associated confidence scores. The system identified critical resource shortfalls and optimized dispatch decisions to minimize response time while accounting for travel uncertainties. Evaluation on hurricane disaster data showed the approach could infer regional resource needs and help coordinate emergency services more effectively.},
project_tags = {emergency, planning}
}
The Web has empowered emergency services to enhance operations by collecting real-time information about incidents from diverse data sources such as social media. However, the high volume of unstructured data from the heterogeneous sources with varying degrees of veracity challenges the timely extraction and integration of relevant information to summarize the current situation. Existing work on event detection and summarization on social media relates to this challenge of timely extraction of information during an evolving event. However, it is limited in both integrating incomplete information from diverse sources and using the integrated information to dynamically infer knowledge representation of the situation that captures optimal actions (e.g., allocate available finite ambulances to incident regions). In this paper, we present a novel concept of an Uncertain Concept Graph (UCG) that is capable of representing dynamic knowledge of a disaster event from heterogeneous data sources, particularly for the regions of interest, and resources/services required. The information sources, incident regions, and resources (e.g., ambulances) are represented as nodes in UCG, while the edges represent the weighted relationships between these nodes. We then propose a solution for probabilistic edge inference between nodes in UCG. We model a novel optimization problem for the edge assignment between a service resource to a region node over time trajectory. The output of such structured summarization over time can be valuable for modeling event dynamics in the real world beyond emergency management, across different smart city operations such as transportation.
@article{Chhokra2018a,
author = {Chhokra, Ajay and Dubey, Abhishek and Mahadevan, Nagabhushan and Karsai, Gabor and Balasubramanian, Daniel and Hasan, Saqib},
journal = {International Journal of Prognostics and Health Management},
title = {Hierarchical Reasoning about Faults in Cyber-Physical Energy Systems using Temporal Causal Diagrams},
year = {2018},
month = feb,
number = {1},
volume = {9},
attachments = {https://www.isis.vanderbilt.edu/sites/default/files/ijphm_18_001_0.pdf},
contribution = {colab},
file = {:Chhokra2018a-Hierarchical_Reasoning_about_Faults_in_Cyber-Physical_Energy_Systems_using_Temporal_Causal_Diagrams.pdf:PDF},
keywords = {fault diagnosis, temporal causal diagrams, power systems, fault propagation, cyber-physical systems},
tag = {platform,power},
url = {https://www.phmsociety.org/node/2290},
what = {This paper presents hierarchical reasoning about faults in cyber-physical energy systems using Temporal Causal Diagrams that augment failure models with discrete and continuous dynamics. The approach models fault propagation across physical and cyber components by tracking how anomalies detected by protection devices lead to state changes and behavioral effects. The system uses local observers and a reasoning engine to generate system-level hypotheses explaining observed anomalies without requiring global perspective.},
why = {Cyber-physical systems like power grids present complex fault scenarios where failures in one component cascade through the system via protection device interactions. Traditional fault diagnosis approaches fail because they ignore how protection devices themselves can cause secondary failures. This work is innovative because it models fault propagation as behavioral changes in cyber-physical components, enabling diagnosis systems to explain cascading failures and identify root causes.},
results = {The approach successfully diagnosed cascading failures in power systems by tracking fault propagation through protection devices and actuators. Temporal Causal Diagram models accurately predicted failure modes and discrepancies under various system conditions. The reasoning engine generated correct system-level hypotheses consistent with observed anomalies without missing any dangerous contingencies.},
project_tags = {CPS, Explainable AI}
}
The resiliency and reliability of critical cyber physical systems like electrical power grids are of paramount importance. These systems are often equipped with specialized protection devices to detect anomalies and isolate faults in order to arrest failure propagation and protect the healthy parts of the system. However, due to the limited situational awareness and hidden failures of the protection devices themselves, through their operation (or mis-operation) they may cause overloading and the disconnection of parts of an otherwise healthy system. This can result in cascading failures that lead to a blackout. Diagnosis of failures in such systems is extremely challenging because of the need to account for faults in both the physical systems as well as the protection devices, and the failure-effect propagation across the system. Our approach for diagnosing such cyber-physical systems is based on the concept of Temporal Causal Diagrams (TCD-s) that capture the timed discrete models of protection devices and their interactions with a system failure propagation graph. In this paper we present a refinement of the TCD language with a layer of independent local observers that aid in diagnosis. We describe a hierarchical two-tier failure diagnosis approach and showcase the results for 4 different scenarios involving both cyber and physical faults in a standard Western System Coordinating Council (WSCC) 9 bus system.
@inproceedings{Barbour2018,
author = {Barbour, William and Samal, Chinmaya and Kuppa, Shankara and Dubey, Abhishek and Work, Daniel B.},
booktitle = {21st International Conference on Intelligent Transportation Systems, {ITSC} 2018, Maui, HI, USA, November 4-7, 2018},
title = {On the Data-Driven Prediction of Arrival Times for Freight Trains on {U.S.} Railroads},
year = {2018},
month = nov,
pages = {2289--2296},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/itsc/BarbourSKDW18},
category = {selectiveconference},
contribution = {minor},
doi = {10.1109/ITSC.2018.8569406},
file = {:Barbour2018-On_the_Data-Driven_Prediction_of_Arrival_Times_for_Freight_Trains_on_U.S._Railroads.pdf:PDF},
keywords = {machine learning, train scheduling, arrival time prediction, transportation systems, deep learning},
project = {smart-transit,cps-reliability,smart-cities},
tag = {transit},
timestamp = {Wed, 16 Oct 2019 14:14:57 +0200},
url = {https://doi.org/10.1109/ITSC.2018.8569406},
what = {This paper develops machine learning models to predict freight train arrival times using historical operational data from US railroads. The work compares multiple approaches including support vector regression, random forest, and deep neural networks trained on extensive historical train data covering physical characteristics, crew information, network state, and occupancy metrics. The models account for the high variability in train operations caused by congestion, infrastructure constraints, and scheduling factors.},
why = {Accurate prediction of freight train arrival times is critical for railroad operations, enabling better scheduling, reducing costs, and improving efficiency. Traditional analytical approaches fail to capture the complex relationships between numerous operational factors and actual train delays. This work is innovative because it leverages large historical datasets and modern machine learning to achieve significantly better prediction accuracy than statistical baselines, enabling data-driven operational planning.},
results = {The deep neural network model achieved 26% error reduction compared to statistical baseline predictors, with effective prediction of discrete arrival times along train routes. Support vector regression and random forest models also demonstrated strong performance, significantly outperforming baseline approaches on a five-year historical dataset of over 170,000 labeled examples. The results enable railroad operators to make more informed decisions about crew scheduling and train coordination.},
project_tags = {transit, ML for CPS}
}
The high capacity utilization and the predominantly single-track network topology of freight railroads in the United States causes large variability and unpredictability of train arrival times. Predicting accurate estimated times of arrival (ETAs) is an important step for railroads to increase efficiency and automation, reduce costs, and enhance customer service. We propose using machine learning algorithms trained on historical railroad operational data to generate ETAs in real time. The machine learning framework is able to utilize the many data points produced by individual trains traversing a network track segment and generate periodic ETA predictions with a single model. In this work we compare the predictive performance of linear and non-linear support vector regression, random forest regression, and deep neural network models, tested on a section of the railroad in Tennessee, USA using over two years of historical data. Support vector regression and deep neural network models show similar results with maximum ETA error reduction of 26% over a statistical baseline predictor. The random forest models show over 60% error reduction compared to baseline at some points and average error reduction of 42%.
@misc{Basak2018,
  author        = {Basak, Sanchita and Sengupta, Saptarshi and Dubey, Abhishek},
  title         = {A Data-driven Prognostic Architecture for Online Monitoring of Hard Disks Using Deep {LSTM} Networks},
  journal       = {CoRR},
  volume        = {abs/1810.08985},
  year          = {2018},
  eprint        = {1810.08985},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1810.08985},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1810-08985},
  timestamp     = {Wed, 31 Oct 2018 00:00:00 +0100},
  contribution  = {lead},
  file          = {:Basak2018-A_Data-driven_Prognostic_Architecture_for_Online_Monitoring_of_Hard_Disks_Using_Deep_LSTM_Networks.pdf:PDF},
  tag           = {ai4cps},
  keywords      = {remaining useful life prediction, deep learning, LSTM networks, hard disk failures, cloud systems},
  what          = {This paper proposes a two-layered architecture for predicting remaining useful life of hard disk drives in cloud systems using Deep LSTM networks. The system combines data-driven anomaly detection for early failure identification with online prediction mechanisms using transfer learning. The approach handles the challenge of predicting disk failures without overlap between training and test data by using pre-trained models that are continuously updated with new failure patterns.},
  why           = {Cloud system reliability critically depends on predicting hard disk failures before they cause service outages, but disk failure prediction is challenging due to highly imbalanced data, variable failure modes, and lack of clear failure indicators. This work is innovative because it combines offline pre-training with online transfer learning to continuously adapt failure prediction models without future knowledge contamination, enabling practical deployment in cloud environments.},
  results       = {The two-layered architecture achieved 84.35% accuracy in predicting remaining useful life with RUL near critical failure zones, enabling proactive disk replacement decisions. The system successfully identified devices approaching failure within ten days with high precision, allowing cloud operators to migrate workloads before failures occur. The transfer learning approach enabled the system to adapt to new disk models and failure patterns through incremental online updates.},
  project_tags  = {ML for CPS, Explainable AI}
}
With the advent of pervasive cloud computing technologies, service reliability and availability are becoming major concerns, especially as we start to integrate cyber-physical systems with the cloud networks. A number of smart and connected community systems such as emergency response systems utilize cloud networks to analyze real-time data streams and provide context-sensitive decision support. Improving overall system reliability requires us to study all the aspects of the end-to-end of this distributed system, including the backend data servers. In this paper, we describe a bi-layered prognostic architecture for predicting the Remaining Useful Life (RUL) of components of backend servers, especially those that are subjected to degradation. We show that our architecture is especially good at predicting the remaining useful life of hard disks. A Deep LSTM Network is used as the backbone of this fast, data-driven decision framework and dynamically captures the pattern of the incoming data. In the article, we discuss the architecture of the neural network and describe the mechanisms to choose the various hyper-parameters. We describe the challenges faced in extracting effective training sets from highly unorganized and class-imbalanced big data and establish methods for online predictions with extensive data pre-processing, feature extraction and validation through test sets with unknown remaining useful lives of the hard disks. Our algorithm performs especially well in predicting RUL near the critical zone of a device approaching failure. The proposed architecture is able to predict whether a disk is going to fail in the next ten days with an average precision of 0.8435. In future, we will extend this architecture to learn and predict the RUL of the edge devices in the end-to-end distributed systems of smart communities, taking into consideration context-sensitive external features such as weather.
@incollection{Chhokra2018,
author = {Chhokra, Ajay and Dubey, Abhishek and Mahadevan, Nagabhushan and Hasan, Saqib and Karsai, Gabor},
chapter = {8},
editor = {Sayed-Mouchaweh, Moamar},
pages = {201--225},
publisher = {Springer International Publishing},
title = {Diagnosis in Cyber-Physical Systems with Fault Protection Assemblies},
year = {2018},
address = {Cham},
isbn = {978-3-319-74962-4},
booktitle = {Diagnosability, Security and Safety of Hybrid Dynamic and Cyber-Physical Systems},
contribution = {colab},
doi = {10.1007/978-3-319-74962-4_8},
file = {:Chhokra2018-Diagnosis_In_Cyber-Physical_Systems_with_Fault_Protection_Assemblies.pdf:PDF},
keywords = {fault diagnosis, protection relays, cascading failures, power systems, cyber-physical systems},
tag = {platform,power},
url = {https://doi.org/10.1007/978-3-319-74962-4_8},
what = {This chapter describes diagnosis methods for cyber-physical systems with fault protection assemblies, using Temporal Causal Diagrams to model fault propagation through protection equipment. The approach captures how protection devices autonomously respond to faults, potentially causing secondary effects or cascading failures. The paper demonstrates the technique on power systems showing how misoperation of protection relays can lead to complete system blackout.},
why = {Traditional fault diagnosis in power systems focuses on primary failures but ignores how protection equipment misoperations can cascade faults throughout the system. Cyber-physical systems require integrated analysis of both physical faults and autonomous protection device responses. This work is important because it provides practical diagnosis methods that account for protection assemblies, enabling operators to identify critical failure modes and design better protection strategies.},
results = {The Temporal Causal Diagram approach successfully diagnosed cascading failures in the WSCC 9-bus system, identifying how relay misoperations led to progressive system instability and blackout. The reasoning engine generated multi-fault hypotheses that explained observed anomalies and predicted system-level consequences. The method enabled identification of critical protection assemblies that could trigger cascading failures under fault conditions.},
project_tags = {CPS, emergency}
}
Fault Protection Assemblies are used in cyber-physical systems for automated fault-isolation. These devices alter the mode of the system using locally available information in order to stop fault propagation. For example, in electrical networks relays and breakers isolate faults in order to arrest failure propagation and protect the healthy parts of the system. However, these assemblies themselves can have faults, which may inadvertently induce secondary failures. Often these secondary failures lead to cascade effects, which then lead to total system collapse. This behavior is often seen in electrical transmission systems where failures of relays and breakers may cause overloading and the disconnection of parts of an otherwise healthy system. In the past, we had developed a consistency based diagnosis approach for physical systems based on the temporal failure propagation graph. We now describe an extension that uses the concept of timed discrete event observers in combination with the timed failure propagation graphs to extend the hypothesis to include the possibility of failures in the fault protection units. Using a simulated power system case study, we show that the combined approach is able to diagnose faults in both the plant and the protection devices.
@inproceedings{DubeyHCSS2018,
  author       = {Laszka, Aron and Mavridou, Anastasia and Dubey, Abhishek},
  title        = {Resilient and Trustworthy Transactive Platform for Smart and Connected Communities},
  booktitle    = {High Confidence Software and Systems Conference},
  year         = {2018},
  keywords     = {blockchain},
  project      = {cps-reliability},
  tag          = {platform,decentralization},
  contribution = {colab},
  timestamp    = {Wed, 16 Oct 2019 14:14:54 +0200}
}
@inproceedings{Eisele2018,
author = {Eisele, Scott and Laszka, Aron and Mavridou, Anastasia and Dubey, Abhishek},
booktitle = {{IEEE} International Conference on Internet of Things and Blockchains},
title = {SolidWorx: {A} Resilient and Trustworthy Transactive Platform for Smart and Connected Communities},
year = {2018},
pages = {1263--1272},
acceptance = {15.3},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/ithings/EiseleLMD18},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/Cybermatics_2018.2018.00221},
file = {:Eisele2018-SolidWorx_A_Resilient_and_Trustworthy_Transactive_Platform_for_Smart_and_Connected_Communities.pdf:PDF},
keywords = {blockchain, smart contracts, resource allocation, energy trading, decentralized systems},
project = {cps-blockchains,transactive-energy},
tag = {decentralization,power},
timestamp = {Wed, 16 Oct 2019 14:14:56 +0200},
url = {https://doi.org/10.1109/Cybermatics_2018.2018.00221},
what = {This paper presents SolidWorx, a blockchain-based decentralized transaction platform for smart and connected communities that enables resource allocation and energy trading among distributed participants. The system uses a hybrid approach combining off-blockchain linear program solvers with blockchain-based smart contracts for verification and execution. The platform addresses trust, privacy, and resource allocation challenges in community-based resource exchange applications.},
why = {Smart communities require mechanisms for participants to exchange resources and coordinate services without relying on centralized authorities or third parties. Blockchain technology enables trustworthy transactions but faces scalability and verification challenges. SolidWorx is innovative because it combines efficient off-chain optimization with blockchain verification, enabling scalable and trustworthy resource allocation while maintaining transparency and preventing fraud.},
results = {SolidWorx successfully enabled resource allocation and energy trading among distributed participants in community applications including carpooling and energy futures markets. The hybrid solver architecture optimized resource allocations more efficiently than blockchain-only approaches while maintaining auditability through smart contract verification. The platform demonstrated practical feasibility for community-scale resource management without centralized control.},
project_tags = {energy, middleware}
}
Internet of Things and data sciences are fueling the development of innovative solutions for various applications in Smart and Connected Communities (SCC). These applications provide participants with the capability to exchange not only data but also resources, which raises the concerns of integrity, trust, and above all the need for fair and optimal solutions to the problem of resource allocation. This exchange of information and resources leads to a problem where the stakeholders of the system may have limited trust in each other. Thus, collaboratively reaching consensus on when, how, and who should access certain resources becomes problematic. This paper presents SolidWorx, a blockchain-based platform that provides key mechanisms required for arbitrating resource consumption across different SCC applications in a domain-agnostic manner. For example, it introduces and implements a hybrid-solver pattern, where complex optimization computation is handled off-blockchain while solution validation is performed by a smart contract. To ensure correctness, the smart contract of SolidWorx is generated and verified using a model-based approach.
@article{GarciaValls2018,
  author       = {Garc{\'{\i}}a{-}Valls, Marisol and Dubey, Abhishek and Botti, Vicent J.},
  title        = {Introducing the new paradigm of Social Dispersed Computing: Applications, Technologies and Challenges},
  journal      = {Journal of Systems Architecture - Embedded Systems Design},
  volume       = {91},
  pages        = {83--102},
  year         = {2018},
  doi          = {10.1016/j.sysarc.2018.05.007},
  url          = {https://doi.org/10.1016/j.sysarc.2018.05.007},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/journals/jsa/Garcia-VallsDB18},
  timestamp    = {Mon, 16 Sep 2019 01:00:00 +0200},
  contribution = {colab},
  file         = {:Garcia-Valls2018-Introducing_the_new_paradigm_of_Social_Dispersed_Computing_Applications_Technologies_and_Challenges.pdf:PDF},
  keywords     = {edge computing, social dispersed computing, IoT, latency reduction, distributed computing},
  project      = {cps-middleware},
  tag          = {platform,decentralization},
  what         = {This paper introduces the Social Dispersed Computing paradigm that distributes computation across edge devices and networks to enable low-latency services for smart and connected communities. The work analyzes key computing paradigms including cloud computing, mobile cloud computing, cloudlets, fog computing, and edge computing, identifying their characteristics and tradeoffs. The paper discusses technological enablers and research challenges for implementing practical social dispersed computing applications.},
  why          = {Traditional cloud computing introduces latency and bandwidth challenges for community-scale applications requiring real-time response, such as transportation and emergency services. Social dispersed computing addresses this by pushing computation to the network edge where it can leverage local resources. This work is innovative because it synthesizes distributed computing paradigms into a unified social dispersed computing vision, providing architects with conceptual frameworks for designing low-latency community applications.},
  results      = {The analysis identified key characteristics distinguishing social dispersed computing from traditional cloud paradigms, including reduced latency, improved bandwidth efficiency, and enhanced local autonomy. Case studies demonstrated practical applications where edge computing provided significantly better performance than cloud-only approaches. The framework enables system designers to systematically evaluate tradeoffs when distributing computation across edge and cloud resources.},
  project_tags = {CPS, middleware}
}
If the last decade viewed computational services as a utility, then surely this decade has transformed computation into a commodity. Computation is now progressively integrated into the physical networks in a seamless way that enables cyber-physical systems (CPS) and the Internet of Things (IoT) to meet their latency requirements. Similar to the concept of “platform as a service” or “software as a service”, both cloudlets and fog computing have found their own use cases. Edge devices (that we call end or user devices for disambiguation) play the role of personal computers, dedicated to a user and to a set of correlated applications. In this new scenario, the boundaries between the network node, the sensor, and the actuator are blurring, driven primarily by the computation power of IoT nodes like single board computers and the smartphones. The bigger data generated in this type of networks needs clever, scalable, and possibly decentralized computing solutions that can scale independently as required. Any node can be seen as part of a graph, with the capacity to serve as a computing or network router node, or both. Complex applications can possibly be distributed over this graph or network of nodes to improve the overall performance like the amount of data processed over time. In this paper, we identify this new computing paradigm that we call Social Dispersed Computing, analyzing key themes in it that include a new outlook on its relation to agent based applications. We architect this new paradigm by providing supportive application examples that include next generation electrical energy distribution networks, next generation mobility services for transportation, and applications for distributed analysis and identification of non-recurring traffic congestion in cities. 
The paper analyzes the existing computing paradigms (e.g., cloud, fog, edge, mobile edge, social, etc.), solving the ambiguity of their definitions; and analyzes and discusses the relevant foundational software technologies, the remaining challenges, and research opportunities.
@inproceedings{Hasan2018,
  author       = {Hasan, Saqib and Ghafouri, Amin and Dubey, Abhishek and Karsai, Gabor and Koutsoukos, Xenofon D.},
  title        = {Vulnerability analysis of power systems based on cyber-attack and defense models},
  booktitle    = {2018 {IEEE} Power {\&} Energy Society Innovative Smart Grid Technologies Conference, {ISGT} 2018, Washington, DC, USA, February 19-22, 2018},
  year         = {2018},
  pages        = {1--5},
  doi          = {10.1109/ISGT.2018.8403337},
  url          = {https://doi.org/10.1109/ISGT.2018.8403337},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/isgt/HasanGDKK18},
  timestamp    = {Wed, 16 Oct 2019 14:14:57 +0200},
  category     = {selectiveconference},
  contribution = {minor},
  file         = {:Hasan2018-Vulnerability_analysis_of_power_systems_based_on_cyber-attack_and_defense_models.pdf:PDF},
  keywords     = {power system security, cyber-attack analysis, game theory, vulnerability assessment, cascading failures},
  project      = {cps-reliability},
  tag          = {platform,power},
  what         = {This paper develops game-theoretic models for identifying worst-case cyber-attacks on power systems and computing optimal defense strategies. The work formulates attacker and defender problems as optimization problems where attackers maximize damage by strategically opening circuit breakers while defenders minimize damage by protecting critical substations. The paper provides efficient algorithms that identify critical contingencies causing cascading failures and optimal defense budgets.},
  why          = {Power system vulnerability analysis typically focuses on single-component failures, but coordinated multi-component attacks can cause cascading failures exceeding simple N-1 criteria. This work is innovative because it combines game theory with power system analysis to identify worst-case attack scenarios and compute optimal resource allocation for defense. This enables operators to prioritize protection investments and identify critical infrastructure vulnerabilities.},
  results      = {The algorithms identified critical substations and circuit breaker combinations that maximize system damage when attacked, reducing the candidate contingency set from hundreds of thousands to manageable sizes. For IEEE test systems, the approach identified the worst-case attack and optimal defense strategy with significantly fewer simulations than exhaustive search. Results showed 50-57% improvement in load loss when protecting identified critical substations with available defense budgets.},
  project_tags = {CPS, emergency}
}
Reliable operation of power systems is a primary challenge for the system operators. With the advancement in technology and grid automation, power systems are becoming more vulnerable to cyber-attacks. The main goal of adversaries is to take advantage of these vulnerabilities and destabilize the system. This paper describes a game-theoretic approach to attacker / defender modeling in power systems. In our models, the attacker can strategically identify the subset of substations that maximize damage when compromised. However, the defender can identify the critical subset of substations to protect in order to minimize the damage when an attacker launches a cyber-attack. The algorithms for these models are applied to the standard IEEE-14, 39, and 57 bus examples to identify the critical set of substations given an attacker and a defender budget.
@inproceedings{Laszka2018,
author = {Laszka, Aron and Eisele, Scott and Dubey, Abhishek and Karsai, Gabor and Kvaternik, Karla},
booktitle = {24th {IEEE} International Conference on Parallel and Distributed Systems, {ICPADS} 2018, Singapore, December 11-13, 2018},
title = {{TRANSAX:} {A} Blockchain-Based Decentralized Forward-Trading Energy Exchange for Transactive Microgrids},
year = {2018},
pages = {918--927},
acceptance = {37.7},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/icpads/LaszkaEDKK18},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/PADSW.2018.8645001},
file = {:Laszka2018-TRANSAX_A_Blockchain-Based_Decentralized_Forward-Trading_Energy_Exchanged_for_Transactive_Microgrids.pdf:PDF},
keywords = {transactive energy, blockchain, smart contracts, energy trading, microgrids},
project = {transactive-energy,cps-blockchains},
tag = {decentralization,power},
timestamp = {Wed, 16 Oct 2019 14:14:56 +0200},
url = {https://doi.org/10.1109/PADSW.2018.8645001},
what = {This paper presents TRANSAX, a blockchain-based decentralized energy trading platform for transactive microgrids that addresses three key challenges: privacy protection through encrypted transactions, resilience through distributed ledgers, and efficient resource allocation using smart contracts and linear programming. The system enables individual prosumers to autonomously trade energy with neighbors while maintaining system safety constraints and grid stability.},
why = {Transactive energy systems require mechanisms for distributed market participants to coordinate energy production and consumption without centralized control, but face challenges with privacy, reliability, and fair resource allocation. TRANSAX is innovative because it combines blockchain-based distributed ledgers for trust and transparency with optimization algorithms for efficient matching, enabling scalable transactive energy markets suitable for microgrids.},
results = {TRANSAX successfully enabled forward energy trading among distributed participants with multiple time horizons and flexible resource constraints. The platform maintained system stability by enforcing feeder capacity constraints while maximizing total energy traded. Smart contracts verified transaction feasibility before blockchain execution, and the hybrid solver architecture solved large-scale resource allocation problems efficiently.},
project_tags = {energy, planning}
}
Power grids are undergoing major changes due to rapid growth in renewable energy and improvements in battery technology. Prompted by the increasing complexity of power systems, decentralized IoT solutions are emerging, which arrange local communities into transactive microgrids. The core functionality of these solutions is to provide mechanisms for matching producers with consumers while ensuring system safety. However, there are multiple challenges that these solutions still face: privacy, trust, and resilience. The privacy challenge arises because the time series of production and consumption data for each participant is sensitive and may be used to infer personal information. Trust is an issue because a producer or consumer can renege on the promised energy transfer. Providing resilience is challenging due to the possibility of failures in the infrastructure that is required to support these market based solutions. In this paper, we develop a rigorous solution for transactive microgrids that addresses all three challenges by providing an innovative combination of MILP solvers, smart contracts, and publish-subscribe middleware within a framework of a novel distributed application platform, called Resilient Information Architecture Platform for Smart Grid. Towards this purpose, we describe the key architectural concepts, including fault tolerance, and show the trade-off between market efficiency and resource requirements.
@inproceedings{Nannapaneni2018a,
  author       = {Nannapaneni, Saideep and Dubey, Abhishek and Mahadevan, Sankaran},
  title        = {Automated aircraft separation safety assurance using Bayesian networks},
  booktitle    = {2018 Aviation Technology, Integration, and Operations Conference},
  year         = {2018},
  pages        = {3199},
  keywords     = {reliability},
  project      = {cps-reliability},
  category     = {conference},
  contribution = {minor}
}
@inproceedings{Samal2018,
  author       = {Samal, Chinmaya and Dubey, Abhishek and Ratliff, Lillian J.},
  title        = {Mobilytics- An Extensible, Modular and Resilient Mobility Platform},
  booktitle    = {2018 {IEEE} International Conference on Smart Computing, {SMARTCOMP} 2018, Taormina, Sicily, Italy, June 18-20, 2018},
  year         = {2018},
  pages        = {356--361},
  acceptance   = {40},
  doi          = {10.1109/SMARTCOMP.2018.00029},
  url          = {https://doi.org/10.1109/SMARTCOMP.2018.00029},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/smartcomp/SamalDR18},
  timestamp    = {Wed, 16 Oct 2019 14:14:54 +0200},
  category     = {selectiveconference},
  contribution = {lead},
  file         = {:Samal2018-Mobilytics-An_Extensible_Modular_and_Resilient_Mobility_Platform.pdf:PDF},
  keywords     = {transportation management, microservices architecture, urban mobility, multi-modal routing, resilience},
  project      = {smart-transit,smart-cities},
  tag          = {platform,transit},
  what         = {This paper develops a modular and resilient mobility platform called Mobilytics that integrates transportation management capabilities including multi-modal routing, traffic simulation, and resilient service deployment. The platform uses microservices architecture with geospatial databases and abstract modal layers enabling dynamic addition of new transportation modes. The system addresses modularity, extensibility, and resilience challenges required for managing modern urban transportation systems.},
  why          = {Urban transportation systems face increasing demand for integrated multi-modal services while maintaining reliability despite component failures. Traditional monolithic transportation management systems struggle to adapt to new services and fail catastrophically when individual components fail. Mobilytics is innovative because it provides a microservices-based architecture that enables independent service deployment and scaling while maintaining overall system resilience through distributed configuration management.},
  results      = {Mobilytics successfully demonstrated modular addition of new transportation modes and services without modifying core platform code. The platform provided real-time routing services considering multiple transportation modes and user preferences. Testing showed that system failure detection and configuration management enabled automatic recovery from component failures, with shorter downtime compared to centralized systems.},
  project_tags = {transit, CPS, planning}
}
Transportation management platforms provide communities the ability to integrate the available mobility options and localized transportation demand management policies. A central component of a transportation management platform is the mobility planning application. Given the societal relevance of these platforms, it is necessary to ensure that they operate resiliently. Modularity and extensibility are also critical properties that are required for manageability. Modularity makes it easy to isolate faults. Extensibility enables update of policies and integration of new mobility modes or new routing algorithms. However, state-of-the-art mobility planning applications like open trip planner, are monolithic applications, which makes it difficult to scale and modify them dynamically. This paper describes a microservices based modular multi-modal mobility platform Mobilytics, that integrates mobility providers, commuters, and community stakeholders. We describe our requirements, architecture, and discuss the resilience challenges, and how our platform functions properly in the presence of failure. Conceivably, the patterns and principles manifested in our system can serve as guidelines for current and future practitioners in this field.
@misc{Samal2018a,
  author        = {Samal, Chinmaya and Zheng, Liyuan and Sun, Fangzhou and Ratliff, Lillian J. and Dubey, Abhishek},
  title         = {Towards a Socially Optimal Multi-Modal Routing Platform},
  journal       = {CoRR},
  volume        = {abs/1802.10140},
  year          = {2018},
  eprint        = {1802.10140},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1802.10140},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1802-10140},
  timestamp     = {Mon, 13 Aug 2018 01:00:00 +0200},
  contribution  = {lead},
  file          = {:Samal2018a-Towards_a_Socially_Optimal_Multi-Modal_Routing_Platform.pdf:PDF},
  keywords      = {multi-modal routing, transportation, social welfare, congestion reduction, system optimization},
  project       = {smart-transit,smart-cities},
  tag           = {transit},
  what          = {This paper proposes socially optimal multi-modal routing that considers system-level impacts on overall traffic congestion in addition to individual user preferences. The work develops algorithms to compute routes that maximize utility for individual users while accounting for the externalities of routing decisions on other users and the transportation network. The approach uses traffic simulation to evaluate multi-modal routing strategies and their impact on congestion.},
  why           = {Current mobility decision support systems focus on individual user objectives without considering how routing decisions collectively impact system-level congestion and travel times. This work is innovative because it formulates routing as an optimization problem that balances individual user preferences with system-level social welfare, enabling communities to reduce congestion through route recommendations that are perceived as optimal by users while improving overall system performance.},
  results       = {Socially optimal multi-modal routing significantly reduced average travel time across all users compared to individually optimal routing, especially when a high percentage of users adopted system-suggested routes. The simulation analysis demonstrated that even partial adoption of socially optimal routes could improve system-level performance without adversely affecting early adopters. The results support using community-aware routing algorithms in transportation decision support systems.},
  project_tags  = {transit, planning}
}
The increasing rate of urbanization has added pressure on the already constrained transportation networks in our communities. Ride-sharing platforms such as Uber and Lyft are becoming more commonplace, particularly in urban environments. While such services may be deemed more convenient than riding public transit due to their on-demand nature, reports show that they do not necessarily decrease the congestion in major cities. One of the key problems is that typically mobility decision support systems focus on individual utility and react only after congestion appears. In this paper, we propose socially considerate multi-modal routing algorithms that are proactive and consider, via predictions, the shared effect of riders on the overall efficacy of mobility services. We have adapted the MATSim simulator framework to incorporate the proposed algorithms and present a simulation analysis of a case study in Nashville, Tennessee that assesses the effects of our routing models on the traffic congestion for different levels of penetration and adoption of socially considerate routes. Our results indicate that even at a low penetration (social ratio), we are able to achieve an improvement in system-level performance.
@inproceedings{Sun2018,
author = {Sun, Fangzhou and Dubey, Abhishek and Samal, Chinmaya and Baroud, Hiba and Kulkarni, Chetan},
booktitle = {2018 {IEEE} International Conference on Smart Computing, {SMARTCOMP} 2018, Taormina, Sicily, Italy, June 18-20, 2018},
title = {Short-Term Transit Decision Support System Using Multi-task Deep Neural Networks},
year = {2018},
acceptance = {40},
pages = {155--162},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/smartcomp/SunDSBK18},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/SMARTCOMP.2018.00086},
file = {:Sun2018-Short-Term_Transit_Decision_Support_System_Using_Multi-task_Deep_Neural_Networks.pdf:PDF},
keywords = {transit prediction, deep learning, multi-task learning, delay prediction, transportation},
project = {smart-transit,cps-reliability,smart-cities},
tag = {ai4cps,transit},
timestamp = {Wed, 16 Oct 2019 14:14:54 +0200},
url = {https://doi.org/10.1109/SMARTCOMP.2018.00086},
what = {This paper develops a short-term transit delay prediction system using multi-task deep neural networks that predict arrival times for bus route segments considering contextual information including scheduled events, weather conditions, and historical transit data. The approach uses shared route segment networks and event feature vectors to reduce overfitting while improving prediction accuracy. The system addresses data sparsity and generalization challenges in transit prediction through multi-task learning architecture.},
why = {Public transportation riders need accurate delay predictions to make informed decisions about departure times and route choices, but short-term prediction is challenging due to highly variable traffic patterns influenced by numerous contextual factors. This work is innovative because it applies multi-task deep learning to leverage shared patterns across route segments while accounting for event-specific impacts, enabling accurate predictions despite limited historical data for individual segment-event combinations.},
results = {The multi-task neural network achieved high recall (76\%) and F1 scores (55\%) in predicting transit delays, effectively capturing relationships between multiple contextual features. Compared to single-task networks, the multi-task approach reduced overfitting and improved generalization to new events and routes. The system successfully identified which route segments were most affected by specific events and forecast corresponding delays.},
project_tags = {transit, ML for CPS}
}
Unpredictability is one of the top reasons that prevent people from using public transportation. To improve the on-time performance of transit systems, prior work focuses on updating schedule periodically in the long-term and providing arrival delay prediction in real-time. But when no real-time transit and traffic feed is available (e.g., one day ahead), there is a lack of effective contextual prediction mechanism that can give alerts of possible delay to commuters. In this paper, we propose a generic tool-chain that takes standard General Transit Feed Specification (GTFS) transit feeds and contextual information (recurring delay patterns before and after big events in the city and the contextual information such as scheduled events and forecasted weather conditions) as inputs and provides service alerts as output. Particularly, we utilize shared route segment networks and multi-task deep neural networks to solve the data sparsity and generalization issues. Experimental evaluation shows that the proposed toolchain is effective at predicting severe delay with a relatively high recall of 76% and F1 score of 55%.
@inproceedings{Sun2018a,
author = {Sun, Fangzhou and Dubey, Abhishek and Kulkarni, Chetan and Mahadevan, Nagabhushan and Luna, Ali Guarneros},
booktitle = {Conference Proceedings, Annual Conference of The Prognostics And Health Management Society},
title = {A data driven health monitoring approach to extending small sats mission},
year = {2018},
category = {conference},
contribution = {minor},
file = {:Sun2018a-A_data_driven_health_monitoring_approach_to_extending_small_sats_mission.pdf:PDF},
keywords = {health monitoring, deep learning, LSTM networks, satellite systems, battery diagnostics},
project = {cps-reliability},
tag = {platform},
what = {This paper proposes a data-driven health monitoring framework for extending small satellite missions using Deep LSTM networks to detect battery anomalies during operation. The approach combines offline pre-training with online transfer learning to adapt battery prediction models to individual satellite configurations. The system uses stacked LSTM and auto-encoder architectures to detect failures in highly imbalanced datasets while accounting for the spatial-temporal dynamics of battery degradation.},
why = {Small satellite missions depend on reliable battery health monitoring to ensure mission success, but traditional statistical approaches fail to capture complex battery aging patterns. This work is innovative because it combines offline deep learning with online adaptation mechanisms enabling satellites to detect imminent failures without relying on ground station resources. The approach enables extended mission duration through proactive battery management.},
results = {The two-layered LSTM architecture successfully detected battery anomalies and predicted remaining useful life within acceptable margins for operational decision-making. The system identified degradation patterns specific to charging cycles and environmental conditions through transfer learning. Online prediction enabled timely battery replacement decisions, extending mission duration while preventing catastrophic failures.},
project_tags = {ML for CPS, energy}
}
In the coming years, the International Space Station (ISS) plans to launch several small-sat missions powered by lithium-ion battery packs. An extended version of such a mission requires dependable, energy dense, and durable power sources as well as system health monitoring. Hence a good health estimation framework to increase mission success is absolutely necessary as the devices are subjected to high demand operating conditions. This paper describes a hierarchical architecture which combines data-driven anomaly detection methods with a fine-grained model-based diagnosis and prognostics architecture. At the core of the architecture is a distributed stack of deep neural networks that detect and classify the data traces from nearby satellites based on prior observations. Any identified anomaly is transmitted to the ground, which then uses a model-based diagnosis and prognosis framework to make health state estimation. In parallel, periodically the data traces from the satellites are transported to the ground and analyzed using model-based techniques. This data is then used to train the neural networks, which are run from ground systems and periodically updated. The collaborative architecture enables quick data-driven inference on the satellite and more intensive analysis on the ground where often time and power consumption are not constrained. The current work demonstrates implementation of this architecture through an initial battery data set. In the future we propose to apply this framework to other electric and electronic components on-board the small satellites.
@inproceedings{DuTu2017,
author = {{Du}, Y. and {Tu}, H. and {Lukic}, S. and {Lubkeman}, D. and Dubey, Abhishek and {Karsai}, G.},
booktitle = {2017 North American Power Symposium (NAPS)},
title = {Implementation of a distributed microgrid controller on the Resilient Information Architecture Platform for Smart Systems (RIAPS)},
year = {2017},
month = sep,
pages = {1--6},
category = {selectiveconference},
contribution = {minor},
doi = {10.1109/NAPS.2017.8107305},
file = {:DuTu2017-Implementation_of_a_distributed_microgrid_controller_on_RIAPS.pdf:PDF},
keywords = {microgrid control, RIAPS platform, distributed algorithms, resynchronization, power system stability},
tag = {power},
what = {This paper describes implementation of a distributed microgrid controller on the RIAPS platform that achieves voltage frequency regulation and resynchronization of islanded microgrids. The work presents detailed algorithms for primary droop control, secondary frequency and voltage regulation, and resynchronization procedures using distributed messaging. The implementation demonstrates deployment of complex control algorithms across distributed nodes using the RIAPS middleware.},
why = {Microgrids require sophisticated distributed control algorithms to maintain stability during transitions between grid-connected and islanded operation, but implementing these algorithms across distributed hardware remains challenging. This work is significant because it demonstrates that the RIAPS platform can reliably execute hierarchical microgrid control with sub-second synchronization accuracy and distributed decision-making, validating the platform for critical grid applications.},
results = {The RIAPS-based controller successfully maintained voltage and frequency stability during intentional islanding events with distributed secondary control responding to frequency deviations. Resynchronization procedures safely reconnected the microgrid to the main grid without instability. Real-time measurements confirmed voltage phase accuracy within specification and effective distributed coordination of multiple generator controllers.},
project_tags = {middleware, energy, CPS}
}
Formation of microgrids has been proposed as a solution to improve grid reliability, and enable smoother integration of renewables into the grid. Microgrids are sections of the grid that can operate in isolation from the main power system. Maintaining power balance within an islanded microgrid is a challenging task, due to the low system inertia, which requires complex control to maintain stable and optimized operation. Many studies have demonstrated feasible distributed microgrid controllers that can maintain microgrid stability in grid connected and islanded modes. However, there is little emphasis on how to implement these distributed algorithms on a computational platform that allows for fast and seamless deployment. This paper introduces a decentralized software platform called Resilient Information Architecture Platform for Smart Systems (RIAPS) that runs on processors embedded with the microgrid component. As an example, we describe the implementation of a distributed microgrid secondary control and resynchronization algorithms on RIAPS platform. The controller developed on RIAPS platform is validated on a real-time microgrid testbed.
@inproceedings{Hasan2017a,
author = {{Hasan}, S. and {Ghafouri}, A. and Dubey, Abhishek and {Karsai}, G. and {Koutsoukos}, X.},
booktitle = {2017 Resilience Week (RWS)},
title = {Heuristics-based approach for identifying critical N-k contingencies in power systems},
year = {2017},
month = sep,
pages = {191--197},
category = {conference},
contribution = {colab},
doi = {10.1109/RWEEK.2017.8088671},
file = {:Hasan2017a-Heuristics-based_approach_for_identifying_critical_N_k_contingencies_in_power_systems.pdf:PDF},
keywords = {contingency analysis, power systems, heuristic algorithms, critical contingencies, reliability},
project = {cps-reliability,smart-energy},
tag = {platform,power},
what = {This paper develops heuristic algorithms for identifying critical N-k contingencies in power systems that efficiently reduce the search space of possible failure combinations. The approach uses frequency distribution analysis of system impedance changes to identify likely critical contingencies without exhaustive enumeration. The algorithms enable practical contingency analysis for large power systems by reducing computational requirements while maintaining accuracy.},
why = {Comprehensive N-k contingency analysis requires evaluating exponentially many failure combinations, making exhaustive analysis computationally infeasible for large systems. This work is innovative because it uses statistical heuristics to intelligently prune the search space, identifying critical contingencies with significantly fewer simulations. The approach enables operators to focus protection resources on truly dangerous failure modes.},
results = {The heuristic algorithms identified critical N-4 contingencies for the IEEE-57 bus system using only 24,469 simulations compared to 259,600 simulations for exhaustive search, reducing computational effort by 90\%. The algorithms identified the same critical contingencies as exhaustive search while significantly reducing execution time, demonstrating practical feasibility for larger systems.},
project_tags = {CPS, scalable AI}
}
Reliable operation of electrical power systems in the presence of multiple critical N - k contingencies is an important challenge for the system operators. Identifying all the possible N - k critical contingencies to design effective mitigation strategies is computationally infeasible due to the combinatorial explosion of the search space. This paper describes two heuristic algorithms based on the iterative pruning of the candidate contingency set to effectively and efficiently identify all the critical N - k contingencies resulting in system failure. These algorithms are applied to the standard IEEE-14 bus system, IEEE-39 bus system, and IEEE-57 bus system to identify multiple critical N - k contingencies.
@inproceedings{Hasan2017b,
author = {{Hasan}, S. and Dubey, Abhishek and {Chhokra}, A. and {Mahadevan}, N. and {Karsai}, G. and {Koutsoukos}, X.},
booktitle = {2017 Workshop on Modeling and Simulation of Cyber-Physical Energy Systems (MSCPES)},
title = {A modeling framework to integrate exogenous tools for identifying critical components in power systems},
year = {2017},
month = apr,
pages = {1--6},
category = {workshop},
contribution = {colab},
doi = {10.1109/MSCPES.2017.8064540},
file = {:Hasan2017b-A_modeling_framework_to_integrate_exogenous_tools_for_identifying_critical_components_in_power_systems.pdf:PDF},
keywords = {domain-specific language, power systems, model transformation, cyber-physical systems, fault analysis},
tag = {platform,power},
what = {This paper presents a domain-specific modeling language for power systems that enables rapid prototyping and analysis of cyber-physical failures through multiple simulation platforms. The DSML captures power system components and protection assemblies with appropriate abstractions for different analysis tools. The framework supports model transformation to OpenDSS, Matlab/Simscape, and other simulators, enabling integrated analysis of cascading failures.},
why = {Power system analysis requires understanding complex interactions between physical failures and protection device responses, but different simulation tools have incompatible models and require significant manual effort for system modeling. This work is innovative because it provides a unified modeling language that captures both power system physics and cyber-fault effects, enabling researchers to rapidly transform models between tools and avoid redundant system representation.},
results = {The DSML successfully modeled the WSCC-9 bus system with cyber-faults in protection assemblies and automatically transformed the model to OpenDSS and Matlab/Simscape for analysis. Transformed models yielded consistent results across platforms with errors less than 3\%, validating the modeling approach. The framework enabled identification of critical protection assembly failures causing cascading blackouts.},
project_tags = {CPS, Explainable AI}
}
Cascading failures in electrical power systems are one of the major causes of concern for the modern society as it results in huge socio-economic loss. Tools for analyzing these failures while considering different aspects of the system are typically very expensive. Thus, researchers tend to use multiple tools to perform various types of analysis on the same system model in order to understand the reasons for these failures in detail. Modeling a simple system in multiple platforms is a tedious, error prone and time consuming process. This paper describes a domain specific modeling language (DSML) for power systems. It identifies and captures the right abstractions for modeling components in different analysis tools. A framework is proposed that deals with system modeling using the developed DSML, identifying the type of analysis to be performed, choosing the appropriate tool(s) needed for the analysis from the tool-chain, transforming the model based on the required specifications of a particular tool and performing the analysis. A case study is done on WSCC-9 Bus System, IEEE-14 Bus System and IEEE-39 Bus System to demonstrate the entire workflow of the framework in identifying critical components for power systems.
@inproceedings{Bergquist2017,
author = {Bergquist, Jonatan and Laszka, Aron and Sturm, Monika and Dubey, Abhishek},
booktitle = {Proceedings of the 1st Workshop on Scalable and Resilient Infrastructures for Distributed Ledgers, SERIAL@Middleware 2017, Las Vegas, NV, USA, December 11-15, 2017},
title = {On the design of communication and transaction anonymity in blockchain-based transactive microgrids},
year = {2017},
pages = {3:1--3:6},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/middleware/BergquistLSD17},
category = {workshop},
contribution = {lead},
doi = {10.1145/3152824.3152827},
file = {:Bergquist2017-On_the_design_of_communication_and_transaction_anonymity_in_blockchain-based_transactive_microgrids.pdf:PDF},
keywords = {blockchain, transactive energy, microgrids, privacy, anonymity, ring signatures, zero-knowledge proofs, IoT, distributed ledger},
project = {transactive-energy,cps-middleware,cps-reliability},
tag = {decentralization,platform},
timestamp = {Tue, 06 Nov 2018 16:57:13 +0100},
url = {https://doi.org/10.1145/3152824.3152827},
what = {This paper addresses blockchain-based transactive energy systems and proposes privacy-preserving mechanisms for trading energy in distributed microgrids. The authors extend the PETra workflow to support communication and transaction anonymity using cryptographic techniques including garlic routing, ring signatures, and zero-knowledge proofs. The work demonstrates how IoT devices can trade energy while maintaining transaction privacy and avoiding identity linkage.},
why = {As microgrids increasingly rely on decentralized energy trading, privacy and security become critical challenges. Existing blockchain solutions expose transaction patterns and identities, compromising user privacy. This work is innovative because it integrates multiple cryptographic approaches to provide communication anonymity and transaction-level privacy simultaneously, enabling practical privacy-preserving energy trading.},
results = {The paper presents a comprehensive survey of anonymity mechanisms applicable to blockchain-based microgrids and proposes specific solutions for PETra including onion routing for communication and ring signatures with zero-knowledge proofs for transactions. The authors demonstrate that their approach achieves transaction untraceability while maintaining verifiable trading records on the distributed ledger.},
project_tags = {energy, scalable AI, middleware, ML for CPS}
}
Transactive microgrids are emerging as a transformative solution for the problems faced by distribution system operators due to an increase in the use of distributed energy resources and a rapid acceleration in renewable energy generation, such as wind and solar power. Distributed ledgers have recently found widespread interest in this domain due to their ability to provide transactional integrity across decentralized computing nodes. However, the existing state of the art has not focused on the privacy preservation requirement of these energy systems – the transaction level data can provide much greater insights into a prosumer’s behavior compared to smart meter data. There are specific safety requirements in transactive microgrids to ensure the stability of the grid and to control the load. To fulfil these requirements, the distribution system operator needs transaction information from the grid, which poses a further challenge to the privacy-goals. This problem is made worse by the requirement for off-blockchain communication in these networks. In this paper, we extend a recently developed trading workflow called PETra and describe our solution for communication and transactional anonymity.
@inproceedings{Chhokra2017,
author = {Chhokra, Ajay and Kulkarni, Amogh and Hasan, Saqib and Dubey, Abhishek and Mahadevan, Nagabhushan and Karsai, Gabor},
booktitle = {Proceedings of the 2nd Workshop on Cyber-Physical Security and Resilience in Smart Grids, SPSR-SG@CPSWeek 2017, Pittsburgh, PA, USA, April 21, 2017},
title = {A Systematic Approach of Identifying Optimal Load Control Actions for Arresting Cascading Failures in Power Systems},
year = {2017},
pages = {41--46},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/cpsweek/ChhokraKHDMK17},
category = {workshop},
contribution = {colab},
doi = {10.1145/3055386.3055395},
file = {:Chhokra2017-A_Systematic_Approach_of_Identifying_Optimal_Load_Control_Actions_for_Arresting_Cascading_Failures_in_Power_Systems.pdf:PDF},
keywords = {power systems, cascading failures, load curtailment, optimization, blackout prevention, critical contingencies},
project = {cps-reliability},
tag = {platform},
timestamp = {Tue, 06 Nov 2018 16:59:05 +0100},
url = {https://doi.org/10.1145/3055386.3055395},
what = {This work presents a systematic approach to identify optimal load control actions for preventing cascading failures in power systems. The authors develop a tool chain that automatically generates simulation models from IEEE Common Data Format specifications and integrates OpenMDAO optimization with sensitivity analysis. They demonstrate the methodology by identifying critical load curtailment strategies in IEEE 14-bus systems.},
why = {Cascading failures in power grids can result in widespread blackouts with severe economic consequences. While load shedding is standard practice, existing methods rely on manual intervention and may not identify optimal strategies. This work is innovative because it automates the process of finding minimal load curtailment actions using integrated optimization frameworks.},
results = {The paper demonstrates that their optimization-based approach successfully identifies load curtailment strategies that prevent cascading failures in 427 cases with an average of 29 iterations. Results show that load curtailment can be restricted to less than 20 percent of system load in most scenarios, significantly improving grid stability.},
project_tags = {energy, emergency, planning, scalable AI}
}
Cascading outages in power networks cause blackouts which lead to huge economic and social consequences. The traditional form of load shedding is avoidable in many cases by identifying optimal load control actions. However, if there is a change in the system topology (adding or removing loads, lines etc), the calculations have to be performed again. This paper addresses this problem by providing a workflow that 1) generates system models from IEEE CDF specifications, 2) identifies a collection of blackout causing contingencies, 3) dynamically sets up an optimization problem, and 4) generates a table of mitigation strategies in terms of minimal load curtailment. We demonstrate the applicability of our proposed methodology by finding load curtailment actions for N-k contingencies (k = 1, 2, 3) in IEEE 14 Bus system.
@inproceedings{Chhokra2017a,
author = {Chhokra, Ajay and Hasan, Saqib and Dubey, Abhishek and Mahadevan, Nagabhushan and Karsai, Gabor},
booktitle = {Proceedings of the 8th International Conference on Cyber-Physical Systems, {ICCPS} 2017, Pittsburgh, Pennsylvania, USA, April 18-20, 2017},
title = {Diagnostics and prognostics using temporal causal models for cyber physical energy systems},
year = {2017},
pages = {87},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/iccps/ChhokraHDMK17},
category = {poster},
contribution = {lead},
doi = {10.1145/3055004.3064843},
file = {:Chhokra2017a-Diagnostics_and_prognostics_using_temporal_causal_models_for_cyber_physical_energy_systems.pdf:PDF},
keywords = {temporal causal diagrams, fault diagnosis, fault prognosis, cyber-physical systems, power transmission, system reliability},
project = {cps-reliability},
tag = {platform,power},
timestamp = {Wed, 16 Oct 2019 14:14:57 +0200},
url = {https://doi.org/10.1145/3055004.3064843},
what = {This work presents temporal causal diagrams (TCDs) for diagnosing and prognosing faults in cyber-physical energy systems. The approach uses behavior-augmented temporal failure propagation graphs to identify system-level effects and design robust diagnostic and prognostic strategies. The methodology combines temporal causal reasoning with TCD formalism for power transmission systems.},
why = {Reliable operation of cyber-physical systems like power transmission networks requires rapid fault diagnosis and prognosis. Existing approaches often miss system-level effects introduced by control algorithms and communication delays. This work is innovative because it integrates temporal causal reasoning with structured formalism to predict fault propagation across complex distributed systems.},
results = {The paper demonstrates TCD-based diagnosis and prognosis for IEEE 14-bus power systems, enabling identification of cascading faults and system reconfiguration actions to arrest blackout progression. The methodology provides actionable insights for designing fault-tolerant control strategies.},
project_tags = {CPS, emergency, ML for CPS, Explainable AI}
}
Reliable operation of cyber-physical systems such as power transmission and distribution systems is critical for the seamless functioning of a vibrant economy. These systems consist of tightly coupled physical (energy sources, transmission and distribution lines, and loads) and computational components (protection devices, energy management systems, etc.). The protection devices such as distance relays help in preventing failure propagation by isolating faulty physical components. However, these devices rely on hard thresholds and local information, often ignoring system-level effects introduced by the distributed control algorithms. This leads to scenarios wherein a local mitigation in a subsystem could trigger a larger fault cascade, possibly resulting in a blackout. Efficient models and tools that curtail such systematic failures by performing fault diagnosis and prognosis are therefore necessary.
@inproceedings{Dubey2017,
author = {Dubey, Abhishek and Karsai, Gabor and Pradhan, Subhav},
booktitle = {Second International Conference on Fog and Mobile Edge Computing, {FMEC} 2017, Valencia, Spain, May 8-11, 2017},
title = {Resilience at the edge in cyber-physical systems},
year = {2017},
pages = {139--146},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/fmec/DubeyKP17},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/FMEC.2017.7946421},
file = {:Dubey2017-Resilience_at_the_edge_in_cyber-physical_systems.pdf:PDF},
keywords = {distributed systems, middleware, component-based architecture, resilience, service discovery, edge computing},
project = {cps-reliability,cps-middleware},
tag = {platform},
timestamp = {Wed, 16 Oct 2019 14:14:56 +0200},
url = {https://doi.org/10.1109/FMEC.2017.7946421},
what = {This paper introduces RIAPS (Resilient Information Architecture Platform for Smart Systems), a distributed computing middleware providing component-based application development for decentralized systems. The platform includes discovery services, time synchronization, and fault-tolerant deployment mechanisms. The work demonstrates RIAPS through traffic control and energy management applications on resource-constrained edge devices.},
why = {Distributed cyber-physical systems face challenges in reliable deployment across heterogeneous devices with limited connectivity. RIAPS is innovative because it provides a unified platform architecture supporting component composition, automatic service discovery, and resilience mechanisms without requiring centralized control, enabling practical deployment of decentralized applications.},
results = {The paper demonstrates RIAPS running traffic control and transactive energy applications on embedded systems. Results show successful discovery-based service connectivity, resilient operation despite node failures, and efficient management of distributed applications across multiple devices with minimal communication overhead.},
project_tags = {CPS, middleware, scalable AI}
}
As the number of low cost computing devices at the edge of communication network increase, there are greater opportunities to enable innovative capabilities, especially in cyber-physical systems. For example, micro-grid power systems can make use of computing capabilities at the edge of a Smart Grid to provide more robust and decentralized control. However, the downside to distributing intelligence to the edge away from the controlled environment of the data centers is the increased risk of failures. The paper introduces a framework for handling these challenges. The contribution of this framework is to support strategies to (a) tolerate the transient faults as they appear due to network fluctuations or node failures, and to (b) systematically reconfigure the application if the faults persist.
@inproceedings{Dubey2017b,
author = {Dubey, Abhishek and Karsai, Gabor and Gokhale, Aniruddha and Emfinger, William and Kumar, Pranav},
booktitle = {2017 6th International Conference on Space Mission Challenges for Information Technology (SMC-IT)},
title = {Drems-os: An operating system for managed distributed real-time embedded systems},
year = {2017},
organization = {IEEE},
pages = {114--119},
category = {conference},
contribution = {lead},
file = {:Dubey2017b-Drems-os_An_operating_system_for_managed_distributed_real-time_embedded_systems.pdf:PDF},
keywords = {distributed embedded systems, real-time scheduling, mixed criticality, temporal partitioning, satellite systems},
project = {cps-middleware},
tag = {platform},
what = {This paper presents DREMS-OS, an operating system for distributed real-time and embedded systems that supports mixed criticality task scheduling with temporal and spatial partitioning. The OS provides multiple criticality levels and manages CPU allocation across application and system tasks. The work includes empirical validation using space mission simulations and demonstrates its effectiveness on multi-core platforms.},
why = {Existing real-time systems struggle to handle diverse criticality tasks on shared resources without complex manual configuration. DREMS-OS is innovative because it provides automated temporal partitioning alongside spatial isolation, allowing efficient resource utilization while maintaining safety guarantees for mission-critical applications in distributed embedded systems.},
results = {The paper demonstrates DREMS-OS on a cluster of satellites emulating space missions, successfully managing CPU allocation across different criticality levels. The OS achieves dynamic reconfiguration of temporal partitions and maintains strict isolation between critical and best-effort tasks, enabling robust handling of multiple priority levels.},
project_tags = {CPS, middleware, scalable AI}
}
Distributed real-time and embedded (DRE) systems executing mixed criticality task sets are increasingly being deployed in mobile and embedded cloud computing platforms, including space applications. These DRE systems must not only operate over a range of temporal and spatial scales, but also require stringent assurances for secure interactions between the system’s tasks without violating their individual timing constraints. To address these challenges, this paper describes a novel distributed operating system focusing on the scheduler design to support the mixed criticality task sets. Empirical results from experiments involving a case study of a cluster of satellites emulated in a laboratory testbed validate our claims.
@inproceedings{Eisele2017,
  author       = {Eisele, Scott and Pettet, Geoffrey and Dubey, Abhishek and Karsai, Gabor},
  title        = {Towards an architecture for evaluating and analyzing decentralized Fog applications},
  booktitle    = {{IEEE} Fog World Congress, {FWC} 2017, Santa Clara, CA, USA, October 30 - Nov. 1, 2017},
  year         = {2017},
  pages        = {1--6},
  doi          = {10.1109/FWC.2017.8368531},
  url          = {https://doi.org/10.1109/FWC.2017.8368531},
  category     = {workshop},
  contribution = {lead},
  keywords     = {fog computing, decentralized applications, network analysis, workload distribution, edge devices},
  project      = {cps-reliability,cps-middleware},
  tag          = {platform,decentralization},
  file         = {:Eisele2017-Towards_an_architecture_for_evaluating_and_analyzing_decentralized_Fog_applications.pdf:PDF},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/fwc/EiselePDK17},
  timestamp    = {Wed, 16 Oct 2019 14:14:51 +0200},
  what         = {This paper presents an architecture and framework for evaluating and analyzing decentralized fog applications using RIAPS. The work addresses challenges in resource discovery, device deployment, and workload distribution for fog computing. The authors develop simulation tools and hardware-in-the-loop testbeds to support network analysis and congestion prediction.},
  why          = {Fog computing introduces complexity in determining optimal application placement and workload distribution across edge devices. This work is innovative because it provides integrated simulation and hardware-in-the-loop testing capabilities to evaluate fog applications before deployment, reducing the cost of determining suitable configurations.},
  results      = {The paper demonstrates network analysis tools that predict message delivery times and congestion in fog networks. Results show the framework successfully identifies optimal workload distribution across edge devices and validates application deployments using both simulation and hardware testbeds.},
  project_tags = {CPS, middleware, scalable AI}
}
As the number of low cost computing devices at the edge of network increases, there are greater opportunities to enable novel, innovative capabilities, especially in decentralized cyber-physical systems. For example, in an urban setting, a set of networked, collaborating processors at the edge can be used to dynamically detect traffic densities via image processing and then use those densities to control the traffic flow by coordinating traffic light sequences, in a decentralized architecture. In this paper we describe a testbed and an application framework for such applications.
@inproceedings{Eisele2017a,
  author       = {Eisele, Scott and Dubey, Abhishek and Karsai, Gabor and Lukic, Srdjan},
  title        = {Transactive energy demo with {RIAPS} platform},
  booktitle    = {Proceedings of the 8th International Conference on Cyber-Physical Systems, {ICCPS} 2017, Pittsburgh, Pennsylvania, USA, April 18-20, 2017},
  year         = {2017},
  pages        = {91},
  doi          = {10.1145/3055004.3064845},
  url          = {https://doi.org/10.1145/3055004.3064845},
  category     = {poster},
  contribution = {lead},
  keywords     = {transactive energy, distributed control, RIAPS platform, smart grid, embedded systems},
  project      = {cps-reliability,cps-middleware,transactive-energy},
  tag          = {decentralization,power},
  file         = {:Eisele2017a-Transactive_energy_demo_with_RIAPS_platform.pdf:PDF},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/iccps/EiseleDKL17},
  timestamp    = {Wed, 16 Oct 2019 14:14:57 +0200},
  what         = {This work presents a demonstration of transactive energy applications using the RIAPS platform, implementing both energy trading and traffic control systems. The paper describes practical deployment of distributed control applications on embedded single-board computers and shows integration with smart grid simulators for testing.},
  why          = {Practical deployment of decentralized transactive energy systems requires reliable distributed platforms and integration with simulation tools. This work is innovative because it demonstrates end-to-end deployment of distributed energy and traffic control applications on real embedded systems with simulator integration.},
  results      = {The paper successfully demonstrates transactive energy and traffic control applications running on RIAPS nodes, showing proper system initialization, service discovery, and interaction with grid simulators. The work validates the platform's capability to support practical energy market applications.},
  project_tags = {energy, CPS, middleware}
}
This work presents a platform for decentralized distributed computing called Resilient Information Architecture Platform for Smart Systems (RIAPS) through a transactive energy and a traffic application.
@inproceedings{Eisele2017b,
  author       = {Eisele, Scott and Madari, Istv{\'{a}}n and Dubey, Abhishek and Karsai, Gabor},
  title        = {{RIAPS:} Resilient Information Architecture Platform for Decentralized Smart Systems},
  booktitle    = {20th {IEEE} International Symposium on Real-Time Distributed Computing, {ISORC} 2017, Toronto, ON, Canada, May 16-18, 2017},
  year         = {2017},
  pages        = {125--132},
  doi          = {10.1109/ISORC.2017.22},
  url          = {https://doi.org/10.1109/ISORC.2017.22},
  category     = {selectiveconference},
  contribution = {lead},
  keywords     = {service discovery, distributed hash table, resilience, decentralized systems, fault tolerance},
  project      = {smart-transit,smart-cities},
  tag          = {platform,decentralization,power},
  file         = {:Eisele2017b-RIAPS_Resilient_Information_Architecture_Platform_for_Decentralized_Smart_Systems.pdf:PDF},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/isorc/EiseleMDK17},
  timestamp    = {Wed, 16 Oct 2019 14:14:53 +0200},
  what         = {This paper describes RIAPS' resilient discovery service for distributed smart systems, including mechanisms for service registration, heartbeat-based failure detection, and distributed hash table implementation for service lookups. The work presents design choices for handling node failures and ensuring scalability across network changes.},
  why          = {Resilience in distributed systems requires robust mechanisms for service discovery despite node failures and network partitions. This work is innovative because it presents practical design patterns for implementing distributed discovery services that maintain consistency and enable self-healing in decentralized systems.},
  results      = {The paper demonstrates the discovery service handling ingress and egress of nodes in clusters, maintaining consistent service registries across the network. Results show successful service lookup under node failures and validation of key-based service distribution using OpenDHT.},
  project_tags = {CPS, middleware, scalable AI}
}
The emerging Fog Computing paradigm provides an additional computational layer that enables new capabilities in real-time data-driven applications. This is especially interesting in the domain of Smart Grid as the boundaries between traditional generation, distribution, and consumer roles are blurring. This is a reflection of the ongoing trend of intelligence distribution in Smart Systems. In this paper, we briefly describe a component-based decentralized software platform called Resilient Information Architecture Platform for Smart Systems (RIAPS) which provides an infrastructure for such systems. We briefly describe some initial applications built using this platform. Then, we focus on the design and integration choices for a resilient Discovery Manager service that is a critical component of this infrastructure. The service allows applications to discover each other, work collaboratively, and ensure the stability of the Smart System.
@inproceedings{Ghafouri2017,
  author       = {Ghafouri, Amin and Laszka, Aron and Dubey, Abhishek and Koutsoukos, Xenofon D.},
  title        = {Optimal detection of faulty traffic sensors used in route planning},
  booktitle    = {Proceedings of the 2nd International Workshop on Science of Smart City Operations and Platforms Engineering, SCOPE@CPSWeek 2017, Pittsburgh, PA, USA, April 21, 2017},
  year         = {2017},
  pages        = {1--6},
  doi          = {10.1145/3063386.3063767},
  url          = {https://doi.org/10.1145/3063386.3063767},
  category     = {workshop},
  contribution = {colab},
  keywords     = {fault detection, traffic sensors, Gaussian processes, route planning, anomaly detection},
  project      = {cps-reliability,smart-transit,smart-cities},
  tag          = {ai4cps,platform,incident,transit},
  file         = {:Ghafouri2017-Optimal_detection_of_faulty_traffic_sensors_used_in_route_planning.pdf:PDF},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/cpsweek/GhafouriLDK17},
  timestamp    = {Tue, 06 Nov 2018 16:59:05 +0100},
  what         = {This paper presents methods for detecting faulty traffic sensors using prediction models and Gaussian processes, with application to route planning optimization. The authors develop algorithms to identify optimal detection thresholds that minimize losses caused by false positives and false negatives. The work applies the methodology to real traffic data from downtown Los Angeles.},
  why          = {Faulty traffic sensors degrade performance of route planning systems and can cause suboptimal routing decisions. Existing detection methods often assume fixed thresholds without optimization. This work is innovative because it formulates sensor fault detection as an optimization problem that minimizes overall losses from sensor failures.},
  results      = {The paper demonstrates that using Gaussian process-based prediction models enables effective detection of faulty traffic sensors. Results show optimal detection thresholds minimize total losses from both false positives and false negatives in route planning applications.},
  project_tags = {transit, ML for CPS, Explainable AI}
}
In a smart city, real-time traffic sensors may be deployed for various applications, such as route planning. Unfortunately, sensors are prone to failures, which result in erroneous traffic data. Erroneous data can adversely affect applications such as route planning, and can cause increased travel time. To minimize the impact of sensor failures, we must detect them promptly and accurately. However, typical detection algorithms may lead to a large number of false positives (i.e., false alarms) and false negatives (i.e., missed detections), which can result in suboptimal route planning. In this paper, we devise an effective detector for identifying faulty traffic sensors using a prediction model based on Gaussian Processes. Further, we present an approach for computing the optimal parameters of the detector which minimize losses due to false-positive and false-negative errors. We also characterize critical sensors, whose failure can have high impact on the route planning application. Finally, we implement our method and evaluate it numerically using a real-world dataset and the route planning platform OpenTripPlanner.
@inproceedings{Hasan2017,
  author       = {Hasan, Saqib and Chhokra, Ajay and Dubey, Abhishek and Mahadevan, Nagabhushan and Karsai, Gabor and Jain, Rishabh and Lukic, Srdjan},
  title        = {A simulation testbed for cascade analysis},
  booktitle    = {{IEEE} Power {\&} Energy Society Innovative Smart Grid Technologies Conference, {ISGT} 2017, Washington, DC, USA, April 23-26, 2017},
  year         = {2017},
  pages        = {1--5},
  doi          = {10.1109/ISGT.2017.8086080},
  url          = {https://doi.org/10.1109/ISGT.2017.8086080},
  category     = {selectiveconference},
  contribution = {lead},
  keywords     = {cascading failures, protection devices, cyber faults, power systems, fault analysis},
  project      = {cps-reliability},
  tag          = {platform,power},
  file         = {:Hasan2017-A_simulation_testbed_for_cascade_analysis.pdf:PDF},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/isgt/HasanCDMKJL17},
  timestamp    = {Wed, 16 Oct 2019 14:14:57 +0200},
  what         = {This paper develops a simulation testbed for analyzing cascading failures in power systems, using behavioral models of protection devices including distance relays, overcurrent relays, and circuit breakers. The work incorporates cyber faults and studies their impact on cascading failure progression. The methodology enables contingency analysis with cyber fault injection.},
  why          = {Understanding how cyber faults affect power system protection devices is critical for grid security. Existing approaches often separate physical and cyber analysis. This work is innovative because it integrates behavioral models of protection devices with cyber fault scenarios to study coupled effects on cascade progression.},
  results      = {The paper demonstrates how cyber faults in protection assemblies (such as stuck open faults and spurious detection faults) can initiate or accelerate cascading failures in power grids. Results validate the testbed against IEEE 14-bus system behavior and identify critical cyber-physical failure combinations.},
  project_tags = {energy, emergency, CPS}
}
Electrical power systems are heavily instrumented with protection assemblies (relays and breakers) that detect anomalies and arrest failure propagation. However, failures in these discrete protection devices could have inadvertent consequences, including cascading failures resulting in blackouts. This paper aims to model the behavior of these discrete protection devices in nominal and faulty conditions and apply it towards simulation and contingency analysis of cascading failures in power transmission systems. The behavior under fault conditions are used to identify and explain conditions for blackout evolution which are not otherwise obvious. The results are demonstrated using a standard IEEE-14 Bus System.
@inproceedings{Khare2017,
  author       = {Khare, Shweta Prabhat and Sallai, J{\'{a}}nos and Dubey, Abhishek and Gokhale, Aniruddha S.},
  title        = {Short Paper: Towards Low-Cost Indoor Localization Using Edge Computing Resources},
  booktitle    = {20th {IEEE} International Symposium on Real-Time Distributed Computing, {ISORC} 2017, Toronto, ON, Canada, May 16-18, 2017},
  year         = {2017},
  pages        = {28--31},
  doi          = {10.1109/ISORC.2017.23},
  url          = {https://doi.org/10.1109/ISORC.2017.23},
  category     = {selectiveconference},
  contribution = {lead},
  keywords     = {indoor localization, BLE, ultra-wideband, RSSI fingerprinting, edge computing},
  project      = {cps-middleware},
  tag          = {transit},
  file         = {:Khare2017-Short_Paper_Towards_Low-Cost_Indoor_Localization_Using_Edge_Computing_Resources.pdf:PDF},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/isorc/KhareSDG17},
  timestamp    = {Wed, 16 Oct 2019 14:14:53 +0200},
  what         = {This paper presents a solution for low-cost indoor localization using Bluetooth Low Energy (BLE) and ultra-wideband (UWB) RF technologies. The approach combines RSSI fingerprinting with UWB ranging to achieve accurate position estimation on resource-constrained edge devices. The work demonstrates practical implementation using Intel Edison boards with BLE beacons.},
  why          = {Indoor localization for smart services requires cost-effective solutions that work on edge devices with limited power. BLE provides low power consumption but limited accuracy, while UWB provides accuracy at higher cost. This work is innovative because it combines these technologies to achieve practical accuracy with acceptable computational overhead.},
  results      = {The paper demonstrates sub-meter indoor localization accuracy using a hybrid approach combining BLE RSSI fingerprinting with UWB ranging on Intel Edison edge devices. Results show computation time under 1 millisecond and practical accuracy improvements over single-technology approaches.},
  project_tags = {CPS, middleware, scalable AI}
}
Emerging smart services, such as indoor smart parking or patient monitoring and tracking in hospitals, incur a significant technical roadblock stemming primarily from a lack of cost-effective and easily deployable localization framework that impedes their widespread deployment. To address this concern, in this paper we present a low-cost, indoor localization and navigation system, which performs continuous and real-time processing of Bluetooth Low Energy (BLE) and IEEE 802.15.4a compliant Ultra-wideband (UWB) sensor data to localize and navigate the concerned entity to its desired location. Our approach depends upon fusing the two feature sets, using the UWB to calibrate the BLE localization mechanism.
@article{Kvaternik2017,
  author        = {Kvaternik, Karla and Laszka, Aron and Walker, Michael and Schmidt, Douglas C. and Sturm, Monika and Lehofer, Martin and Dubey, Abhishek},
  title         = {Privacy-Preserving Platform for Transactive Energy Systems},
  journal       = {CoRR},
  volume        = {abs/1709.09597},
  year          = {2017},
  archiveprefix = {arXiv},
  eprint        = {1709.09597},
  url           = {http://arxiv.org/abs/1709.09597},
  contribution  = {lead},
  keywords      = {blockchain, transactive energy, privacy, smart contracts, distributed ledger, microgrids},
  project       = {transactive-energy,smart-energy},
  tag           = {decentralization,power},
  file          = {:Kvaternik2017-Privacy_Preserving_Platform_for_Transactive_Energy_Systems.pdf:PDF},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1709-09597},
  timestamp     = {Tue, 12 Nov 2019 00:00:00 +0100},
  what          = {This paper presents PETra, a privacy-preserving platform for transactive energy systems using blockchain and cryptographic techniques. The work addresses security, safety, and privacy requirements for energy trading in microgrids, implementing mechanisms for anonymous trading, secure smart contracts, and distributed ledger transparency. The paper includes detailed description of asset structures and trading workflows.},
  why           = {Transactive energy systems require careful attention to privacy while maintaining transparency and auditability of trades. This work is innovative because it provides a comprehensive framework addressing all three requirements simultaneously through blockchain-based architecture with cryptographic privacy mechanisms.},
  results       = {The paper demonstrates PETra handling energy and financial asset transfers in a real microgrid system, achieving 90-percent trade closure within 23 seconds. The platform successfully implements anonymous address usage while maintaining immutable transaction records and enabling system operator oversight.},
  project_tags  = {energy, scalable AI, middleware}
}
Transactive energy systems (TES) are emerging as a transformative solution for the problems faced by distribution system operators due to an increase in the use of distributed energy resources and a rapid acceleration in renewable energy generation. These, on one hand, pose a decentralized power system controls problem, requiring strategic microgrid control to maintain stability for the community and for the utility. On the other hand, they require robust financial markets operating on distributed software platforms that preserve privacy. In this paper, we describe the implementation of a novel, blockchain-based transactive energy system. We outline the key requirements and motivation of this platform, describe the lessons learned, and provide a description of key architectural components of this system.
@inproceedings{Laszka2017,
  author       = {Laszka, Aron and Dubey, Abhishek and Walker, Michael and Schmidt, Douglas C.},
  title        = {Providing privacy, safety, and security in IoT-based transactive energy systems using distributed ledgers},
  booktitle    = {Proceedings of the Seventh International Conference on the Internet of Things, {IOT} 2017, Linz, Austria, October 22-25, 2017},
  year         = {2017},
  pages        = {13:1--13:8},
  doi          = {10.1145/3131542.3131562},
  url          = {https://doi.org/10.1145/3131542.3131562},
  acceptance   = {31},
  category     = {selectiveconference},
  contribution = {lead},
  keywords     = {privacy, transactive energy, IoT, distributed ledger, smart contracts, prosumer privacy},
  project      = {cps-reliability,cps-blockchains,transactive-energy},
  tag          = {decentralization,power},
  file         = {:Laszka2017-Providing_privacy_safety_and_security_in_IoT-based_transactive_energy_systems_using_distributed_ledgers.pdf:PDF},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/iot/LaszkaDWS17},
  timestamp    = {Tue, 12 Nov 2019 00:00:00 +0100},
  what         = {This paper extends PETra with privacy-preserving mechanisms for IoT-based transactive energy systems using distributed ledgers. The work focuses on ensuring prosumer privacy while maintaining grid safety and security through anonymous trading, encrypted asset transfers, and auditable transactions. The paper provides detailed analysis of privacy and security requirements.},
  why          = {Privacy in transactive energy systems is challenging because protecting prosumer information can conflict with grid operator needs for system control. This work is innovative because it proposes privacy-preserving architectures that enable decentralized trading while allowing necessary system-level oversight without revealing individual transaction details.},
  results      = {The paper demonstrates privacy-preserving energy transactions on distributed ledgers with mechanisms for anonymous asset transfer while maintaining grid safety constraints. Results show that prosumer privacy and grid stability are simultaneously achievable through cryptographic techniques and smart contract enforcement.},
  project_tags = {energy, scalable AI}
}
Power grids are undergoing major changes due to rapid growth in renewable energy resources and improvements in battery technology. While these changes enhance sustainability and efficiency, they also create significant management challenges as the complexity of power systems increases. To tackle these challenges, decentralized Internet-of-Things (IoT) solutions are emerging, which arrange local communities into transactive microgrids. Within a transactive microgrid, “prosumers” (i.e., consumers with energy generation and storage capabilities) can trade energy with each other, thereby smoothing the load on the main grid using local supply. It is hard, however, to provide security, safety, and privacy in a decentralized and transactive energy system. On the one hand, prosumers’ personal information must be protected from their trade partners and the system operator. On the other hand, the system must be protected from careless or malicious trading, which could destabilize the entire grid. This paper describes Privacy-preserving Energy Transactions (PETra), which is a secure and safe solution for transactive microgrids that enables consumers to trade energy without sacrificing their privacy. PETra builds on distributed ledgers, such as blockchains, and provides anonymity for communication, bidding, and trading.
@inproceedings{Mukhopadhyay2017,
  author       = {Mukhopadhyay, Ayan and Vorobeychik, Yevgeniy and Dubey, Abhishek and Biswas, Gautam},
  title        = {Prioritized Allocation of Emergency Responders based on a Continuous-Time Incident Prediction Model},
  booktitle    = {Proceedings of the 16th Conference on Autonomous Agents and MultiAgent Systems, {AAMAS} 2017, S{\~{a}}o Paulo, Brazil, May 8-12, 2017},
  year         = {2017},
  pages        = {168--177},
  url          = {http://dl.acm.org/citation.cfm?id=3091154},
  acceptance   = {27},
  category     = {selectiveconference},
  contribution = {colab},
  keywords     = {incident prediction, emergency response, optimization, responder allocation, survival analysis},
  project      = {smart-emergency-response,smart-cities},
  tag          = {ai4cps,incident},
  file         = {:Mukhopadhyay2017-Prioritized_Allocation_of_Emergency_Responders_based_on_a_Continuous-Time_Incident_Prediction_Model.pdf:PDF},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/atal/MukhopadhyayVDB17},
  timestamp    = {Wed, 27 Sep 2017 07:24:00 +0200},
  what         = {This paper addresses emergency responder allocation in urban areas using incident prediction models and optimization algorithms. The work develops methods to predict incident arrival times and severities using survival analysis, then formulates responder allocation as an optimization problem balancing coverage and response times. A hierarchical clustering approach identifies incident patterns.},
  why          = {Emergency response efficiency depends on optimal responder placement and dispatch. Traditional approaches assume uniform incident distributions and fixed response times. This work is innovative because it combines predictive modeling of incident severity with optimization algorithms to adaptively allocate responders based on spatial-temporal patterns.},
  results      = {The paper demonstrates improved emergency response times through data-driven responder allocation in Nashville. Results show how incident prediction models enable intelligent dispatch that accounts for incident severity, reducing overall response times compared to traditional approaches.},
  project_tags = {emergency, planning, scalable AI, Explainable AI}
}
Efficient emergency response is a major concern in densely populated urban areas. Numerous techniques have been proposed to allocate emergency responders to optimize response times, coverage, and incident prevention. Effective response depends, in turn, on effective prediction of incidents occurring in space and time, a problem which has also received considerable prior attention. We formulate a non-linear mathematical program maximizing expected incident coverage, and propose a novel algorithmic framework for solving this problem. In order to aid the optimization problem, we propose a novel incident prediction mechanism. Prior art in incident prediction does not generally consider incident priorities which are crucial in optimal dispatch, and spatial modeling either considers each discretized area independently, or learns a homogeneous model. We bridge these gaps by learning a joint distribution of both incident arrival time and severity, with spatial heterogeneity captured using a hierarchical clustering approach. Moreover, our decomposition of the joint arrival and severity distributions allows us to independently learn the continuous-time arrival model, and subsequently use a multinomial logistic regression to capture severity, conditional on incident time. We use real traffic accident and response data from the urban area around Nashville, USA, to evaluate the proposed approach, showing that it significantly outperforms prior art as well as the real dispatch method currently in use.
@inproceedings{Nannapaneni2017,
  author       = {Nannapaneni, Saideep and Dubey, Abhishek and Mahadevan, Sankaran},
  title        = {Performance evaluation of smart systems under uncertainty},
  booktitle    = {2017 {IEEE} SmartWorld},
  year         = {2017},
  pages        = {1--8},
  doi          = {10.1109/UIC-ATC.2017.8397430},
  url          = {https://doi.org/10.1109/UIC-ATC.2017.8397430},
  acceptance   = {28},
  category     = {selectiveconference},
  contribution = {colab},
  keywords     = {uncertainty quantification, Bayesian networks, smart systems, performance evaluation, probabilistic inference},
  project      = {cps-reliability},
  tag          = {platform},
  file         = {:Nannapaneni2017-Performance_evaluation_of_smart_systems_under_uncertainty.pdf:PDF},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/uic/NannapaneniDM17},
  timestamp    = {Wed, 16 Oct 2019 14:14:50 +0200},
  what         = {This paper develops a model-based framework for uncertainty quantification in smart systems using dynamic Bayesian networks. The work addresses sensor uncertainty, hardware resource constraints, and system-level effects on performance metrics. The approach enables design exploration under uncertainty through probabilistic inference and Monte Carlo analysis.},
  why          = {Smart system design involves multiple uncertainty sources that significantly impact performance. Traditional deterministic design approaches cannot adequately address these uncertainties. This work is innovative because it provides systematic methodology using hierarchical Bayesian networks to propagate and analyze multiple uncertainty sources.},
  results      = {The paper demonstrates uncertainty quantification for smart indoor heating systems, analyzing how sensor uncertainty and communication delays affect comfort and energy performance. Results show the methodology enables exploration of design alternatives and identification of robust configurations.},
  project_tags = {CPS, ML for CPS, scalable AI}
}
This paper develops a model-based framework for the quantification and propagation of multiple uncertainty sources affecting the performance of a smart system. A smart system, in general, performs sensing, control and actuation for proper functioning of a physical subsystem (also referred to as a plant). With strong feedback coupling between several subsystems, the uncertainty in the quantities of interest (QoI) amplifies over time. The coupling in a generic smart system occurs at two levels: (1) coupling between individual subsystems (plant, cyber, actuation, sensors), and (2) coupling between nodes in a distributed computational subsystem. In this paper, a coupled smart system is decoupled and considered as a feed-forward system over time and modeled using a two-level Dynamic Bayesian Network (DBN), one at each level of coupling (between subsystems and between nodes). A DBN can aggregate uncertainty from multiple sources within a time step and across time steps. The DBN associated with a smart system can be learned using available system models, physics models and data. The proposed methodology is demonstrated for the design of a smart indoor heating system (identification of sensors and a wireless network) within cost constraints that enables room-by-room temperature control. We observe that sensor uncertainty has a higher impact on the performance of the heating system compared to the uncertainty in the wireless network.
@article{Nannapaneni2017a,
  author       = {Nannapaneni, S. and Mahadevan, S. and Dubey, A. and Lechevalier, D. and Narayanan, A. and Rachuri, S.},
  title        = {Automated Uncertainty Quantification Through Information Fusion in Manufacturing Processes},
  journal      = {Smart and Sustainable Manufacturing Systems},
  year         = {2017},
  volume       = {1},
  number       = {1},
  pages        = {153--177},
  issn         = {2520-6478},
  language     = {eng},
  contribution = {minor},
  keywords     = {uncertainty quantification, Bayesian networks, manufacturing, semantic models, automated analysis},
  file         = {:Nannapaneni2017a-Automated_Uncertainty_Quantification_through_Information_Fusion_in_Manufacturing_Processes.pdf:PDF},
  what         = {This paper presents automated uncertainty quantification for manufacturing processes using hierarchical Bayesian networks. The work develops methodology for constructing Bayesian networks from semantic system models and physics-based models, enabling automated propagation of multiple uncertainty sources through manufacturing systems. The approach is demonstrated on injection molding processes.},
  why          = {Manufacturing system optimization requires understanding how multiple uncertainty sources affect key performance indicators. Manual uncertainty analysis is labor-intensive and error-prone. This work is innovative because it automates Bayesian network construction from domain-specific models, enabling practitioners to incorporate uncertainty in design decisions.},
  results      = {The paper demonstrates automated uncertainty quantification for injection molding processes, successfully identifying key uncertainty sources and their effects on final product quality. The methodology enables systematic analysis of manufacturing performance under uncertainty.},
  project_tags = {scalable AI, ML for CPS}
}
Evaluation of key performance indicators (KPIs) such as energy consumption is essential for decision-making during the design and operation of smart manufacturing systems. The measurements of KPIs are strongly affected by several uncertainty sources such as input material uncertainty, the inherent variability in the manufacturing process, model uncertainty, and the uncertainty in the sensor measurements of operational data. A comprehensive understanding of the uncertainty sources and their effect on the KPIs is required to make the manufacturing processes more efficient. Towards this objective, this paper proposed an automated methodology to generate a hierarchical Bayesian network (HBN) for a manufacturing system from semantic system models, physics-based models, and available data in an automated manner, which can be used to perform uncertainty quantification (UQ) analysis. The semantic system model, which is a high-level model describing the system along with its parameters, is assumed to be available in the generic modeling environment (GME) platform. Apart from semantic description, physics-based models, if available, are assumed to be available in model libraries. The proposed methodology was divided into two tasks: (1) automated hierarchical Bayesian network construction using the semantic system model, available models and data, and (2) automated uncertainty quantification (UQ) analysis. A metamodel of an HBN was developed using the GME, along with a syntax representation for the associated conditional probability tables/distributions. The constructed HBN corresponding to a system was represented as an instance model of the HBN metamodel. On the metamodel, a model interpreter was written to be able to carry out the UQ analysis in an automated manner for any HBN instance model conforming to the HBN metamodel. The proposed methodologies are demonstrated using an injection molding process.
@inproceedings{Pettet2017,
author = {Pettet, Geoffrey and Nannapaneni, Saideep and Stadnick, Benjamin and Dubey, Abhishek and Biswas, Gautam},
booktitle = {2017 {IEEE} SmartWorld},
title = {Incident analysis and prediction using clustering and {Bayesian} network},
year = {2017},
acceptance = {28},
pages = {1--8},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/uic/PettetNSDB17},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/UIC-ATC.2017.8397587},
file = {:Pettet2017-Incident_analysis_and_prediction_using_clustering_and_Bayesian_network.pdf:PDF},
keywords = {incident prediction, clustering, Bayesian networks, survival analysis, urban analytics},
project = {smart-emergency-response,smart-cities},
tag = {ai4cps,incident},
timestamp = {Wed, 16 Oct 2019 14:14:50 +0200},
url = {https://doi.org/10.1109/UIC-ATC.2017.8397587},
what = {This paper presents a clustering and Bayesian network approach for incident analysis and prediction in urban areas. The work develops unsupervised methods for grouping incidents with similar characteristics and applies survival analysis to predict incident frequencies for specific spatial areas. The methodology integrates data preprocessing, clustering, and probabilistic prediction.},
why = {Incident prediction in large urban areas requires identifying patterns across diverse incident types and locations. Existing approaches often make oversimplified assumptions about incident distributions. This work is innovative because it combines clustering with Bayesian networks to learn incident patterns directly from data.},
results = {The paper demonstrates successful incident prediction for Nashville using real fire department data, achieving significantly higher accuracy than baseline models through cluster-specific prediction. Results show how unsupervised clustering improves prediction accuracy by identifying incident subgroups.},
project_tags = {emergency, ML for CPS, Explainable AI}
}
Advances in data collection and storage infrastructure offer an unprecedented opportunity to integrate both data and emergency resources in a city into a dynamic learning system that can anticipate and rapidly respond to heterogeneous incidents. In this paper, we describe integration methods for spatio-temporal incident forecasting using previously collected vehicular accident data provided to us by the Nashville Fire Department. The literature provides several techniques that focus on analyzing features and predicting accidents for specific situations (specific intersections in a city, or certain segments of a freeway, for example), but these models break down when applied to a large, general area consisting of many road and intersection types and other factors like weather conditions. We use Similarity Based Agglomerative Clustering (SBAC) analysis to categorize incidents to account for these variables. Thereafter, we use survival analysis to learn the likelihood of incidents per cluster. The mapping of the clusters to the spatial locations is achieved using a Bayesian network. The prediction methods we have developed lay the foundation for future work on an optimal emergency vehicle allocation and dispatch system in Nashville.
@inproceedings{Samal2017,
author = {Samal, Chinmaya and Sun, Fangzhou and Dubey, Abhishek},
booktitle = {2017 {IEEE} International Conference on Smart Computing, {SMARTCOMP} 2017, Hong Kong, China, May 29-31, 2017},
title = {{SpeedPro}: {A} Predictive Multi-Model Approach for Urban Traffic Speed Estimation},
year = {2017},
pages = {1--6},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/smartcomp/SamalSD17},
category = {workshop},
acceptance = {37.5},
contribution = {lead},
doi = {10.1109/SMARTCOMP.2017.7947048},
file = {:Samal2017-SpeedPro_A_Predictive_Multi-Model_Approach_for_Urban_Traffic_Speed_Estimation.pdf:PDF},
keywords = {traffic speed estimation, clustering, random forests, weather data, urban transportation},
project = {smart-transit,smart-cities},
tag = {ai4cps,transit},
timestamp = {Wed, 16 Oct 2019 14:14:54 +0200},
url = {https://doi.org/10.1109/SMARTCOMP.2017.7947048},
what = {This paper presents SpeedPro, a multi-model approach for urban traffic speed estimation using historical weather data and probe vehicle information. The work develops cluster-based prediction models that improve accuracy by grouping similar traffic conditions. The methodology integrates data from buses and weather sources to estimate real-time traffic speeds.},
why = {Accurate traffic speed estimation supports urban planning and transportation optimization. Existing approaches often fail to account for diverse traffic patterns across different regions and times. This work is innovative because it uses clustering to identify similar traffic conditions and develops separate prediction models for each cluster.},
results = {The paper demonstrates traffic speed prediction with RMSE error in the range of 2.9 to 3.3 miles per hour using cluster-based random forest models. Results show that accounting for weather and historical patterns improves prediction accuracy compared to models using only traffic data.},
project_tags = {transit, ML for CPS}
}
Data generated by GPS-equipped probe vehicles, especially public transit vehicles can be a reliable source for traffic speed estimation. Traditionally, this estimation is done by learning the parameters of a model that describes the relationship between the speed of the probe vehicle and the actual traffic speed. However, such approaches typically suffer from data sparsity issues. Furthermore, most state-of-the-art approaches do not consider the effect of weather and the driver of the probe vehicle on the parameters of the learned model. In this paper, we describe a multivariate predictive multi-model approach called SpeedPro that (a) first identifies similar clusters of operation from the historic data that includes the real-time position of the probe vehicle, the weather data, and anonymized driver identifier, and then (b) uses these different models to estimate the traffic speed in real-time as a function of current weather, driver and probe vehicle speed. When the real-time information is not available, our approach uses a different model that uses the historical weather and traffic information for estimation. Our results show that the purely historical data is less accurate than the model that uses the real-time information.
@inproceedings{Sun2017,
author = {Sun, Fangzhou and Dubey, Abhishek and White, Jules},
booktitle = {2017 {IEEE} International Conference on Big Data, BigData 2017, Boston, MA, USA, December 11-14, 2017},
title = {{DxNAT} - Deep neural networks for explaining non-recurring traffic congestion},
year = {2017},
pages = {2141--2150},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/bigdataconf/SunDW17},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/BigData.2017.8258162},
file = {:Sun2017-DxNAT-Deep_neural_networks_for_explaining_non-recurring_traffic_congestion.pdf:PDF},
keywords = {traffic congestion, deep learning, anomaly detection, non-recurring congestion, event detection},
project = {smart-transit,smart-cities,cps-reliability},
tag = {ai4cps,transit},
timestamp = {Wed, 16 Oct 2019 14:14:51 +0200},
url = {https://doi.org/10.1109/BigData.2017.8258162},
what = {This paper presents DxNAT, a deep neural network approach for identifying and explaining non-recurring traffic congestion caused by events. The work converts traffic data to images and applies convolutional neural networks to classify congestion patterns. The methodology demonstrates high accuracy in detecting event-related congestion from traffic sensor data.},
why = {Most traffic congestion research focuses on recurring patterns, leaving non-recurring congestion underexplored. Events cause significant congestion but are difficult to detect from traffic data alone. This work is innovative because it applies deep learning to identify event-caused congestion and provides methods to explain network predictions.},
results = {The paper achieves 98.73 percent accuracy in identifying non-recurring traffic congestion using deep neural networks. Results demonstrate successful detection of congestion caused by sports events and accidents, enabling better understanding of event impacts on urban traffic.},
project_tags = {transit, ML for CPS, Explainable AI}
}
Non-recurring traffic congestion is caused by temporary disruptions, such as accidents, sports games, adverse weather, etc. We use data related to real-time traffic speed, jam factors (a traffic congestion indicator), and events collected over a year from Nashville, TN to train a multi-layered deep neural network. The traffic dataset contains over 900 million data records. The network is thereafter used to classify the real-time data and identify anomalous operations. Compared with traditional approaches of using statistical or machine learning techniques, our model reaches an accuracy of 98.73 percent when identifying traffic congestion caused by football games. Our approach first encodes the traffic across a region as a scaled image. After that the image data from different timestamps is fused with event- and time-related data. Then a crossover operator is used as a data augmentation method to generate training datasets with more balanced classes. Finally, we use the receiver operating characteristic (ROC) analysis to tune the sensitivity of the classifier. We present the analysis of the training time and the inference time separately.
@inproceedings{Sun2017a,
  author       = {Sun, Fangzhou and Samal, Chinmaya and White, Jules and Dubey, Abhishek},
  title        = {Unsupervised Mechanisms for Optimizing On-Time Performance of Fixed Schedule Transit Vehicles},
  booktitle    = {2017 {IEEE} International Conference on Smart Computing, {SMARTCOMP} 2017, Hong Kong, China, May 29-31, 2017},
  year         = {2017},
  pages        = {1--8},
  doi          = {10.1109/SMARTCOMP.2017.7947057},
  url          = {https://doi.org/10.1109/SMARTCOMP.2017.7947057},
  acceptance   = {37.5},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/smartcomp/SunSWD17},
  category     = {selectiveconference},
  contribution = {lead},
  file         = {:Sun2017a-Unsupervised_Mechanisms_for_Optimizing_On-Time_Performance_of_Fixed_Schedule_Transit_Vehicles.pdf:PDF},
  keywords     = {transit scheduling, genetic algorithms, on-time performance, optimization, clustering},
  project      = {smart-transit,smart-cities},
  tag          = {ai4cps,transit},
  timestamp    = {Wed, 16 Oct 2019 14:14:54 +0200},
  what         = {This paper addresses on-time performance optimization for fixed-schedule transit vehicles using unsupervised mechanisms. The work develops genetic algorithms and greedy approaches to generate schedules that maximize the probability of buses arriving within desired time windows. The methodology uses clustering to identify similar travel time patterns across different temporal periods.},
  why          = {Transit schedule reliability depends on realistic time estimates and timetable design. Manual schedule adjustment is labor-intensive and may not identify optimal solutions. This work is innovative because it combines statistical analysis of travel time patterns with optimization algorithms to generate transit schedules.},
  results      = {The paper demonstrates genetic algorithm optimization improving average on-time performance from 62.9 percent to 74.7 percent on Nashville transit routes. Results show how clustering monthly travel patterns enables generation of better transit schedules that account for seasonal variations.},
  project_tags = {transit, planning, scalable AI}
}
The on-time arrival performance of vehicles at stops is a critical metric for both riders and city planners to evaluate the reliability of a transit system. However, it is a non-trivial task for transit agencies to adjust the existing bus schedule to optimize the on-time performance for the future. For example, severe weather conditions and special events in the city could slow down traffic and cause bus delay. Furthermore, the delay of previous trips may affect the initial departure time of consecutive trips and generate accumulated delay. In this paper, we formulate the problem as a single-objective optimization task with constraints and propose a greedy algorithm and a genetic algorithm to generate bus schedules at timepoints that improve the bus on-time performance at timepoints which is indicated by whether the arrival delay is within the desired range. We use the Nashville bus system as a case study and simulate the optimization performance using historical data. The comparative analysis of the results identifies that delay patterns change over time and reveals the efficiency of the greedy and genetic algorithms.
@inproceedings{Tan2017,
author = {Tan, Joshua and Kendrick, Christine and Dubey, Abhishek and Rhee, Sokwoo},
booktitle = {Proceedings of the 2nd International Workshop on Science of Smart City Operations and Platforms Engineering, SCOPE@CPSWeek 2017, Pittsburgh, PA, USA, April 21, 2017},
title = {Indicator frameworks},
year = {2017},
pages = {19--25},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/cpsweek/TanKDR17},
category = {workshop},
contribution = {minor},
doi = {10.1145/3063386.3063762},
file = {:Tan2017-indicator_frameworks.pdf:PDF},
keywords = {indicator frameworks, category theory, process theory, causal networks, correlation analysis, system design, operational indicators},
project = {smart-cities},
timestamp = {Tue, 06 Nov 2018 16:59:05 +0100},
url = {https://doi.org/10.1145/3063386.3063762},
what = {This work develops abstract indicator frameworks, a diagrammatic tool for constructing correlations between random variables in systems. The approach models operational indicators using process theory, including composition and tensoring of processes. The paper extends categorical theory approaches to indicator analysis, using Rand diagrams and process theories to capture both causal and statistical aspects of complex systems.},
why = {Abstract indicator frameworks address a critical gap in operational indicator design by providing mathematically rigorous foundations for understanding relationships between system variables. This is innovative because it bridges category theory, probabilistic reasoning, and practical system design, enabling more sophisticated analyses that account for mediating variables and complex interdependencies often missed in traditional frameworks.},
results = {The paper demonstrates how abstract indicator frameworks can represent diverse systems including air pollution monitoring and urban planning scenarios. It shows how correlation networks can analyze high-dimensional systems by focusing on weighted correlations and soft thresholds. The approach successfully enables intuitive system descriptions while maintaining mathematical precision for causal and statistical interpretation.},
project_tags = {planning}
}
We develop a diagrammatic tool for constructing correlations between random variables, called an abstract indicator framework. Abstract indicator frameworks are modeled on operational (key performance) indicator frameworks as they are used in city planning and project governance, and give a rigorous, statistically-motivated process for constructing operational indicator frameworks.
@inproceedings{Voelgyesi2017,
  author       = {V{\"{o}}lgyesi, P{\'{e}}ter and Dubey, Abhishek and Krentz, Timothy and Madari, Istv{\'{a}}n and Metelko, Mary and Karsai, Gabor},
  title        = {Time synchronization services for low-cost fog computing applications},
  booktitle    = {International Symposium on Rapid System Prototyping, {RSP} 2017, Shortening the Path from Specification to Prototype, October 19-20, 2017, Seoul, South Korea},
  year         = {2017},
  pages        = {57--63},
  doi          = {10.1145/3130265.3130325},
  url          = {https://doi.org/10.1145/3130265.3130325},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/rsp/VolgyesiDKMMK17},
  category     = {selectiveconference},
  contribution = {colab},
  file         = {:Voelgyesi2017-Time_synchronization_services_for_low-cost_fog_computing_applications.pdf:PDF},
  keywords     = {time synchronization, fog computing, GPS, PTP, distributed systems, edge computing, real-time systems, microsecond accuracy},
  project      = {cps-middleware,cps-reliability},
  tag          = {platform,decentralization},
  timestamp    = {Tue, 06 Nov 2018 11:07:11 +0100},
  what         = {This paper presents time synchronization infrastructure for low-cost fog computing platforms running distributed embedded systems. The work describes hardware and software implementations for achieving accurate time coordination across geographically dispersed edge computing nodes using GPS and PTP-based technologies. The platform supports multiple clock domains with master-slave hierarchies and distributed time synchronization mechanisms.},
  why          = {Time synchronization is critical for distributed embedded systems requiring coordinated actions across multiple nodes, especially in applications with tight timing constraints. This work innovates by providing a complete end-to-end solution for GPS-synchronized time services on low-cost edge devices, enabling microsecond-level accuracy critical for real-time control applications without requiring expensive specialized hardware.},
  results      = {The implementation achieves sub-microsecond synchronization accuracy across edge computing platforms using BeagleBone Black hardware. The evaluation demonstrates GPS to master-node PHC synchronization within 1 microsecond variance, and master-to-slave PHC synchronization achieving tight sub-microsecond coordination. These results validate the feasibility of accurate time coordination for low-cost fog computing applications.},
  project_tags = {CPS, middleware}
}
This paper presents the time synchronization infrastructure for a low-cost run-time platform and application framework specifically targeting Smart Grid applications. Such distributed applications require the execution of reliable and accurate time-coordinated actions and observations both within islands of deployments and across geographically distant nodes. The time synchronization infrastructure is built on well-established technologies: GPS, NTP, PTP, PPS and Linux with real-time extensions, running on low-cost BeagleBone Black hardware nodes. We describe the architecture, implementation, instrumentation approach, performance results and present an example from the application domain. Also, we discuss an important finding on the effect of the Linux RT_PREEMPT real-time patch on the accuracy of the PPS subsystem and its use for GPS-based time references.
@inproceedings{Walker2017,
author = {Walker, Michael A. and Dubey, Abhishek and Laszka, Aron and Schmidt, Douglas C.},
booktitle = {Proceedings of the 4th Workshop on Middleware and Applications for the Internet of Things, M4IoT@Middleware 2017, Las Vegas, NV, USA, December 11, 2017},
title = {{PlaTIBART}: a platform for transactive {IoT} blockchain applications with repeatable testing},
year = {2017},
pages = {17--22},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/middleware/WalkerDLS17},
category = {workshop},
contribution = {lead},
doi = {10.1145/3152141.3152392},
file = {:Walker2017-PlaTIBART_a_platform_for_transactive_IoT_blockchain_applications_with_repeatable_testing.pdf:PDF},
keywords = {blockchain, IoT, testing, transactional systems, domain-specific languages, distributed systems, fault tolerance},
project = {transactive-energy,cps-middleware,cps-reliability},
tag = {decentralization},
timestamp = {Tue, 06 Nov 2018 00:00:00 +0100},
url = {https://doi.org/10.1145/3152141.3152392},
what = {This paper presents PlaTIBART, a platform for transactive IoT blockchain applications with repeatable testing capabilities. The work combines the Actor pattern with Domain Specific Language (DSL) design to enable systematic development and testing of blockchain-based IoT systems. It provides a three-tier architecture including IoT middleware, distributed database, and blockchain layers with custom test networks.},
why = {Blockchain-enabled IoT applications require new patterns for ensuring security, auditability, and fault tolerance beyond traditional systems. PlaTIBART is innovative in providing systematic engineering approaches for blockchain-IoT integration through custom DSLs and the Observer pattern, enabling developers to construct reproducible test environments for validating transactional integrity across distributed IoT devices.},
results = {The platform successfully demonstrates transactive energy system testing with repeatable network scenarios. Experimental validation shows that PlaTIBART scales linearly with increasing client numbers and provides consistent management of blockchain test networks. The implementation validates the feasibility of systematic testing and failure detection in blockchain-IoT applications without requiring complex manual setup.},
project_tags = {energy, middleware, CPS}
}
With the advent of blockchain-enabled IoT applications, there is an increased need for related software patterns, middleware concepts, and testing practices to ensure adequate quality and productivity. IoT and blockchain each provide different design goals, concepts, and practices that must be integrated, including the distributed actor model and fault tolerance from IoT and transactive information integrity over untrustworthy sources from blockchain. Both IoT and blockchain are emerging technologies and both lack codified patterns and practices for development of applications when combined. This paper describes PlaTIBART, which is a platform for transactive IoT blockchain applications with repeatable testing that combines the Actor pattern (which is a commonly used model of computation in IoT) together with a custom Domain Specific Language (DSL) and test network management tools. We show how PlaTIBART has been applied to develop, test, and analyze fault-tolerant IoT blockchain applications.
@inproceedings{Pradhan2016d,
author = {Pradhan, S. and Dubey, Abhishek and Neema, S. and Gokhale, A.},
booktitle = {2016 1st International Workshop on Science of Smart City Operations and Platforms Engineering (SCOPE) in partnership with Global City Teams Challenge (GCTC) (SCOPE - GCTC)},
title = {Towards a generic computation model for smart city platforms},
year = {2016},
month = apr,
pages = {1--6},
category = {workshop},
contribution = {colab},
doi = {10.1109/SCOPE.2016.7515059},
file = {:Pradhan2016d-Towards_a_Generic_Computation_Model_for_Smart_City_Platforms.pdf:PDF},
keywords = {smart city, cyber-physical systems, computation models, edge computing, task graphs, distributed systems, resource management},
tag = {platform},
what = {This paper addresses generic computation models for smart city platforms supporting cyber-physical systems with heterogeneous application types. The CHARIOT computation model represents distributed computations as task graphs supporting diverse timing requirements and data processing patterns. The model enables mapping of time-driven, batch, and stream processing onto common computation abstractions.},
why = {Smart cities require supporting multiple concurrent applications with different computational requirements (real-time, near real-time, and batch) on shared infrastructure. CHARIOT innovates by providing a unified computation model that accommodates heterogeneous application types through tasklets and transport abstractions, enabling efficient resource sharing while preserving quality-of-service requirements.},
results = {The paper demonstrates how CHARIOT maps to existing computation patterns and scales across multiple edge and cloud computing nodes. It shows successful integration of control applications, edge analytics, and data processing workflows within a single platform. The validation using transactive energy and traffic control scenarios confirms the model's ability to support diverse smart city applications.},
project_tags = {CPS, middleware, scalable AI}
}
Smart emergency response systems, smart transportation systems, smart parking spaces are some examples of multi-domain smart city systems that require large-scale, open platforms for integration and execution. These platforms illustrate high degree of heterogeneity. In this paper, we focus on software heterogeneity arising from different types of applications. The source of variability among applications stems from (a) timing requirements, (b) rate and volume of data they interact with, and (c) behavior depending on whether they are stateful or stateless. These variations result in applications with different computation models. However, a smart city system can comprise multi-domain applications with different types and therefore computation models. As such, a key challenge that arises is that of integration; we require some mechanism to facilitate integration and interaction between applications that use different computation models. In this paper, we first identify computation models based on different application types. Second, we present a generic computation model and explain how it can map to previously identified computation models. Finally, we briefly describe how the generic computation model fits in our overall smart city platform architecture.
@inproceedings{Biswas2016,
  author       = {Biswas, Gautam and Khorasgani, Hamed and Stanje, Gerald and Dubey, Abhishek and Deb, Somnath and Ghoshal, Sudipto},
  title        = {An application of data driven anomaly identification to spacecraft telemetry data},
  booktitle    = {Prognostics and Health Management Conference},
  year         = {2016},
  category     = {conference},
  contribution = {colab},
  file         = {:Biswas2016-An_application_of_data_driven_anomaly_identification_to_spacecraft_telemetry_data.pdf:PDF},
  keywords     = {anomaly detection, spacecraft telemetry, wavelet analysis, clustering, unsupervised learning, feature extraction, health monitoring},
  tag          = {ai4cps},
  what         = {This work proposes a mixed method combining unsupervised learning and expert analysis for detecting anomalies in spacecraft telemetry data. The approach divides mission timelines into segments, applies wavelet transforms for feature extraction, and uses hierarchical clustering to group normal versus anomalous operational behaviors. Expert input validates clustering results and identifies special operating modes.},
  why          = {Spacecraft telemetry monitoring requires detecting unanticipated failures and operational anomalies without complete prior knowledge of system behaviors. This work is innovative in combining data-driven clustering with expert knowledge to differentiate between specialized operational modes and genuine anomalies, enabling more effective real-time monitoring and failure prevention in complex space missions.},
  results      = {The method successfully identifies anomalies in LADEE spacecraft electrical power system data across 223 mission days with extensive telemetry. Wavelet transform feature extraction combined with hierarchical clustering effectively groups similar operational patterns. The expert-guided interpretation distinguishes nominal operations from anomalous behaviors, validating the mixed-method approach for spacecraft health monitoring.},
  project_tags = {ML for CPS}
}
In this paper, we propose a mixed method for analyzing telemetry data from a robotic space mission. The idea is to first apply unsupervised learning methods to the telemetry data divided into temporal segments. The large clusters that ensue typically represent the nominal operations of the spacecraft and are not of interest from an anomaly detection viewpoint. However, the smaller clusters and outliers that result from this analysis may represent specialized modes of operation, e.g., conduct of a specialized experiment on board the spacecraft, or they may represent true anomalous or unexpected behaviors. To differentiate between specialized modes and anomalies, we employ a supervised method of consulting human mission experts in the approach presented in this paper. Our longer term goal is to develop more automated methods for detecting anomalies in time series data, and once anomalies are identified, use feature selection methods to build online detectors that can be used in future missions, thus contributing to making operations more effective and improving overall safety of the mission.
@article{Biswas2016a,
author = {Biswas, Gautam and Khorasgani, Hamed and Stanje, Gerald and Dubey, Abhishek and Deb, Somnath and Ghoshal, Sudipto},
journal = {International Journal of Prognostics and Health Management},
title = {An approach to mode and anomaly detection with spacecraft telemetry data},
year = {2016},
contribution = {colab},
file = {:Biswas2016a-An_approach_to_mode_and_anomaly_detection_with_spacecraft_telemetry_data.pdf:PDF},
keywords = {mode detection, anomaly detection, spacecraft systems, unsupervised learning, temporal analysis, hybrid methods, health monitoring},
tag = {ai4cps},
what = {This paper discusses mode and anomaly detection approaches for spacecraft telemetry combining unsupervised learning with expert knowledge. The method applies data-driven techniques to identify temporal patterns representing operational modes, distinguishes between specialized and anomalous modes using hybrid approaches. The framework enables automated detection while accounting for temporal dynamics in telemetry signatures.},
why = {Spacecraft operations involve complex mode transitions and potential anomalies difficult to characterize a priori. This approach innovates by combining unsupervised learning to discover operational patterns with expert consultation to validate findings, enabling detection of new anomalies while avoiding false positives from specialized operational scenarios previously unknown to the detection system.},
results = {The framework successfully demonstrates mode classification and anomaly identification in spacecraft power system data through hierarchical clustering and feature analysis. Temporal pattern matching reveals distinct operational signatures for different mission phases. Expert validation confirms the approach effectively separates nominal specialized modes from true anomalous behaviors in long-duration space missions.},
project_tags = {ML for CPS}
}
This paper discusses a mixed method that combines unsupervised learning methods and human expert input for analyzing telemetry data from long-duration robotic space missions. Our goal is to develop more automated methods detecting anomalies in time series data. Once anomalies are identified using unsupervised learning methods we use feature selection methods followed by expert input to derive the knowledge required for building on-line detectors. These detectors can be used in later phases of the current mission and in future missions for improving operations and overall safety of the mission. Whereas the primary focus in this paper is on developing data-driven anomaly detection methods, we also present a computational platform for data mining and analytics that can operate on historical data offline, as well as incoming telemetry data on-line.
@inproceedings{Chhokra2016,
  author       = {Chhokra, Ajay and Dubey, Abhishek and Mahadevan, Nagabhushan and Karsai, Gabor},
  title        = {Poster Abstract: Distributed Reasoning for Diagnosing Cascading Outages in Cyber Physical Energy Systems},
  booktitle    = {7th {ACM/IEEE} International Conference on Cyber-Physical Systems, {ICCPS} 2016, Vienna, Austria, April 11-14, 2016},
  year         = {2016},
  pages        = {33:1},
  doi          = {10.1109/ICCPS.2016.7479113},
  url          = {https://doi.org/10.1109/ICCPS.2016.7479113},
  biburl       = {https://dblp.org/rec/bib/conf/iccps/ChhokraDMK16},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  timestamp    = {Wed, 16 Oct 2019 14:14:57 +0200},
  file         = {:Chhokra2016-Poster_Abstract_Distributed_Reasoning_for_Diagnosing_Cascading_Outages_in_Cyber_Physical_Energy_Systems.pdf:PDF},
  category     = {poster},
  contribution = {lead},
  project      = {cps-reliability},
  tag          = {platform},
  keywords     = {power systems, failure diagnosis, cascading outages, distributed reasoning, temporal causal models, protection systems},
  what         = {This poster presents a distributed reasoning approach for diagnosing cascading outages in cyber-physical energy systems. The work uses temporal causal diagrams to model failure propagation through power grid protection equipment, enabling diagnosis of complex failure scenarios where protection elements interact across the system.},
  why          = {Power grid failures often cascade through complex interdependencies between protection devices and system components. This work is innovative in applying distributed online diagnostic reasoning with temporal causal models to enable systematic root-cause analysis of cascading blackouts, accounting for timing delays and interdependencies in protection device operations.},
  results      = {The approach successfully diagnoses IEEE 14-Bus test cases showing how protection element misconfiguration leads to cascading failures. Temporal causal reasoning correctly identifies failure sequences including protection equipment misoperations and resulting blackout scenarios. The method demonstrates feasibility of systematic diagnosis for complex power system failure modes.},
  project_tags = {energy, CPS}
}
The power grid incorporates a number of protection elements such as distance relays that detect faults and prevent the propagation of failure effects from influencing the rest of the system. However, the decision of these protection elements is only influenced by local information in the form of bus voltage/current (V-I) samples. Due to lack of system wide perspective, erroneous settings, and latent failure modes, protection devices often mis-operate and cause cascading effects that ultimately lead to blackouts. Blackouts around the world have been triggered or worsened by circuit breakers tripping, including the blackout of 2003 in North America, where the secondary/remote protection relays incorrectly opened the breaker. Tools that aid the operators in finding the root cause of the problem on-line are required. However, high system complexity and the interdependencies between the cyber and physical elements of the system and the mis-operation of protection devices make the failure diagnosis a challenging problem.
@inproceedings{Dubey2016,
  author       = {Dubey, Abhishek and Pradhan, Subhav and Schmidt, Douglas C. and Rusitschka, Sebnem and Sturm, Monika},
  title        = {The Role of Context and Resilient Middleware in Next Generation Smart Grids},
  booktitle    = {Proceedings of the 3rd Workshop on Middleware for Context-Aware Applications in the IoT, M4IoT@Middleware 2016, Trento, Italy, December 12-13, 2016},
  year         = {2016},
  pages        = {1--6},
  doi          = {10.1145/3008631.3008632},
  url          = {https://doi.org/10.1145/3008631.3008632},
  biburl       = {https://dblp.org/rec/bib/conf/middleware/DubeyPSRS16},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  timestamp    = {Tue, 06 Nov 2018 16:57:13 +0100},
  file         = {:Dubey2016-The_Role_of_Context_and_Resilient_Middleware_in_Next_Generation_Smart_Grids.pdf:PDF},
  category     = {workshop},
  contribution = {lead},
  project      = {cps-reliability,cps-middleware},
  tag          = {platform,power},
  keywords     = {smart grids, middleware, context-awareness, adaptive systems, resilience, protection systems, system reconfiguration},
  what         = {This work examines the role of context and resilient middleware in next-generation smart grids. It presents intelligent devices and context-aware middleware enabling smart grid adaptation to changing operational requirements. The CHARIOT middleware platform provides goal-based system descriptions supporting adaptive protection and system reconfiguration in response to contextual changes.},
  why          = {Modern smart grids face volatile distributed energy sources and changing operational constraints requiring adaptive response. This work innovates by introducing context-aware middleware that enables decoupling of software functionality from static hardware configurations, allowing dynamic adaptation of protection and control based on real-time system context and changing grid conditions.},
  results      = {The paper demonstrates CHARIOT's capability to model smart grid systems with context-driven reconfigurations through goal-based system descriptions. Case studies show adaptive protection mechanisms responding to equipment failures and changing system states. The framework validates that middleware-driven adaptation maintains system resilience without sacrificing functional correctness.},
  project_tags = {energy, middleware, CPS}
}
The emerging trends of volatile distributed energy resources and micro-grids are putting pressure on electrical power system infrastructure. This pressure is motivating the integration of digital technology and advanced power-industry practices to improve the management of distributed electricity generation, transmission, and distribution, thereby creating a web of systems. Unlike legacy power system infrastructure, however, this emerging next-generation smart grid should be context-aware and adaptive to enable the creation of applications needed to enhance grid robustness and efficiency. This paper describes key factors that are driving the architecture of smart grids and describes orchestration middleware needed to make the infrastructure resilient. We use an example of adaptive protection logic in smart grid substations as a use case to motivate the need for context-awareness and adaptivity.
@inproceedings{Emfinger2016,
  author       = {Emfinger, William and Dubey, Abhishek and V{\"{o}}lgyesi, P{\'{e}}ter and Sallai, J{\'{a}}nos and Karsai, Gabor},
  title        = {Demo Abstract: {RIAPS} - {A} Resilient Information Architecture Platform for Edge Computing},
  booktitle    = {{IEEE/ACM} Symposium on Edge Computing, {SEC} 2016, Washington, DC, USA, October 27-28, 2016},
  year         = {2016},
  pages        = {119--120},
  doi          = {10.1109/SEC.2016.23},
  url          = {https://doi.org/10.1109/SEC.2016.23},
  biburl       = {https://dblp.org/rec/bib/conf/edge/EmfingerDVSK16},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  timestamp    = {Wed, 16 Oct 2019 14:14:56 +0200},
  file         = {:Emfinger2016-Demo_Abstract_RIAPS-A_Resilient_Information_Architecture_Platform_for_Edge_Computing.pdf:PDF},
  category     = {poster},
  contribution = {lead},
  project      = {cps-middleware},
  tag          = {platform,decentralization,power},
  keywords     = {edge computing, smart grid, resilient systems, distributed applications, middleware, platform architecture, fault tolerance},
  what         = {This demo abstract presents RIAPS, a Resilient Information Architecture Platform for Smart Grid applications. The platform provides software architecture for deploying distributed intelligent applications on edge computing platforms for grid monitoring and control. It demonstrates a traffic light control scenario using embedded computing nodes communicating in Hardware-in-Loop testbed configurations.},
  why          = {Smart grid and edge computing applications require systematic platforms enabling development, deployment, and management of resilient distributed applications. RIAPS innovates by providing a complete middleware stack with adaptive scheduling and fault management capabilities specific to critical infrastructure applications requiring both reliability and flexibility.},
  results      = {RIAPS successfully demonstrates traffic intersection control application deployment across multiple edge computing nodes in Hardware-in-Loop testing. The platform supports dynamic composition of computing and communication networks with fault tolerance and adaptive scheduling. Experimental validation shows feasibility of deploying complex CPS applications using the RIAPS architecture.},
  project_tags = {energy, CPS, middleware, scalable AI}
}
The emerging CPS/IoT ecosystem platforms such as Beaglebone Black, Raspberry Pi, Intel Edison and other edge devices such as SCALE, Paradrop are providing new capabilities for data collection, analysis and processing at the edge (also referred to as Fog Computing). This allows the dynamic composition of computing and communication networks that can be used to monitor and control the physical phenomena closer to the physical system. However, there are still a number of challenges that exist and must be resolved before we see wider applicability of these platforms for applications in safety-critical application domains such as Smart Grid and Traffic Control.
@article{Martins2016,
  author       = {Martins, Gon{\c{c}}alo and Moondra, Arul and Dubey, Abhishek and Bhattacharjee, Anirban and Koutsoukos, Xenofon D.},
  title        = {Computation and Communication Evaluation of an Authentication Mechanism for Time-Triggered Networked Control Systems},
  journal      = {Sensors},
  year         = {2016},
  volume       = {16},
  number       = {8},
  pages        = {1166},
  doi          = {10.3390/s16081166},
  url          = {https://doi.org/10.3390/s16081166},
  biburl       = {https://dblp.org/rec/bib/journals/sensors/MartinsMDBK16},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  timestamp    = {Wed, 14 Nov 2018 00:00:00 +0100},
  file         = {:Martins2016-Computation_and_Communication_Evaluation_of_an_Authentication_Mechanism_for_Time-Triggered_Networked_Control_Systems.pdf:PDF},
  contribution = {minor},
  project      = {cps-middleware,cps-reliability},
  tag          = {platform},
  keywords     = {time-triggered systems, authentication, HMAC, network security, cyber-physical systems, automotive systems, real-time systems},
  what         = {This paper presents computational and communication evaluation of an authentication mechanism for Time-Triggered (TT) networked control systems. It evaluates HMAC-based message authentication for ensuring data integrity in automotive control applications. The work analyzes both computational overhead on embedded processors and communication bandwidth requirements across wired and wireless platforms.},
  why          = {Time-Triggered architectures provide deterministic communication guarantees for safety-critical systems, but adding security mechanisms risks violating timing constraints. This work innovates by comprehensively evaluating HMAC authentication overhead in TT systems, demonstrating that security can be added without compromising real-time guarantees critical for automotive and control applications.},
  results      = {Experimental evaluation on both wired (TTEthernet) and wireless platforms shows HMAC implementation feasibility with minimal overhead. Results demonstrate scalable performance supporting multiple communication platforms while maintaining timing requirements. The work validates that secure communication mechanisms can be integrated into TT systems without jeopardizing temporal predictability.},
  project_tags = {CPS, middleware}
}
In modern networked control applications, confidentiality and integrity are important features to address in order to prevent against attacks. Moreover, network control systems are a fundamental part of the communication components of current cyber-physical systems (e.g., automotive communications). Many networked control systems employ Time-Triggered (TT) architectures that provide mechanisms enabling the exchange of precise and synchronous messages. TT systems have computation and communication constraints, and with the aim to enable secure communications in the network, it is important to evaluate the computational and communication overhead of implementing secure communication mechanisms. This paper presents a comprehensive analysis and evaluation of the effects of adding a Hash-based Message Authentication (HMAC) to TT networked control systems. The contributions of the paper include (1) the analysis and experimental validation of the communication overhead, as well as a scalability analysis that utilizes the experimental result for both wired and wireless platforms and (2) an experimental evaluation of the computational overhead of HMAC based on a kernel-level Linux implementation. An automotive application is used as an example, and the results show that it is feasible to implement a secure communication mechanism without interfering with the existing automotive controller execution times. The methods and results of the paper can be used for evaluating the performance impact of security mechanisms and, thus, for the design of secure wired and wireless TT networked control systems.
@inproceedings{Nannapaneni2016,
  author       = {Nannapaneni, Saideep and Mahadevan, Sankaran and Pradhan, Subhav and Dubey, Abhishek},
  title        = {Towards Reliability-Based Decision Making in Cyber-Physical Systems},
  booktitle    = {2016 {IEEE} International Conference on Smart Computing, {SMARTCOMP} 2016, St Louis, MO, USA, May 18-20, 2016},
  year         = {2016},
  pages        = {1--6},
  note         = {At Workshop},
  doi          = {10.1109/SMARTCOMP.2016.7501724},
  url          = {https://doi.org/10.1109/SMARTCOMP.2016.7501724},
  biburl       = {https://dblp.org/rec/bib/conf/smartcomp/NannapaneniMPD16},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  timestamp    = {Wed, 16 Oct 2019 14:14:54 +0200},
  file         = {:Nannapaneni2016-Towards_Reliability-Based_Decision_Making_in_Cyber-Physical_Systems.pdf:PDF},
  category     = {workshop},
  contribution = {lead},
  project      = {cps-reliability},
  tag          = {platform},
  keywords     = {reliability analysis, cyber-physical systems, software reliability, hardware reliability, functional decomposition, smart parking, component modeling},
  what         = {This work develops a reliability evaluation framework for cyber-physical systems incorporating both software and hardware reliability. It addresses reliability analysis by considering functional decomposition of systems and component dependencies. The framework models failure rates across sensors, software, communication systems, and hardware to estimate overall system reliability.},
  why          = {CPS reliability evaluation is challenging because software and hardware failures are interdependent and traditional approaches fail to account for tight coupling. This work innovates by providing integrated reliability analysis that decomposes system functions into components with explicit dependency modeling, enabling realistic reliability assessment for complex CPS applications.},
  results      = {The framework successfully analyzes a smart parking system demonstrating reliability computation across software applications, sensor networks, communication systems, and physical devices. Component-level reliability estimates combine into overall system reliability through functional decomposition. Results show how different failure modes and redundancy strategies affect system-level reliability metrics.},
  project_tags = {CPS, scalable AI}
}
Cyber-physical systems (CPS) are systems with a tight integration between the computational (also referred to as software or cyber) and physical (hardware) components. While the reliability evaluation of physical systems is well-understood and well-studied, reliability evaluation of CPS is difficult because software systems do not degrade and follow a well-defined failure model like physical systems. In this paper, we propose a framework for formulating the CPS reliability evaluation as a dependence problem derived from the software component dependences, functional requirements and physical system dependences. We also consider sensor failures, and propose a method for estimating software failures in terms of associated hardware and software inputs. This framework is codified in a domain-specific modeling language, where every system-level function is mapped to a set of required components using functional decomposition and function-component association; this provides details about operational constraints and dependences. We also illustrate how the encoded information can be used to make reconfiguration decisions at runtime. The proposed methodology is demonstrated using a smart parking system, which provides localization and guidance for parking within indoor environments.
@article{Nannapaneni2016a,
  author       = {Nannapaneni, Saideep and Dubey, Abhishek and Abdelwahed, Sherif and Mahadevan, Sankaran and Neema, Sandeep and Bapty, Ted},
  title        = {Mission-based reliability prediction in component-based systems},
  journal      = {International Journal of Prognostics and Health Management},
  year         = {2016},
  volume       = {7},
  number       = {001},
  contribution = {colab},
  file         = {:Nannapaneni2016a-Mission-based_reliability_prediction_in_component-based_systems.pdf:PDF},
  keywords     = {reliability prediction, mission-based assessment, Bayesian networks, component dependencies, autonomous systems, design trade-offs},
  what         = {This paper develops mission-based reliability prediction for component-based systems using Bayesian networks and formal modeling approaches. The work extracts reliability block diagrams from system models to enable mission-specific reliability assessment. It incorporates failure rate dependencies between components and models operational constraints affecting system reliability during missions.},
  why          = {Systems performing specific missions face varying reliability requirements and operational constraints. This work innovates by enabling mission-specific reliability prediction through formal system models and Bayesian networks, allowing assessment of whether system designs can satisfy mission requirements and supporting design trade-off analysis based on reliability metrics.},
  results      = {The framework successfully demonstrates mission-based reliability assessment for an autonomous vehicle performing surveillance. Bayesian network modeling of component dependencies enables realistic failure probability computation. The approach validates mission feasibility and supports design decisions based on component reliability and mission-specific requirements.},
  project_tags = {CPS, scalable AI}
}
This paper develops a framework for the extraction of a reliability block diagram in component-based systems for reliability prediction with respect to specific missions. A mission is defined as a composition of several high-level functions occurring at different stages and for a specific time during the mission. The high-level functions are decomposed into lower-level functions, which are then mapped to their corresponding components or component assemblies. The reliability block diagram is obtained using functional decomposition and function-component association. Using the reliability block diagram and the reliability information on the components such as failure rates, the reliability of the system carrying out a mission can be estimated. The reliability block diagram is evaluated by converting it into a logic (Boolean) expression. A modeling language created using the Generic Modeling Environment (GME) platform is used, which enables modeling of a system and captures the functional decomposition and function-component association in the system. This framework also allows for real-time monitoring of the system performance where the reliability of the mission can be computed over time as the mission progresses. The uncertainties in the failure rates and operational time of each high-level function are also considered which are quantified through probability distributions using the Bayesian framework. The dependence between failures of components are also considered and are quantified through a Bayesian network (BN). Other quantities of interest such as mission feasibility and function availability can also be assessed using this framework. Mission feasibility analysis determines if the mission can be accomplished given the current state of components in the system, and function availability provides information whether the function will be available in the future given the current state of the system. 
The proposed methodology is demonstrated using a radio-controlled (RC) car to carry out a simple surveillance mission.
@inproceedings{Neema2016,
  author       = {Neema, Himanshu and Emfinger, William and Dubey, Abhishek},
  title        = {A Reusable and Extensible Web-Based Co-Simulation Platform for Transactive Energy Systems},
  booktitle    = {Proceedings of the 3rd International Transactive Energy Systems, Portland, Oregon, USA},
  year         = {2016},
  volume       = {12},
  category     = {workshop},
  contribution = {lead},
  file         = {:Neema2016-A_Reusable_and_Extensible_Web-Based_Co-Simulation_Platform_for_Transactive_Energy_Systems.pdf:PDF},
  tag          = {platform,power},
  keywords     = {transactive energy, co-simulation, web-based platform, energy markets, smart grids, distributed simulation, multi-domain integration},
  what         = {This work presents a reusable web-based co-simulation platform for transactive energy systems. The C2WT-TE platform enables integrated evaluations of transactive energy systems combining power domain models, market dynamics, and regulatory environments. It provides high-level architecture supporting heterogeneous simulators including commercial tools and custom models through standardized interfaces.},
  why          = {Transactive energy systems require complex multi-domain evaluations combining physical grid dynamics, economic markets, and cyber infrastructure. C2WT-TE innovates by providing a cloud-deployed integration platform enabling seamless connection of diverse simulators while maintaining coherent evaluation of coupled system behaviors across multiple domains and timescales.},
  results      = {The platform successfully demonstrates integrated simulation of transactive energy systems combining SUMO traffic simulation, GridLAB-D power system models, and economic market simulators. Experimental evaluation shows feasibility of multi-domain co-simulation for analyzing grid operations with distributed energy resources and market mechanisms. Results validate the approach for complex infrastructure evaluations.},
  project_tags = {energy, middleware}
}
Rapid evolution of energy generation technology and increased use of distributed energy resources (DER) is continually pushing utilities to adapt and evolve business models to align with these changes. Today, more consumers are also producing energy using green generation technologies and energy pricing is becoming rather competitive and transactional, needing utilities to increase flexibility of grid operations and incorporate transactive energy systems (TES). However, a huge bottleneck is to ensure stable grid operations while gaining efficiency. A comprehensive platform is therefore needed for grid-scale multi-aspects integrated evaluations. For instance, cyber-attacks in a road traffic controller’s communication network can subtly divert electric vehicles in a particular area, causing surge in the grid loads due to increased EV charging and people activity, which can potentially disrupt an otherwise robust grid. To evaluate such a scenario, multiple special-purpose simulators (e.g., SUMO, OMNeT++, GridLAB-D, etc.) must be run in an integrated manner. To support this, we are developing a cloud-deployed web- and model-based simulation integration platform that enables integrated evaluations of transactive energy systems and is highly extensible and customizable for utility-specific custom simulation tools.
@inproceedings{Oruganti2016,
  author       = {Oruganti, Aparna and Sun, Fangzhou and Baroud, Hiba and Dubey, Abhishek},
  title        = {DelayRadar: {A} multivariate predictive model for transit systems},
  booktitle    = {2016 {IEEE} International Conference on Big Data, BigData 2016, Washington DC, USA, December 5-8, 2016},
  year         = {2016},
  pages        = {1799--1806},
  doi          = {10.1109/BigData.2016.7840797},
  url          = {https://doi.org/10.1109/BigData.2016.7840797},
  biburl       = {https://dblp.org/rec/bib/conf/bigdataconf/OrugantiSBD16},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  timestamp    = {Wed, 16 Oct 2019 14:14:51 +0200},
  file         = {:Oruganti2016-DelayRadar_A_multivariate_predictive_model_for_transit_systems.pdf:PDF},
  category     = {selectiveconference},
  contribution = {colab},
  project      = {smart-transit,smart-cities},
  tag          = {transit},
  keywords     = {transit systems, delay prediction, machine learning, clustering, weather analysis, real-time prediction, decision support},
  what         = {This paper presents DelayRadar, a multivariate predictive model for transit systems using machine learning to forecast bus arrival times. The approach integrates real-time transit data, static schedules, historical data, and weather information to predict delays. It combines clustering analysis with regression and tree-based models to identify patterns and make accurate predictions.},
  why          = {Transit system delays frustrate riders and reduce adoption of public transportation. DelayRadar innovates by integrating diverse data sources and using multivariate modeling to explain over 70% of delay variance, enabling accurate transit predictions that improve user experience and operational planning. The clustering approach identifies seasonal and time-of-day patterns for more accurate forecasting.},
  results      = {The system achieves 4-5 minute prediction errors with real-time data and 47% improvement when predicting 15-minute future delays. Clustering analysis reveals distinct morning and afternoon delay patterns. Regression and tree-based models outperform simple baselines, demonstrating feasibility of accurate transit delay prediction for operational decision support.},
  project_tags = {transit, planning, scalable AI}
}
Effective public transit operations are one of the fundamental requirements for a modern community. Recently, a number of transit agencies have started integrating automated vehicle locators in their fleet, which provides a real-time estimate of the time of arrival. In this paper, we use the data collected over several months from one such transit system and show how this data can be potentially used to learn long term patterns of travel time. More specifically, we study the effect of weather and other factors such as traffic on the transit system delay. These models can later be used to understand the seasonal variations and to design adaptive and transient transit schedules. Towards this goal, we also propose an online architecture called DelayRadar. The novelty of DelayRadar lies in three aspects: (1) a data store that collects and integrates real-time and static data from multiple data sources, (2) a predictive statistical model that analyzes the data to make predictions on transit travel time, and (3) a decision making framework to develop an optimal transit schedule based on variable forecasts related to traffic, weather, and other impactful factors. This paper focuses on identifying the model with the best predictive accuracy to be used in DelayRadar. According to the preliminary study results, we are able to explain more than 70% of the variance in the bus travel time and we can make future travel predictions with an out-of-sample error of 4.8 minutes with information on the bus schedule, traffic, and weather.
@article{Pradhan2016,
  author       = {Pradhan, Subhav and Dubey, Abhishek and Levendovszky, Tihamer and Kumar, Pranav Srinivas and Emfinger, William and Balasubramanian, Daniel and Otte, William and Karsai, Gabor},
  title        = {Achieving resilience in distributed software systems via self-reconfiguration},
  journal      = {Journal of Systems and Software},
  year         = {2016},
  volume       = {122},
  pages        = {344--363},
  doi          = {10.1016/j.jss.2016.05.038},
  url          = {https://doi.org/10.1016/j.jss.2016.05.038},
  biburl       = {https://dblp.org/rec/bib/journals/jss/PradhanDLKEBOK16},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  timestamp    = {Mon, 06 Nov 2017 00:00:00 +0100},
  file         = {:Pradhan2016-Achieving_resilience_in_distributed_software_systems_via_self-reconfiguration.pdf:PDF},
  contribution = {lead},
  project      = {cps-middleware,cps-reliability},
  tag          = {platform,a14cps},
  keywords     = {resilience, self-reconfiguration, distributed systems, fault tolerance, mobile platforms, design-time analysis, runtime adaptation},
  what         = {This comprehensive work develops resilience mechanisms for distributed cyber-physical systems through runtime self-reconfiguration. The paper presents design-time reliability analysis tools and runtime self-reconfiguration infrastructure enabling autonomous system resilience. It demonstrates application to mobile CPS platforms including fractional satellite clusters requiring dynamic adaptation.},
  why          = {Mobile CPS platforms operate in dynamic, unpredictable environments requiring autonomous adaptation to failures without external intervention. This work innovates by combining design-time analysis tools with runtime reconfiguration infrastructure, enabling distributed systems to autonomously detect failures and recompute optimal configurations maintaining system resilience throughout operations.},
  results      = {The framework successfully demonstrates resilience for fractional satellite systems handling node failures and configuration recomputation. Design-time reliability analysis identifies configuration points enabling efficient runtime adaptation. Experimental results show the system maintains functionality when components fail by transitioning to alternative configurations computed at runtime.},
  project_tags = {CPS, scalable AI}
}
Improvements in mobile networking combined with the ubiquitous availability and adoption of low-cost development boards have enabled the vision of mobile platforms of Cyber-Physical Systems (CPS), such as fractionated spacecraft and UAV swarms. Computation and communication resources, sensors, and actuators that are shared among different applications characterize these systems. The cyber-physical nature of these systems means that physical environments can affect both the resource availability and software applications that depend on resource availability. While many application development and management challenges associated with such systems have been described in existing literature, resilient operation and execution have received less attention. This paper describes our work on improving runtime support for resilience in mobile CPS, with a special focus on our runtime infrastructure that provides autonomous resilience via self-reconfiguration. We also describe the interplay between this runtime infrastructure and our design-time tools, as the latter is used to statically determine the resilience properties of the former. Finally, we present a use case study to demonstrate and evaluate our design-time resilience analysis and runtime self-reconfiguration infrastructure.
@inproceedings{Pradhan2016a,
  author       = {Pradhan, Subhav and Dubey, Abhishek and Khare, Shweta and Sun, Fangzhou and Sallai, J{\'{a}}nos and Gokhale, Aniruddha S. and Schmidt, Douglas C. and Lehofer, Martin and Sturm, Monika},
  title        = {Poster Abstract: {A} Distributed and Resilient Platform for City-Scale Smart Systems},
  booktitle    = {{IEEE/ACM} Symposium on Edge Computing, {SEC} 2016, Washington, DC, USA, October 27-28, 2016},
  year         = {2016},
  pages        = {99--100},
  doi          = {10.1109/SEC.2016.28},
  url          = {https://doi.org/10.1109/SEC.2016.28},
  biburl       = {https://dblp.org/rec/bib/conf/edge/PradhanDKSSGSLS16},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  timestamp    = {Wed, 16 Oct 2019 14:14:56 +0200},
  file         = {:Pradhan2016a-Poster_Abstract_A_Distributed_and_Resilient_Platform_for_City-Scale_Smart_Systems.pdf:PDF},
  category     = {poster},
  contribution = {lead},
  project      = {cps-middleware,smart-cities},
  tag          = {platform},
  keywords     = {smart cities, resilience, distributed systems, goal-based models, hierarchical architecture, self-reconfiguration},
  what         = {This brief presents a distributed and resilient platform for city-scale smart systems. The work extends CHARIOT with resilience zone mechanisms enabling hierarchical recovery from failures. The platform supports component-based application deployment with goal-based system descriptions for adaptive reconfiguration in response to failures.},
  why          = {City-scale smart systems span multiple resource domains and administrative boundaries requiring hierarchical resilience approaches. This work innovates by introducing resilience zones with local recovery capability, enabling smart city systems to maintain functionality despite failures while managing resources across different availability levels.},
  results      = {The system demonstrates CHARIOT's capability to model city-scale smart systems with resilience zone architecture. Goal-based descriptions enable automatic reconfiguration when failures occur. The approach validates feasibility of maintaining system resilience across distributed infrastructure without centralized control.},
  project_tags = {CPS, middleware, scalable AI}
}
The advent of the Internet of Things (IoT) is driving several technological trends. The first trend is an increased level of integration between edge devices and commodity computers. This trend, in conjunction with low power-devices, energy harvesting, and improved battery technology, is enabling the next generation of information technology (IT) innovation: city-scale smart systems. These types of IoT systems can operate at multiple time-scales, ranging from closed-loop control requiring strict real-time decision and actuation to near real-time operation with humans-in-the-loop, as well as to long-term analysis, planning, and decision-making.
@inproceedings{Pradhan2016b,
author = {Pradhan, Subhav and Dubey, Abhishek and Gokhale, Aniruddha S.},
title = {WiP Abstract: Platform for Designing and Managing Resilient and Extensible {CPS}},
booktitle = {7th {ACM/IEEE} International Conference on Cyber-Physical Systems, {ICCPS} 2016, Vienna, Austria, April 11-14, 2016},
year = {2016},
pages = {39:1},
doi = {10.1109/ICCPS.2016.7479128},
url = {https://doi.org/10.1109/ICCPS.2016.7479128},
category = {poster},
contribution = {lead},
project = {cps-reliability,cps-middleware},
tag = {platform},
keywords = {cyber-physical systems, resilience, extensibility, self-reconfiguration, design-time analysis, goal-based models},
file = {:Pradhan2016b-WiP_Abstract_Platform_for_Designing_and_Managing_Resilient_and_Extensible_CPS.pdf:PDF},
biburl = {https://dblp.org/rec/bib/conf/iccps/PradhanDG16},
bibsource = {dblp computer science bibliography, https://dblp.org},
timestamp = {Wed, 16 Oct 2019 14:14:57 +0200},
what = {This work addresses platform design for resilient and extensible cyber-physical systems through design-time and runtime mechanisms. It presents CHARIOT, a platform with goal-based system descriptions supporting extensible CPS with autonomous resilience capabilities. The work emphasizes resilience as a key requirement for systems handling multiple failure modes.},
why = {Extensible CPS must maintain resilience despite changing resources, applications, and failure modes. This work innovates by providing a unified platform approach supporting both design-time analysis of system resilience and runtime reconfiguration mechanisms enabling autonomous adaptation to failures without external intervention.},
results = {The CHARIOT platform demonstrates successful modeling and management of resilient CPS applications through goal-based descriptions and self-reconfiguration. Experimental validation shows runtime configuration point computation enabling system adaptation to failures. The approach validates autonomous resilience capability for distributed CPS applications.},
project_tags = {CPS, middleware}
}
Extensible Cyber-Physical Systems (CPS) are loosely connected, multi-domain platforms that "virtualize" their resources to provide an open platform capable of hosting different cyber-physical applications. These cyber-physical platforms are extensible since resources and applications can be added or removed at any time. However, realizing such a platform requires resolving challenges emanating from different properties; for this paper, we focus on resilience. Resilience is important for extensible CPS to make sure that extensibility of a system does not result in failures and anomalies.
@inproceedings{Pradhan2016c,
author = {Pradhan, Subhav and Dubey, Abhishek and Gokhale, Aniruddha S.},
pages = {88--104},
title = {Designing a Resilient Deployment and Reconfiguration Infrastructure for Remotely Managed Cyber-Physical Systems},
year = {2016},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/serene/PradhanDG16},
booktitle = {Software Engineering for Resilient Systems - 8th International Workshop, {SERENE} 2016, Gothenburg, Sweden, September 5-6, 2016, Proceedings},
contribution = {lead},
doi = {10.1007/978-3-319-45892-2_7},
file = {:Pradhan2016c-Designing_a_Resilient_Deployment_and_Reconfiguration_Infrastructure_for_Remotely_Managed_CPS.pdf:PDF},
keywords = {deployment, reconfiguration, resilience, distributed systems, unmanned systems, component-based software, fault management},
project = {cps-reliability,cps-middleware},
tag = {platform},
timestamp = {Tue, 14 May 2019 10:00:48 +0200},
url = {https://doi.org/10.1007/978-3-319-45892-2_7},
what = {This paper presents a design and development framework for resilient deployment and reconfiguration infrastructure in distributed CPS. It addresses challenges in achieving autonomous resilience for component-based applications through deployment and reconfiguration mechanisms. The work focuses on managing lifecycle and handling failures in multi-module systems.},
why = {Deploying and managing resilient distributed CPS requires systematic infrastructure supporting dynamic application reconfiguration. This work innovates by extending deployment standards to support resilient application management through component-based software engineering approaches enabling efficient deployment and recovery from failures.},
results = {The framework successfully demonstrates deployment and reconfiguration infrastructure for multi-module CPS including unmanned aerial vehicle systems. Experimental results show feasibility of autonomous reconfiguration responding to component failures. The work validates systematic deployment and recovery mechanisms for resilient distributed CPS.},
project_tags = {CPS, middleware}
}
Multi-module Cyber-Physical Systems (CPS), such as satellite clusters, swarms of Unmanned Aerial Vehicles (UAV), and fleets of Unmanned Underwater Vehicles (UUV) provide a CPS cluster-as-a-service for CPS applications. The distributed and remote nature of these systems often necessitates the use of Deployment and Configuration (D&C) services to manage the lifecycle of these applications. Fluctuating resources, volatile cluster membership and changing environmental conditions necessitate resilience. Thus, the D&C infrastructure does not only have to undertake basic management actions, such as activation of new applications and deactivation of existing applications, but also has to autonomously reconfigure existing applications to mitigate failures including D&C infrastructure failures. This paper describes the design and architectural considerations to realize such a D&C infrastructure for component-based distributed systems. Experimental results demonstrating the autonomous resilience capabilities are presented.
@incollection{Shekhar2016,
author = {Shekhar, Shashank and Sun, Fangzhou and Dubey, Abhishek and Gokhale, Aniruddha and Neema, Himanshu and Lehofer, Martin and Freudberg, Dan},
title = {A Smart Decision Support System for Public Transit Operations},
year = {2016},
booktitle = {Internet of Things and Data Analytics Handbook},
contribution = {lead},
file = {:Shekhar2016-Transit_Hub_A_Smart_Decision_Support_System_for_Public_Transit_Operations.pdf:PDF},
keywords = {transit systems, decision support, real-time data, predictive analytics, vehicle tracking, operational intelligence, smart cities},
tag = {transit},
what = {This work presents Transit Hub, a comprehensive smart decision support system for public transit operations integrating real-time data, predictive analytics, and operational decision support. The system combines automated vehicle location data, multiple data sources, and historical analysis to provide transit agencies with actionable intelligence for improving service quality and efficiency.},
why = {Public transit agencies struggle with traffic congestion and unpredictable service due to lack of integrated decision support. Transit Hub innovates by combining real-time tracking, data analytics, and predictive models to enable informed operational decisions improving service reliability and user experience. The integrated approach addresses transit system bottlenecks through data-driven insights.},
results = {The system successfully demonstrates integration of AVL data, weather information, and historical schedules providing actionable transit insights. Predictive models identify bottleneck routes and time periods for operational improvements. Real-time decision support enables dynamic transit schedule adjustments improving service quality and operational efficiency.},
project_tags = {transit, planning, scalable AI}
}
@inproceedings{Sun2016,
author = {Sun, Fangzhou and Pan, Yao and White, Jules and Dubey, Abhishek},
title = {Real-Time and Predictive Analytics for Smart Public Transportation Decision Support System},
booktitle = {2016 {IEEE} International Conference on Smart Computing, {SMARTCOMP} 2016, St Louis, MO, USA, May 18-20, 2016},
year = {2016},
pages = {1--8},
doi = {10.1109/SMARTCOMP.2016.7501714},
url = {https://doi.org/10.1109/SMARTCOMP.2016.7501714},
acceptance = {34},
category = {selectiveconference},
contribution = {lead},
project = {smart-transit,smart-cities},
tag = {transit},
keywords = {transit systems, predictive analytics, Kalman filtering, clustering, real-time prediction, decision support, transportation planning},
file = {:Sun2016-Real-Time_and_Predictive_Analytics_for_Smart_Public_Transportation_Decision_Support_System.pdf:PDF},
biburl = {https://dblp.org/rec/bib/conf/smartcomp/SunPWD16},
bibsource = {dblp computer science bibliography, https://dblp.org},
timestamp = {Wed, 16 Oct 2019 14:14:54 +0200},
what = {This paper presents real-time and predictive analytics for smart public transportation decision support systems. The work develops clustering models and predictive approaches for analyzing historical bus delay patterns and transit performance. It combines real-time vehicle scheduling with Kalman filtering for accurate arrival time predictions.},
why = {Accurate transit predictions require understanding complex delay patterns influenced by traffic, weather, and time-of-day factors. This work innovates by combining clustering analysis revealing distinct delay patterns with predictive models achieving 25% improvement in average arrival time prediction, enabling better user information and operational planning.},
results = {The system achieves accurate transit arrival predictions with 25% error reduction using clustering-based models. Real-time Kalman filtering integrates schedule adherence and actual vehicle locations. Experimental validation using Nashville transit data demonstrates 47% improvement when predicting next 15-minute arrivals with integrated real-time information.},
project_tags = {transit, planning, scalable AI}
}
Public bus transit plays an important role in city transportation infrastructure. However, public bus transit is often difficult to use because of lack of real-time information about bus locations and delay time, which in the presence of operational delays and service alerts makes it difficult for riders to predict when buses will arrive and plan trips. Precisely tracking vehicles and informing riders of estimated times of arrival is challenging due to a number of factors, such as traffic congestion, operational delays, and varying times taken to load passengers at each stop. In this paper, we introduce a public transportation decision support system for both short-term as well as long-term prediction of bus arrival times. The system uses streaming real-time bus position data, which is updated once every minute, and historical arrival and departure data - available for select stops - to predict bus arrival times. Our approach combines clustering analysis and Kalman filters with a shared route segment model in order to produce more accurate arrival time predictions. Experiments show that compared to the basic arrival time prediction model that is currently being used by the city, our system reduces arrival time prediction errors by 25 percent on average when predicting the arrival delay an hour ahead and 47 percent when predicting within a 15 minute future time window.
@inproceedings{Jain2015,
author = {Jain, R. and Lukic, S. M. and Chhokra, A. and Mahadevan, N. and Dubey, Abhishek and Karsai, G.},
booktitle = {2015 North American Power Symposium (NAPS)},
title = {An improved distance relay model with directional element, and memory polarization for TCD based fault propagation studies},
year = {2015},
month = oct,
pages = {1--6},
category = {selectiveconference},
contribution = {minor},
doi = {10.1109/NAPS.2015.7335206},
file = {:Jain2015-An_improved_distance_relay_model_with_directional_element_and_memory_polarization_for_TCD_based_fault_propagation_studies.pdf:PDF},
keywords = {power systems, distance relay, temporal causal diagrams, fault detection, mho elements, memory polarization, directional protection},
tag = {power},
what = {This work presents an improved distance relay model incorporating directional elements and memory polarization techniques for power system fault detection. The approach integrates Temporal Causal Diagram (TCD) analysis with OpenDSS simulation to model fault propagation behavior in electrical grids. The relay implementation combines advanced algorithms for analyzing mho elements, directional characteristics, and memory effects to enhance detection accuracy. Testing demonstrates the relay's ability to identify various fault types while minimizing false positives through dynamic impedance calculations.},
why = {Distance relays are critical protection components in power systems, yet conventional approaches often produce misoperations due to inaccurate impedance calculations and failure to capture cascading effects. This work is innovative because it introduces memory polarization and directional analysis simultaneously, improving relay selectivity and security. The integration of TCD models enables the reasoner to understand system-wide fault dynamics, advancing the state of practical protection device design.},
results = {The relay model successfully detects forward and reverse faults with appropriate directional discrimination based on impedance calculations. Testing on a three-bus power system shows the relay can identify zone reaches accurately and respond with correct selectivity for different fault types. The memory-polarized approach reduces false tripping events and enables the relay to distinguish between legitimate and spurious faults through analysis of fault location and impedance values.},
project_tags = {energy, planning}
}
Modern Power Systems have evolved into a very complex network of multiple sources, lines, breakers, loads and others. The performance of these interdependent components decides the reliability of the power systems. A tool called "Reasoner" is being developed to deduce fault propagations using a Temporal Causal Diagram (TCD) approach. It translates the physical system into a cause-effect model. This work discusses the development of an advanced distance relay model, which monitors the system, and challenges the operation of the reasoner for refinement. The process of generating a Fault and Discrepancy Mapping file from the test system is presented. This file is used by the reasoner to scrutinize relays' responses for active system faults, and hypothesize potential mis-operations (or cyber faults) with a confidence metric. The analyzer (relay model) is integrated with OpenDSS for fault analysis. The understanding of the system interdependency (fault propagation behavior) using the reasoner can make the grid more robust against cascaded failures.
@inproceedings{Chhokra2015,
author = {Chhokra, A. and Abdelwahed, S. and Dubey, Abhishek and Neema, S. and Karsai, G.},
booktitle = {2015 Electronic System Level Synthesis Conference (ESLsyn)},
title = {From system modeling to formal verification},
year = {2015},
month = jun,
pages = {41--46},
category = {conference},
contribution = {minor},
file = {:Chhokra2015-From_system_modeling_to_formal_verification.pdf:PDF},
issn = {2117-4628},
keywords = {SystemC, formal verification, code generation, GME, model-based design, embedded systems, cyber-physical systems},
tag = {platform},
what = {This paper addresses the challenge of converting SystemC designs into formal verification models through a Generic Modeling Environment (GME)-based approach. The work develops an automated toolchain that translates SystemC specifications into intermediate representations and then to formal languages like Uppaal. The methodology supports both hardware and software aspects of cyber-physical systems, enabling designers to graphically model systems and automatically generate simulation code.},
why = {SystemC designs lack formal semantics, making it difficult to perform automated verification and ensuring correctness of complex embedded systems. This work is innovative because it bridges the gap between practical SystemC development and formal methods by providing automatic code generation and verification through GME. This capability enables early detection of design errors and improves system reliability before hardware development.},
results = {The approach successfully translates SystemC designs containing processes, ports, and state machines into Uppaal timed automata models. The tool generates deployable code from verified models and demonstrates the translation with a power system case study. Results show that the automated verification framework can identify behavioral issues and assist in design refinement.},
project_tags = {CPS, middleware, Explainable AI}
}
Due to increasing design complexity, modern systems are modeled at a high level of abstraction. SystemC is widely accepted as a system level language for modeling complex embedded systems. Verification of these SystemC designs nullifies the chances of error propagation down to the hardware. Due to lack of formal semantics of SystemC, the verification of such designs is done mostly in an unsystematic manner. This paper provides a new modeling environment that enables the designer to simulate and formally verify the designs by generating SystemC code. The generated SystemC code is automatically translated to timed automata for formal analysis.
@inproceedings{Chhokra2015a,
author = {Chhokra, A. and Dubey, Abhishek and Mahadevan, N. and Karsai, G.},
booktitle = {2015 Workshop on Modeling and Simulation of Cyber-Physical Energy Systems (MSCPES)},
title = {A component-based approach for modeling failure propagations in power systems},
year = {2015},
month = apr,
pages = {1--6},
category = {workshop},
contribution = {colab},
doi = {10.1109/MSCPES.2015.7115412},
file = {:Chhokra2015a-A_component-based_approach_for_modeling_failure_propagations_in_power_systems.pdf:PDF},
keywords = {cyber-physical systems, failure propagation, temporal causal diagrams, model generation, power systems, protection devices},
tag = {platform,power},
what = {This work presents a component-based modeling approach for capturing failure propagation dynamics in power systems using an extensible domain-specific language and Generic Modeling Environment. The methodology enables automatic generation of Temporal Causal Diagram (TCD) models from high-level system descriptions. The approach decomposes complex systems into plant nodes, interface nodes, and protection elements, then traces fault effects through hierarchical models.},
why = {Understanding failure propagation in power systems requires sophisticated modeling that captures both cyber and physical dimensions. This work is innovative because it provides a systematic methodology for generating TCD models automatically from component specifications, eliminating manual model construction. The approach enables system designers to reason about complex failure scenarios and their cascading effects across subsystems.},
results = {The methodology successfully generates TCD models from power system components including transmission lines, protective relays, and circuit breakers. Testing on a three-bus system demonstrates the framework's ability to model fault propagation paths and identify failure modes. The generated models enable simulation of multiple fault scenarios and analysis of protection system response.},
project_tags = {CPS, energy, emergency, planning}
}
Resiliency and reliability are of paramount importance for energy cyber physical systems. Electrical protection systems including detection elements such as Distance Relays and actuation elements such as Breakers are designed to protect the system from abnormal operations and arrest failure propagation by rapidly isolating the faulty components. However, failure in the protection devices themselves can and do lead to major system events and fault cascades, often leading to blackouts. This paper augments our past work on Temporal Causal Diagrams (TCD), a modeling formalism designed to help reason about the failure progressions by (a) describing a way to generate the TCD model from the system specification, and (b) understand the system failure dynamics for TCD reasoners by configuring simulation models.
@techreport{Pradhan2015a,
author = {Pradhan, Subhav and Dubey, Abhishek and Otte, William R and Karsai, Gabor and Gokhale, Aniruddha},
institution = {Institute for Software Integrated Systems, Vanderbilt University},
title = {Towards a Product Line of Heterogeneous Distributed Applications},
year = {2015},
address = {Nashville},
month = apr,
number = {ISIS-15-117},
type = {Technical Report},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/TechReport2013.pdf},
contribution = {minor},
file = {:Pradhan2015a-Towards_a_product_line_of_heterogeneous_distributed_applications.pdf:PDF},
keywords = {software product lines, distributed systems, feature models, configuration management, heterogeneous applications},
owner = {abhishek},
timestamp = {2015.10.16},
url = {http://www.isis.vanderbilt.edu/sites/default/files/Pradhan_SEAMS_TechReport.pdf},
what = {This technical report describes UMRELA, a conceptual feature model for distributed applications that enables product line engineering of heterogeneous systems. The work presents a systematic approach to capturing variability and commonality across distributed applications through universal feature modeling. The methodology includes configuration management framework (AMF) and application management tools for managing dynamic deployment and reconfiguration of distributed systems.},
why = {Next-generation distributed systems require flexible management of heterogeneous components from multiple vendors operating in dynamic environments. This work is innovative because it extends traditional software product line concepts to distributed systems where components can dynamically join and leave. The feature model enables capturing domain-specific requirements while supporting runtime adaptation.},
results = {UMRELA successfully represents feature models for distributed applications with varying unit of composition characteristics. The configuration management framework enables specification of initial configuration points and calculation of new configurations at runtime. Experimental results demonstrate the feasibility of managing distributed application variability through feature model-based abstractions.},
project_tags = {middleware, scalable AI}
}
Next generation large-scale distributed systems – such as smart cities – are dynamic, heterogeneous and multi-domain in nature. The same is true for applications hosted on these systems. Application heterogeneity stems from their Unit of Composition (UoC); some applications might be coarse-grained and composed from processes or actors, whereas others might be fine-grained and composed from software components. Software components can further amplify heterogeneity since there exists different component models for different domains. Lifecycle management of such distributed, heterogeneous applications is a considerable challenge. In this paper, we solve this problem by reasoning about these systems as a Software Product Line (SPL) where individual dimensions of heterogeneity can be considered as product variants. To enable such reasoning, first, we present UMRELA (Universal feature-Model for distRibutEd appLicAtions), a conceptual feature model that identifies commonalities and variability points for capturing and representing distributed applications and their target system. This results in a product line of a family of distributed applications. UMRELA facilitates representation of initial configuration point, and the configuration space of the system. The latter represents all possible states the system can reach and is used as an implicit encoding to calculate new configuration points at runtime. Second, we present a prototype Application Management Framework (AMF) as a proof of concept configuration management tool that uses UMRELA to manage heterogeneous distributed applications.
@article{Balasubramanian2015,
author = {Balasubramanian, Daniel and Dubey, Abhishek and Otte, William and Levendovszky, Tihamer and Gokhale, Aniruddha S. and Kumar, Pranav Srinivas and Emfinger, William and Karsai, Gabor},
journal = {Science of Computer Programming},
title = {{DREMS} {ML}: A wide spectrum architecture design language for distributed computing platforms},
year = {2015},
pages = {3--29},
volume = {106},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/scp/Balasubramanian15},
contribution = {colab},
doi = {10.1016/j.scico.2015.04.002},
file = {:Balasubramanian2015-DREMS_ML_A_wide_spectrum_architecture_design_language_for_distributed_computing_platforms.pdf:PDF},
keywords = {architecture description language, distributed systems, real-time embedded systems, model-driven development, scheduling analysis},
project = {cps-middleware},
tag = {platform},
timestamp = {Sat, 27 May 2017 01:00:00 +0200},
url = {https://doi.org/10.1016/j.scico.2015.04.002},
what = {This paper presents DREMS ML, a wide-spectrum architecture description language designed for distributed real-time embedded systems. The language integrates support for component abstractions, scheduling properties, and network characteristics through domain-specific modeling concepts. The work addresses multi-paradigm challenges in describing distributed applications where elements at different abstraction levels must coexist.},
why = {Developing distributed real-time embedded systems requires integrated modeling of application structure, scheduling, and network characteristics, but existing languages provide incomplete support. DREMS ML is innovative because it provides a unified modeling environment that addresses inherent challenges in specifying properties spanning multiple concerns including security, timing, and resource allocation. The language supports correct-by-construction development through automated analysis.},
results = {DREMS ML successfully models distributed real-time applications including scheduling analysis with CPN, network bandwidth analysis, and security properties. The language enables automatic generation of deployment configuration files and analysis of system properties. Case studies demonstrate the language's ability to capture complex application architectures while supporting design-time verification.},
project_tags = {CPS, middleware, Explainable AI}
}
Complex sensing, processing and control applications running on distributed platforms are difficult to design, develop, analyze, integrate, deploy and operate, especially if resource constraints, fault tolerance and security issues are to be addressed. While technology exists today for engineering distributed, real-time component-based applications, many problems remain unsolved by existing tools. Model-driven development techniques are powerful, but there are very few existing and complete tool chains that offer an end-to-end solution to developers, from design to deployment. There is a need for an integrated model-driven development environment that addresses all phases of application lifecycle including design, development, verification, analysis, integration, deployment, operation and maintenance, with supporting automation in every phase. Arguably, a centerpiece of such a model-driven environment is the modeling language. To that end, this paper presents a wide-spectrum architecture design language called DREMS ML that itself is an integrated collection of individual domain-specific sub-languages. We claim that the language promotes “correct-by-construction” software development and integration by supporting each individual phase of the application lifecycle. Using a case study, we demonstrate how the design of DREMS ML impacts the development of embedded systems.
@inproceedings{Dubey2015,
author = {Dubey, Abhishek and Sturm, Monika and Lehofer, Martin and Sztipanovits, Janos},
title = {Smart City Hubs: Opportunities for Integrating and Studying Human CPS at Scale},
booktitle = {Workshop on Big Data Analytics in CPS: Enabling the Move from IoT to Real-Time Control},
year = {2015},
url = {http://www.isis.vanderbilt.edu/sites/default/files/extendedAbstract.pdf},
category = {workshop},
contribution = {lead},
tag = {transit},
keywords = {smart cities, human-cyber-physical systems, decision support, urban systems, sensor networks, information integration},
file = {:Dubey2015-Smart_city_hubs_Opportunities_for_integrating_and_studying_human_cps_at_scale.pdf:PDF},
what = {This position paper explores opportunities for integrating human-cyber-physical systems at scale, focusing on smart city applications. The work proposes smart city hub frameworks that enable interaction between human operators and cyber-physical infrastructure through decision support systems. The paper discusses challenges of collecting sensor data, managing heterogeneous networks, and designing interfaces that support human decision-making in large-scale CPS.},
why = {Smart cities represent a new frontier in cyber-physical systems where human factors become critical alongside technical concerns. This work is innovative because it explicitly addresses the challenge of integrating human decision-making with CPS through city hub frameworks that bridge sensing, computing, and human interfaces. The approach recognizes that effective smart city deployment requires understanding human behavior and preferences alongside technical system design.},
results = {The paper presents a conceptual framework for smart city hubs supporting integration of transportation, emergency management, and public safety systems. The approach demonstrates feasibility through a Nashville metropolitan transportation scenario where city hubs facilitate information sharing and decision support. Results show potential for improving urban system efficiency through integrated human-CPS interaction.},
project_tags = {CPS, emergency, transit, planning}
}
@article{Mahadevan2015,
author = {Mahadevan, Nagabhushan and Dubey, Abhishek and Chhokra, Ajay and Guo, Huangcheng and Karsai, Gabor},
journal = {{IEEE} Instrumentation \& Measurement Magazine},
title = {Using temporal causal models to isolate failures in power system protection devices},
year = {2015},
number = {4},
pages = {28--39},
volume = {18},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/imm/MahadevanDCGK15},
contribution = {lead},
doi = {10.1109/MIM.2015.7155770},
file = {:Mahadevan2015-Using_temporal_causal_models_to_isolate_failures_in_power_system_protection_devices.pdf:PDF},
keywords = {fault diagnosis, temporal causal diagrams, power systems, protection devices, timed discrete event systems, failure propagation},
project = {cps-reliability,smart-energy},
tag = {platform,power},
timestamp = {Sun, 28 May 2017 01:00:00 +0200},
url = {https://doi.org/10.1109/MIM.2015.7155770},
what = {This work presents a Temporal Causal Diagram (TCD) approach for diagnosing failures in power system protection devices. The methodology models fault propagation as TFPG models integrated with Timed Discrete Event Systems (TDES) to capture temporal relationships between failure modes and their observable effects. The approach includes a TCD reasoning algorithm that generates hypotheses about system failures based on observed anomalies.},
why = {Automated diagnosis of power system failures requires understanding complex causal relationships between protection device misoperations and system-level effects. This work is innovative because it combines temporal fault propagation graphs with discrete event system models to enable reasoning about both fault causes and timing constraints. The TCD reasoning algorithm provides systematic hypothesis generation for improving system diagnostics.},
results = {The TCD approach successfully diagnoses failures in a three-bus power system protected by distance relays and circuit breakers. Testing demonstrates the methodology's ability to identify fault propagation paths and distinguish between multiple possible failure scenarios. The reasoning algorithm generates plausible hypotheses ranked by consistency with observed system behavior.},
project_tags = {energy, emergency, planning}
}
We introduced the modeling paradigm of Temporal Causal Diagrams (TCD) in this paper. TCDs capture fault propagation and behavior (nominal and faulty) of system components. An example model for the power transmission systems was also described. This TCD model was then used to develop an executable simulation model in Simulink/ Stateflow. Though this translation of TCD to an executable model is currently done manually, we are developing model templates and tools to automate this process. Simulations results (i.e., event traces) for a couple of single and multi-fault scenarios were also presented. As part of our future work, we wish to test and study the scalability of this approach towards a larger power transmission system taking into account a far richer set of protection elements. Further, we wish to consider more realistic event traces from the fault scenarios including missing, inconsistent and out-of-sequence alarms and events.
@inproceedings{Pradhan2015,
author = {Pradhan, Subhav M. and Dubey, Abhishek and Gokhale, Aniruddha S. and Lehofer, Martin},
booktitle = {Proceedings of the Workshop on Domain-Specific Modeling, DSM@SPLASH 2015, Pittsburgh, PA, USA, October 27, 2015},
title = {{CHARIOT:} a domain specific language for extensible cyber-physical systems},
year = {2015},
pages = {9--16},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/oopsla/PradhanDGL15},
category = {workshop},
contribution = {lead},
doi = {10.1145/2846696.2846708},
file = {:Pradhan2015-CHARIOT_a_domain_specific_language_for_extensible_cyber-physical_systems.pdf:PDF},
keywords = {domain-specific language, cyber-physical systems, extensible systems, resilience, deployment configuration},
project = {cps-middleware,cps-reliability},
tag = {platform},
timestamp = {Tue, 06 Nov 2018 16:57:16 +0100},
url = {https://doi.org/10.1145/2846696.2846708},
what = {This paper presents CHARIOT, a domain-specific language for extensible cyber-physical systems that addresses design-time heterogeneity and resilience challenges. The language enables specification of system objectives, configurations, and deployment constraints through high-level abstractions while remaining agnostic to the heterogeneity of underlying communication middleware.},
why = {Extensible CPS such as fractionated spacecraft, smart city computing architectures, and UAV clusters require design approaches that support resilience without binding the design to a specific middleware. CHARIOT is innovative because it provides unified abstractions for specifying dynamism, extensibility, and resilience requirements while remaining agnostic to middleware heterogeneities. The approach enables designers to address resilience concerns at design time.},
results = {CHARIOT successfully captures the properties of extensible CPS, including dynamism, extensibility, remote deployment, security, heterogeneity, and resilience. The language supports design of resilient systems independent of the chosen communication middleware. A fractionated spacecraft platform is used as a case study to demonstrate the solution.},
project_tags = {CPS, middleware, scalable AI}
}
Wider adoption, availability and ubiquity of wireless networking technologies, integrated sensors, actuators, and edge computing devices is facilitating a paradigm shift by allowing us to transition from traditional statically configured vertical silos of Cyber-Physical Systems (CPS) to next generation CPS that are more open, dynamic and extensible. Fractionated spacecraft, smart cities computing architectures, Unmanned Aerial Vehicle (UAV) clusters, platoon of vehicles on highways are all examples of extensible CPS wherein extensibility is implied by the dynamic aggregation of physical resources, effect of physical dynamics on availability of computing resources, and various multi-domain applications hosted on these systems. However, realization of extensible CPS requires resolving design-time and run-time challenges emanating from properties specific to these systems. In this paper, we first describe different properties of extensible CPS - dynamism, extensibility, remote deployment, security, heterogeneity and resilience. Then we identify different design-time challenges stemming from heterogeneity and resilience requirements. We particularly focus on software heterogeneity arising from availability of various communication middleware. We then present appropriate solutions in the context of a novel domain specific language, which can be used to design resilient systems while remaining agnostic to middleware heterogeneities. We also describe how this language and its features have evolved from our past work. We use a platform of fractionated spacecraft to describe our solution.
@inproceedings{Nannapaneni2014,
  author       = {Nannapaneni, Saideep and Dubey, Abhishek and Abdelwahed, Sherif and Mahadevan, Sankaran and Neema, Sandeep},
  title        = {A Model-Based Approach for Reliability Assessment in Component-Based Systems},
  booktitle    = {PHM 2014 - Proceedings of the Annual Conference of the Prognostics and Health Management Society 2014},
  year         = {2014},
  month        = oct,
  category     = {conference},
  contribution = {colab},
  file         = {:Nannapaneni2014-A_Model-based_approach_for_reliability_assessment_in_component_based_systems.pdf:PDF},
  keywords     = {reliability assessment, component-based systems, functional decomposition, mission reliability, design trade-offs},
  tag          = {platform},
  what         = {This paper presents a model-based approach for reliability assessment in component-based systems using mission-level functional decomposition. The methodology maps system components to high-level functions and derives reliability block diagrams for computing mission reliability. The approach enables trade-off analysis between design alternatives based on cost, performance, and reliability metrics.},
  why          = {Reliability assessment for complex systems requires understanding how component failures affect mission-level objectives. This work is innovative because it provides a systematic methodology for extracting mission-critical functions from system designs and computing reliability considering both component failures and design alternatives. The approach supports informed design decisions balancing multiple objectives.},
  results      = {The methodology successfully computes mission reliability for component-based systems including automobiles and aircraft. Reliability block diagrams derived from functional decomposition enable analysis of component failure impacts. Results demonstrate trade-offs between system cost, performance, and reliability supporting design selection.},
  project_tags = {CPS, planning}
}
This paper describes a formal framework for reliability assessment of component-based systems with respect to specific missions. A mission comprises different timed mission stages, with each stage requiring a number of high-level functions. The work presented here describes a modeling language to capture the functional decomposition and missions of a system. The components and their alternatives are mapped to basic functions which are used to implement the system-level functions. Our contribution is the extraction of mission-specific reliability block diagram from these high-level models of component assemblies. This is then used to compute the mission reliability using reliability information of components. This framework can be used for real-time monitoring of system performance where reliability of the mission is computed over time as the mission is in progress. Other quantities of interest such as mission feasibility, function availability can also be computed using this framework. Mission feasibility answers the question whether the mission can be accomplished given the current state of components in the system and function availability provides information if the function is available in the future given the current state of the system. The software used in this framework includes Generic Modeling Environment (GME) and Python. GME is used for modeling the system and Python for reliability computations. The proposed methodology is demonstrated using a radio-controlled (RC) car in carrying out a simple surveillance mission.
@inproceedings{Pradhan2014,
author = {Pradhan, Subhav and Emfinger, William and Dubey, Abhishek and Otte, William R. and Balasubramanian, Daniel and Gokhale, Aniruddha and Karsai, Gabor and Coglio, Alessandro},
booktitle = {2014 IEEE International Conference on Space Mission Challenges for Information Technology},
title = {Establishing Secure Interactions across Distributed Applications in Satellite Clusters},
year = {2014},
month = sep,
pages = {67--74},
category = {conference},
contribution = {lead},
doi = {10.1109/SMC-IT.2014.17},
file = {:Pradhan2014-Establishing_Secure_Interactions_across_Distributed_Applications_in_Satellite_Clusters.pdf:PDF},
keywords = {distributed systems, security, multi-level security, publish-subscribe, satellite systems, information flow control},
tag = {platform},
what = {This technical report addresses challenges of establishing secure interactions across distributed applications in satellite clusters. The work presents Secure Transport (ST) mechanism using Multi-Level Security (MLS) policies to enforce information partitioning between applications with different security classifications. The approach extends OpenDDS middleware to support secure publish-subscribe interactions.},
why = {Satellite clusters require secure sharing of computing resources among applications with different security requirements while preventing unauthorized information flow. This work is innovative because it adapts multi-level security concepts to the publish-subscribe communication paradigm, enabling secure interactions without requiring applications to understand low-level security mechanics. The approach maintains system openness while enforcing strict information partitioning.},
results = {Secure Transport successfully enforces MLS policies in a satellite cluster environment enabling secure communication between applications with different security labels. The mechanism prevents unauthorized information flows while permitting legitimate communication between applications with compatible security labels. Testing demonstrates feasibility of secure interactions without manual security policy management.},
project_tags = {CPS, middleware}
}
Recent developments in small satellites have led to an increasing interest in building satellite clusters as open systems that provide a "cluster-as-a-service" in space. Since applications with different security classification levels must be supported in these open systems, the system must provide strict information partitioning such that only applications with matching security classifications interact with each other. The anonymous publish/subscribe communication pattern is a powerful interaction abstraction that has enjoyed great success in previous space software architectures, such as NASA’s Core Flight Executive. However, the difficulty is that existing solutions that support anonymous publish/subscribe communication, such as the OMG Data Distribution Service (DDS), do not support information partitioning based on security classifications, which is a key requirement for some systems. This paper makes two contributions to address these limitations. First, we present a transport mechanism called Secure Transport that uses a lattice of labels to represent security classifications and enforces Multi-Level Security (MLS) policies to ensure strict information partitioning. Second, we present a novel discovery service that allows us to use an existing DDS implementation with our custom transport mechanism to realize a publish/subscribe middleware with information partitioning based on security classifications of applications. We also include an evaluation of our solution in the context of a use case scenario.
@inproceedings{Martins2014,
author = {Martins, G. and Bhattacharjee, A. and Dubey, Abhishek and Koutsoukos, Xenofon D.},
booktitle = {2014 7th International Symposium on Resilient Control Systems (ISRCS)},
title = {Performance evaluation of an authentication mechanism in time-triggered networked control systems},
year = {2014},
month = aug,
pages = {1--6},
category = {conference},
contribution = {minor},
doi = {10.1109/ISRCS.2014.6900098},
file = {:Martins2014-Performance_Evaluation_of_an_Authentication_Mechanism_in_Time-Triggered_Network_Control_Systems.pdf:PDF},
keywords = {time-triggered networks, message authentication, HMAC, real-time systems, TTEthernet, network security},
tag = {platform},
what = {This paper evaluates performance of Hash-based Message Authentication (HMAC) mechanisms in time-triggered networked control systems. The work analyzes computational overhead and network impact of adding authentication to TTEthernet communications. Testing on an automotive control system measures HMAC execution time and frame transmission delays.},
why = {Securing real-time control systems requires authentication mechanisms that don't jeopardize timing guarantees. This work is innovative because it provides empirical evaluation of HMAC performance overhead in time-triggered systems, enabling designers to understand trade-offs between security and timing. The analysis supports informed design decisions for security-critical real-time systems.},
results = {Experimental results show HMAC adds 20-32 byte overhead per message and introduces measurable but acceptable latency in TTEthernet communications. Testing with SHA-1, SHA-2, and SHA-3 hash functions demonstrates feasibility of authentication in time-triggered networks. Results support practical deployment of HMAC in automotive and other real-time systems.},
project_tags = {CPS, middleware}
}
An important challenge in networked control systems is to ensure the confidentiality and integrity of the message in order to secure the communication and prevent attackers or intruders from compromising the system. However, security mechanisms may jeopardize the temporal behavior of the network data communication because of the computation and communication overhead. In this paper, we study the effect of adding Hash Based Message Authentication (HMAC) to a time-triggered networked control system. Time Triggered Architectures (TTAs) provide a deterministic and predictable timing behavior that is used to ensure safety, reliability and fault tolerance properties. The paper analyzes the computation and communication overhead of adding HMAC and the impact on the performance of the time-triggered network. Experimental validation and performance evaluation results using a TTEthernet network are also presented.
@inproceedings{Mahadevan2014,
author = {Mahadevan, Nagabhushan and Dubey, Abhishek and Karsai, Gabor and Srivastava, Anurag and Liu, Chen-Ching},
booktitle = {Annual Conference of the Prognostics and Health Management Society},
title = {Temporal Causal Diagrams for diagnosing failures in cyber-physical systems},
year = {2014},
month = oct,
category = {conference},
contribution = {colab},
file = {:Mahadevan2014-Temporal_Causal_Diagrams_for_Diagnosing_Failures_in_Cyber_Physical_Systems.pdf:PDF},
keywords = {temporal causal diagrams, fault diagnosis, cyber-physical systems, timed discrete event systems, formal analysis},
tag = {platform,power},
what = {This work presents a framework for formal analysis of Temporal Causal Diagrams applied to cyber-physical system fault diagnosis. The methodology extends TFPG models with behavioral semantics enabling system-level reasoning about failure modes and their effects. The approach includes improved diagnosis algorithms that account for internal mode changes and timing delays in protection systems.},
why = {Effective fault diagnosis in cyber-physical systems requires understanding both physical and cyber failure mechanisms. This work is innovative because it provides formal mathematical foundations for temporal causal diagrams integrating physical and logical system aspects. The extended diagnosis approach improves accuracy by accounting for system-level effects and temporal constraints.},
results = {The formal framework successfully models power system failures and enables hypothesis generation for fault diagnosis. Improved reasoning algorithms identify failure modes with consideration of system timing constraints and state dependencies. Results demonstrate enhanced diagnostic accuracy for complex multi-component failures.},
project_tags = {CPS, emergency, planning}
}
Resilient and reliable operation of cyber physical systems of societal importance such as Smart Electric Grids is one of the top national priorities. Due to their critical nature, these systems are equipped with fast-acting, local protection mechanisms. However, commonly misguided protection actions together with system dynamics can lead to unintentional cascading effects. This paper describes the ongoing work using Temporal Causal Diagrams (TCD), a refinement of the Timed Failure Propagation Graphs (TFPG), to diagnose problems associated with the power transmission lines protected by a combination of relays and breakers. The TCD models represent the faults and their propagation as TFPG, the nominal and faulty behavior of components (including local, discrete controllers and protection devices) as Timed Discrete Event Systems (TDES), and capture the cumulative and cascading effects of these interactions. The TCD diagnosis engine includes an extended TFPG-like reasoner which in addition to observing the alarms and mode changes (as the TFPG), monitors the event traces (that correspond to the behavioral aspects of the model) to generate hypotheses that consistently explain all the observations. In this paper, we show the results of applying the TCD to a segment of a power transmission system that is protected by distance relays and breakers.
@techreport{Pradhan2014b,
author = {Pradhan, Subhav and Otte, William and Dubey, Abhishek and Szabo, Csanad and Gokhale, Aniruddha and Karsai, Gabor},
institution = {Institute for Software Integrated Systems, Vanderbilt University},
title = {Towards a Self-adaptive Deployment and Configuration Infrastructure for Cyber-Physical Systems},
year = {2014},
address = {Nashville},
month = jun,
number = {ISIS-14-102},
type = {Technical Report},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/TechReport2013.pdf},
contribution = {colab},
file = {:Pradhan2014b-Towards_a_self-adaptive_deployment_and_configuration_infrastructure_for_CPS.pdf:PDF},
keywords = {deployment and configuration, self-adaptive systems, cyber-physical systems, resilience, autonomic computing},
owner = {abhishek},
tag = {platform},
timestamp = {2015.10.16},
url = {http://www.isis.vanderbilt.edu/sites/default/files/Pradhan_SEAMS_TechReport.pdf},
what = {This technical report describes a self-adaptive Deployment and Configuration (D&C) infrastructure for cyber-physical systems. The work addresses challenges of managing dynamic applications in resource-constrained environments through adaptive mechanisms that support autonomous resilience. The infrastructure enables runtime reconfiguration to handle application failures and system changes while maintaining required service levels.},
why = {CPS deployed in dynamic environments face resource scarcity, component failures, and changing environmental conditions that require autonomous adaptation, since human intervention is often infeasible. This work is innovative because it extends deployment and configuration infrastructure with self-adaptive capabilities so that the D&C infrastructure is itself resilient. The approach ensures system resilience while maintaining application functionality under adverse conditions.},
results = {The self-adaptive D&C infrastructure successfully manages deployment of distributed CPS applications with autonomous reconfiguration capabilities. Experimental results demonstrate the system's ability to handle failures through dynamic re-deployment and re-configuration mechanisms. The infrastructure enables applications to maintain required service levels despite infrastructure failures.},
project_tags = {CPS, middleware, Explainable AI}
}
Multi-module Cyber-Physical Systems (CPSs), such as satellite clusters, swarms of Unmanned Aerial Vehicles (UAV), and fleets of Unmanned Underwater Vehicles (UUV) are examples of managed distributed real-time systems where mission-critical applications, such as sensor fusion or coordinated flight control, are hosted. These systems are dynamic and reconfigurable, and provide a "CPS cluster-as-a-service" for mission-specific scientific applications that can benefit from the elasticity of the cluster membership and heterogeneity of the cluster members. The distributed and remote nature of these systems often necessitates the use of Deployment and Configuration (D&C) services to manage lifecycle of software applications. Fluctuating resources, volatile cluster membership and changing environmental conditions require resilience. However, due to the dynamic nature of the system, human intervention is often infeasible. This necessitates a self-adaptive D&C infrastructure that supports autonomous resilience. Such an infrastructure must have the ability to adapt existing applications on the fly in order to provide application resilience and must itself be able to adapt to account for changes in the system as well as tolerate failures. This paper describes the design and architectural considerations to realize a self-adaptive, D&C infrastructure for CPSs. Previous efforts in this area have resulted in D&C infrastructures that support application adaptation via dynamic re-deployment and re-configuration mechanisms. Our work, presented in this paper, improves upon these past efforts by implementing a self-adaptive D&C infrastructure which itself is resilient. The paper concludes with experimental results that demonstrate the autonomous resilience capabilities of our new D&C infrastructure.
@inproceedings{Balasubramanian2014,
  author       = {Balasubramanian, Daniel and Levendovszky, Tihamer and Dubey, Abhishek and Karsai, Gabor},
  title        = {Taming Multi-Paradigm Integration in a Software Architecture Description Language},
  booktitle    = {Proceedings of the 8th Workshop on Multi-Paradigm Modeling co-located with the 17th International Conference on Model Driven Engineering Languages and Systems, MPM@MODELS 2014, Valencia, Spain, September 30, 2014},
  year         = {2014},
  pages        = {67--76},
  url          = {http://ceur-ws.org/Vol-1237/paper7.pdf},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/models/BalasubramanianLDK14},
  category     = {workshop},
  contribution = {colab},
  file         = {:Balasubramanian2014-Taming_Multi-Paradigm_Integration_in_a_Software_Architecture_Description_Language.pdf:PDF},
  keywords     = {distributed real-time embedded systems, model-driven development, scheduling analysis, network analysis, security verification},
  project      = {cps-reliability,cps-middleware},
  tag          = {platform},
  timestamp    = {Thu, 18 Jul 2019 11:36:32 +0200},
  what         = {This paper presents DREMS, a comprehensive infrastructure for designing and implementing distributed real-time embedded systems through model-driven development. The platform consists of design-time tools for modeling applications and a runtime platform for deployment. The work integrates scheduling analysis, network bandwidth prediction, and security property verification.},
  why          = {Developing distributed real-time systems faces challenges of managing complexity across multiple concerns including scheduling, network communication, and security. DREMS is innovative because it provides unified modeling environment with integrated analysis supporting automated code generation and deployment. The platform reduces development complexity while enabling systematic verification of system properties.},
  results      = {DREMS successfully supports development of distributed applications with scheduling analysis, network bandwidth analysis, and security verification. The platform generates deployable code and configuration files from high-level models. Case studies demonstrate successful modeling and verification of complex distributed systems.},
  project_tags = {CPS, middleware, Explainable AI}
}
Software architecture description languages offer a convenient way of describing the high-level structure of a software system. Such descriptions facilitate rapid prototyping, code generation and automated analysis. One of the big challenges facing the software community is the design of architecture description languages that are general enough to describe a wide-range of systems, yet detailed enough to capture domain-specific properties and provide a high level of tool automation. This paper presents the multi-paradigm challenges we faced and solutions we built when creating a domain-specific modeling language for software architectures of distributed real-time systems.
@inproceedings{Balasubramanian2014a,
author = {Balasubramanian, Daniel and Dubey, Abhishek and Otte, William R. and Emfinger, William and Kumar, Pranav Srinivas and Karsai, Gabor},
booktitle = {25th {IEEE} International Symposium on Rapid System Prototyping, {RSP} 2014, New Delhi, India, October 16-17, 2014},
title = {A Rapid Testing Framework for a Mobile Cloud},
year = {2014},
pages = {128--134},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/rsp/BalasubramanianDOEKK14},
category = {selectiveconference},
contribution = {colab},
doi = {10.1109/RSP.2014.6966903},
file = {:Balasubramanian2014a-A_Rapid_Testing_Framework_for_a_Mobile_Cloud.pdf:PDF},
keywords = {testing framework, distributed systems, simulation, network emulation, middleware, cyber-physical systems},
project = {cps-middleware},
tag = {platform},
timestamp = {Wed, 16 Oct 2019 14:14:50 +0200},
url = {https://doi.org/10.1109/RSP.2014.6966903},
what = {This paper presents a rapid testing framework for mobile cloud infrastructure addressing challenges of modeling hardware dynamics, network characteristics, and middleware configuration. The framework integrates physical dynamics simulation with network emulation and middleware testing. The approach enables realistic evaluation of distributed satellite systems before physical deployment.},
why = {Testing distributed CPS requires understanding interactions between physical dynamics, network behavior, and software systems. This work is innovative because it integrates heterogeneous simulation tools enabling comprehensive evaluation of system performance under realistic conditions. The framework supports testing of applications that cannot be easily deployed on actual hardware.},
results = {The testing framework successfully models dynamics of distributed satellite systems including orbital mechanics and network communications. Integration of physical dynamics simulator, network emulator, and middleware enables realistic testing of applications. Results demonstrate feasibility of evaluating complex distributed systems in simulation.},
project_tags = {CPS, middleware, scalable AI}
}
Mobile clouds such as network-connected vehicles and satellite clusters are an emerging class of systems that are extensions to traditional real-time embedded systems: they provide long-term mission platforms made up of dynamic clusters of heterogeneous hardware nodes communicating over ad hoc wireless networks. Besides the inherent complexities entailed by a distributed architecture, developing software and testing these systems is difficult due to a number of other reasons, including the mobile nature of such systems, which can require a model of the physical dynamics of the system for accurate simulation and testing. This paper describes a rapid development and testing framework for a distributed satellite system. Our solutions include a modeling language for configuring and specifying an application’s interaction with the middleware layer, a physics simulator integrated with hardware in the loop to provide the system’s physical dynamics and the integration of a network traffic tool to dynamically vary the network bandwidth based on the physical dynamics.
@inproceedings{Emfinger2014,
  author       = {Emfinger, William and Karsai, Gabor and Dubey, Abhishek and Gokhale, Aniruddha S.},
  title        = {Analysis, verification, and management toolsuite for cyber-physical applications on time-varying networks},
  booktitle    = {Proceedings of the 4th {ACM} {SIGBED} International Workshop on Design, Modeling, and Evaluation of Cyber-Physical Systems, CyPhy 2014, Berlin, Germany, April 14-17, 2014},
  year         = {2014},
  pages        = {44--47},
  doi          = {10.1145/2593458.2593459},
  url          = {https://doi.org/10.1145/2593458.2593459},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/cyphy/EmfingerKDG14},
  category     = {workshop},
  contribution = {colab},
  file         = {:Emfinger2014-Analysis_verification_and_management_toolsuite_for_cyber-physical_applications_on_time-varying_networks.pdf:PDF},
  keywords     = {network analysis, cyber-physical systems, time-varying networks, quality of service, network modeling},
  project      = {cps-reliability},
  tag          = {platform},
  timestamp    = {Tue, 06 Nov 2018 00:00:00 +0100},
  what         = {This work presents a network analysis and verification toolsuite for cyber-physical applications operating on time-varying networks. The methodology models network capabilities including bandwidth and delay characteristics, enabling prediction of application performance. The approach integrates dynamic network analysis with application traffic profile modeling.},
  why          = {CPS operating on wireless networks face performance challenges from dynamic network conditions. This work is innovative because it provides systematic methodology for analyzing network performance and predicting application behavior under varying network conditions. The analysis enables designers to ensure applications meet timing and reliability requirements despite network dynamics.},
  results      = {The toolsuite successfully analyzes network characteristics and predicts application performance including buffer delays and bandwidth requirements. Testing on UDP and TCP communications demonstrates the methodology's ability to characterize network behavior. Results support design-time analysis of application-network interactions.},
  project_tags = {CPS, middleware}
}
Cyber-Physical Systems (CPS) are increasingly utilizing advances in wireless mesh networking among computing nodes to facilitate communication and control for distributed applications. Factors such as interference or node mobility cause such wireless networks to experience changes in both topology and link capacities. These dynamic networks pose a reliability concern for high-criticality or mixed-criticality systems which require strict guarantees about system performance and robustness prior to deployment. To address the design- and run-time verification and reliability concerns created by these dynamic networks, we are developing an integrated modeling, analysis, and run-time toolsuite which provides (1) network profiles that model the dynamics of system network resources and application network requirements over time, (2) design-time verification of application performance on dynamic networks, and (3) management of the CPS network resources during run-time. In this paper we present the foundations for the analysis of dynamic networks and show experimental validations of this analysis. We conclude with a focus on future work and applications to the field.
@inproceedings{Karsai2014,
  author       = {Karsai, Gabor and Balasubramanian, Daniel and Dubey, Abhishek and Otte, William},
  title        = {Distributed and Managed: Research Challenges and Opportunities of the Next Generation Cyber-Physical Systems},
  booktitle    = {17th {IEEE} International Symposium on Object/Component/Service-Oriented Real-Time Distributed Computing, {ISORC} 2014, Reno, NV, USA, June 10-12, 2014},
  year         = {2014},
  pages        = {1--8},
  doi          = {10.1109/ISORC.2014.36},
  url          = {https://doi.org/10.1109/ISORC.2014.36},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/isorc/KarsaiBDO14},
  category     = {selectiveconference},
  contribution = {colab},
  file         = {:Karsai2014-Distributed_and_Managed.pdf:PDF},
  keywords     = {cyber-physical systems, distributed systems, security, resilience, system architecture, resource management},
  project      = {cps-reliability,cps-middleware},
  tag          = {platform},
  timestamp    = {Wed, 16 Oct 2019 14:14:53 +0200},
  what         = {This position paper presents research challenges and opportunities for next-generation cyber-physical systems emphasizing distributed computing, security, and resilience. The work discusses architectural requirements for multi-module CPS including component isolation, resource management, and fault tolerance. The paper proposes layered architecture addressing these concerns across multiple abstraction levels.},
  why          = {Next-generation CPS face unprecedented challenges in managing distributed resources across multiple security domains while maintaining resilience. This work is innovative because it systematically identifies architectural requirements for addressing these challenges and proposes integrated solutions. The layered architecture enables support for both security and resilience as fundamental design principles.},
  results      = {The proposed architecture successfully addresses key requirements including resource management, security isolation, and autonomous fault tolerance. The approach demonstrates feasibility through discussion of architectural support for multi-domain applications. The layered design provides clear separation of concerns supporting independent verification of system properties.},
  project_tags = {CPS, middleware, scalable AI}
}
Cyber-physical systems increasingly rely on distributed computing platforms where sensing, computing, actuation, and communication resources are shared by a multitude of applications. Such ‘cyber-physical cloud computing platforms’ present novel challenges because the system is built from mobile embedded devices, is inherently distributed, and typically suffers from highly fluctuating connectivity among the modules. Architecting software for these systems raises many challenges not present in traditional cloud computing. Effective management of constrained resources and application isolation without adversely affecting performance are necessary. Autonomous fault management and real-time performance requirements must be met in a verifiable manner. It is also both critical and challenging to support multiple end-users whose diverse software applications have changing demands for computational and communication resources, while operating on different levels and in separate domains of security. The solution presented in this paper is based on a layered architecture consisting of a novel operating system, a middleware layer, and component-structured applications. The component model facilitates the construction of software applications from modular and reusable components that are deployed in the distributed system and interact only through well-defined mechanisms. The complexity of creating applications and performing system integration is mitigated through the use of a domain-specific model-driven development process that relies on a domain-specific modeling language and its accompanying graphical modeling tools, software generators for synthesizing infrastructure code, and the extensive use of model-based analysis for verification and validation.
@inproceedings{Kumar2014,
  author       = {Kumar, Pranav Srinivas and Dubey, Abhishek and Karsai, Gabor},
  title        = {Colored Petri Net-based Modeling and Formal Analysis of Component-based Applications},
  booktitle    = {Proceedings of the 11th Workshop on Model-Driven Engineering, Verification and Validation co-located with 17th International Conference on Model Driven Engineering Languages and Systems, MoDeVVa@MODELS 2014, Valencia, Spain, September 30, 2014},
  year         = {2014},
  pages        = {79--88},
  url          = {http://ceur-ws.org/Vol-1235/paper-10.pdf},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/conf/models/KumarDK14},
  timestamp    = {Tue, 28 May 2019 16:23:34 +0200},
  category     = {workshop},
  contribution = {colab},
  keywords     = {Colored Petri Nets, formal analysis, component-based systems, scheduling verification, real-time systems},
  project      = {cps-reliability,cps-middleware},
  tag          = {platform},
  what         = {This paper presents Colored Petri Net-based modeling and formal analysis approach for component-based applications in distributed real-time embedded systems. The methodology enables verification of system properties including deadline violations and buffer overflows through formal analysis. The approach integrates component operation scheduling with business logic modeling.},
  why          = {Verifying properties of complex distributed applications requires formal methods that capture both scheduling and behavioral aspects. This work is innovative because it provides unified formal framework using Colored Petri Nets that integrates component scheduling with application semantics. The approach enables early detection of timing and functional violations.},
  results      = {The Colored Petri Net approach successfully models component-based applications and verifies system properties through formal analysis. Testing demonstrates the methodology's ability to identify scheduling violations and potential deadlock conditions. Results support use of formal methods in verifying safety-critical distributed systems.},
  project_tags = {CPS, middleware, scalable AI}
}
Distributed Real-Time Embedded (DRE) Systems that address safety and mission-critical system requirements are applied in a variety of domains today. Complex, integrated systems like managed satellite clusters expose heterogeneous concerns such as strict timing requirements, complexity in system integration, deployment, and repair; and resilience to faults. Integrating appropriate modeling and analysis techniques into the design of such systems helps ensure predictable, dependable and safe operation upon deployment. This paper describes how we can model and analyze applications for these systems in order to verify system properties such as lack of deadline violations. Our approach is based on (1) formalizing the component operation scheduling using Colored Petri nets (CPN), (2) modeling the abstract temporal behavior of application components, and (3) integrating the business logic and the component operation scheduling models into a concrete CPN, which is then analyzed. This model-driven approach enables a verification-driven workflow wherein the application model can be refined and restructured before actual code development.
@article{Levendovszky2014,
  author       = {Levendovszky, Tihamer and Dubey, Abhishek and Otte, William and Balasubramanian, Daniel and Coglio, Alessandro and Nyako, Sandor and Emfinger, William and Kumar, Pranav Srinivas and Gokhale, Aniruddha S. and Karsai, Gabor},
  title        = {Distributed Real-Time Managed Systems: {A} Model-Driven Distributed Secure Information Architecture Platform for Managed Embedded Systems},
  journal      = {{IEEE} Software},
  year         = {2014},
  volume       = {31},
  number       = {2},
  pages        = {62--69},
  doi          = {10.1109/MS.2013.143},
  url          = {https://doi.org/10.1109/MS.2013.143},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/journals/software/LevendovszkyDOBCNEKGK14},
  timestamp    = {Thu, 18 May 2017 01:00:00 +0200},
  contribution = {colab},
  file         = {:Levendovszky2014-Distributed_Real_Time_Managed_Systems.pdf:PDF},
  keywords     = {distributed embedded systems, component-based development, security, resource management, platform},
  project      = {cps-middleware,cps-reliability},
  tag          = {platform},
  what         = {This paper presents Distributed Real-Time Managed Systems (DREMS) as a practical design and runtime solution for distributed embedded systems. The work integrates modern software development practices with novel approaches for managing constrained resources and isolating applications. The platform provides component model supporting reuse and secure information flows.},
  why          = {Building distributed embedded systems requires integrating hardware constraints, security requirements, and development practices. DREMS is innovative because it provides practical integrated platform addressing security, resource management, and application isolation as core architectural principles. The component-based approach enables rapid application development while maintaining system correctness.},
  results      = {DREMS successfully supports development of satellite and mobile cloud applications with resource constraints and security requirements. The platform enables rapid prototyping while maintaining secure information flows and resource isolation. Experimental results demonstrate feasibility for complex multi-domain applications.},
  project_tags = {CPS, middleware, scalable AI}
}
Architecting software for a cloud computing platform built from mobile embedded devices incurs many challenges that aren’t present in traditional cloud computing. Both effectively managing constrained resources and isolating applications without adverse performance effects are needed. A practical design- and runtime solution incorporates modern software development practices and technologies along with novel approaches to address these challenges. The patterns and principles manifested in this system can potentially serve as guidelines for current and future practitioners in this field.
@inproceedings{Otte2014,
author = {Otte, William R. and Dubey, Abhishek and Karsai, Gabor},
booktitle = {Sensors and Systems for Space Applications VII},
title = {A resilient and secure software platform and architecture for distributed spacecraft},
year = {2014},
editor = {Pham, Khanh D. and Cox, Joseph L.},
organization = {International Society for Optics and Photonics},
pages = {121--130},
publisher = {SPIE},
volume = {9085},
category = {conference},
contribution = {lead},
doi = {10.1117/12.2054055},
file = {:Otte2014-A_resilient_and_secure_software_platform_and_architecture_for_distributed_spacecraft.pdf:PDF},
keywords = {distributed systems, security, resilience, spacecraft, secure information flows, platform architecture},
tag = {platform},
url = {https://doi.org/10.1117/12.2054055},
what = {This paper addresses design and implementation of resilient and secure software platforms for distributed spacecraft. The work describes a distributed architecture supporting applications with different security classifications while maintaining resilience through autonomous management. The platform implements secure information flows and resource constraints through operating system and middleware mechanisms.},
why = {Spacecraft clusters require platform support for applications with different security requirements while maintaining mission-critical resilience. This work is innovative because it provides concrete architectural solutions for enforcing security policies and resilience mechanisms through lower-level platform abstractions. The approach enables secure shared platforms for multi-organizational missions.},
results = {The distributed platform successfully enforces security policies preventing information leakage between applications with different classifications. Implementation demonstrates feasibility of secure spacecraft architectures supporting autonomous fault tolerance. Results support deployment of multi-organizational missions on shared platforms.},
project_tags = {CPS, middleware}
}
A distributed spacecraft is a cluster of independent satellite modules flying in formation that communicate via ad-hoc wireless networks. This system in space is a cloud platform that facilitates sharing sensors and other computing and communication resources across multiple applications, potentially developed and maintained by different organizations. Effectively, such architecture can realize the functions of monolithic satellites at a reduced cost and with improved adaptivity and robustness. Openness of these architectures poses special challenges because the distributed software platform has to support applications from different security domains and organizations, and where information flows have to be carefully managed and compartmentalized. If the platform is used as a robust shared resource its management, configuration, and resilience becomes a challenge in itself. We have designed and prototyped a distributed software platform for such architectures. The core element of the platform is a new operating system whose services were designed to restrict access to the network and the file system, and to enforce resource management constraints for all non-privileged processes. Mixed-criticality applications operating at different security labels are deployed and controlled by a privileged management process that is also pre-configuring all information flows. This paper describes the design and objective of this layer.
@inproceedings{Pradhan2014a,
  author       = {Pradhan, Subhav and Otte, William and Dubey, Abhishek and Gokhale, Aniruddha and Karsai, Gabor},
  title        = {Key Considerations for a Resilient and Autonomous Deployment and Configuration Infrastructure for Cyber-Physical Systems},
  booktitle    = {Proceedings of the 11th IEEE International Conference and Workshops on the Engineering of Autonomic and Autonomous Systems (EASe'14)},
  year         = {2014},
  organization = {Citeseer},
  category     = {conference},
  contribution = {colab},
  file         = {:Pradhan2014a-Key_Considerations_for_a_Resilient_and_Autonomous_Deployment_and_Configuration_Infrastructure_for_CPS.pdf:PDF},
  keywords     = {cyber-physical systems, deployment and configuration, self-adaptation, distributed systems, fault management, middleware, resilience},
  tag          = {platform},
  what         = {This paper addresses the design of a resilient and autonomous deployment and configuration (D&C) infrastructure for cyber-physical systems (CPS) such as fractioned spacecraft and swarms of unmanned vehicles. The work focuses on self-adaptive D&C services that can detect infrastructure failures and manage application failures in highly dynamic cluster environments. The infrastructure supports automatic deployment and configuration of software applications while maintaining system functionality in the face of various fault scenarios.},
  why          = {CPS systems are inherently dynamic and distributed, operating in resource-constrained environments where human intervention is often infeasible. Existing D&C solutions lack the ability to support resilient adaptation and autonomous recovery from failures. This work is innovative because it presents a self-adaptive D&C infrastructure capable of maintaining application resilience through automated detection and recovery mechanisms, addressing a critical gap in the design of autonomous systems.},
  results      = {The paper presents key design challenges for a self-adaptive D&C infrastructure and proposes a layered architecture with distributed group membership monitoring, failure detection mechanisms, and dynamic reconfiguration capabilities. The infrastructure uses a Group Membership Monitor and detection mechanisms to handle challenges like distributed group membership, leader election, proper sequencing of deployment, and D&C state preservation across system faults.},
  project_tags = {CPS, middleware}
}
Multi-module Cyber-Physical Systems (CPSs), such as satellite clusters, swarms of Unmanned Aerial Vehicles (UAV), and fleets of Unmanned Underwater Vehicles (UUV) are examples of managed distributed real-time systems where mission-critical applications, such as sensor fusion or coordinated flight control, are hosted. These systems are dynamic and reconfigurable, and provide a “CPS cluster-as-a-service” for mission-specific scientific applications that can benefit from the elasticity of the cluster membership and heterogeneity of the cluster members. The distributed and remote nature of these systems often necessitates the use of Deployment and Configuration (D&C) services to manage the lifecycle of software applications. Fluctuating resources, volatile cluster membership and changing environmental conditions require resilient D&C services. However, the dynamic nature of the system often precludes human intervention during the D&C activities, which motivates the need for a self-adaptive D&C infrastructure that supports autonomous resilience. Such an infrastructure must have the ability to adapt existing applications on-the-fly in order to provide application resilience and must itself be able to adapt to account for changes in the system as well as tolerate failures. This paper makes two contributions towards addressing these needs. First, we identify the key challenges in achieving such a self-adaptive D&C infrastructure. Second, we present our ideas on resolving these challenges and realizing a self-adaptive D&C infrastructure.
@article{Mahadevan2013,
author = {Mahadevan, Nagabhushan and Dubey, Abhishek and Balasubramanian, Daniel and Karsai, Gabor},
journal = {Innov. Syst. Softw. Eng.},
title = {Deliberative, search-based mitigation strategies for model-based software health management},
year = {2013},
issn = {1614-5046},
month = dec,
number = {4},
pages = {293--318},
volume = {9},
address = {Berlin, Heidelberg},
contribution = {lead},
doi = {10.1007/s11334-013-0215-x},
issue_date = {December 2013},
keywords = {software health management, fault mitigation, deliberative reasoning, boolean satisfiability, SAT solvers, fault diagnosis, component configuration},
numpages = {26},
project = {cps-reliability,cps-middleware},
publisher = {Springer-Verlag},
tag = {platform},
url = {https://doi.org/10.1007/s11334-013-0215-x},
what = {This paper presents a deliberative, search-based mitigation strategy for model-based software health management in complex aerospace systems. The work develops an architecture that combines runtime health monitoring and diagnosis with goal-oriented reasoning to identify alternative component configurations that can restore system functionality when faults occur.},
why = {Traditional design-time and offline approaches to fault tolerance are insufficient for managing complex software systems that must adapt to unforeseen failures at runtime. This work is innovative in applying deliberative reasoning techniques and Boolean Satisfiability (SAT) solvers to autonomously identify recovery strategies, enabling systems to maintain desired functionalities even when components fail.},
results = {The approach demonstrates how to use Timed Failure Propagation Graphs (TFPG) models combined with SAT solvers to determine optimal component reconfigurations. The design-time and runtime support for deliberative strategy enables systems to search through configuration spaces and identify valid alternatives that restore functionality while addressing the complex specifications required to cover all possible fault combinations.},
project_tags = {scalable AI, Explainable AI}
}
Rising software complexity in aerospace systems makes them very difficult to analyze and prepare for all possible fault scenarios at design time; therefore, classical run-time fault tolerance techniques such as self-checking pairs and triple modular redundancy are used. However, several recent incidents have made it clear that existing software fault tolerance techniques alone are not sufficient. To improve system dependability, simpler, yet formally specified and verified run-time monitoring, diagnosis, and fault mitigation capabilities are needed. Such architectures are already in use for managing the health of vehicles and systems. Software health management is the application of these techniques to software systems. In this paper, we briefly describe the software health management techniques and architecture developed by our research group. The foundation of the architecture is a real-time component framework (built upon ARINC-653 platform services) that defines a model of computation for software components. Dedicated architectural elements: the Component Level Health Manager (CLHM) and System Level Health Manager (SLHM) provide the health management services: anomaly detection, fault source isolation, and fault mitigation. The SLHM includes a diagnosis engine that (1) uses a Timed Failure Propagation Graph (TFPG) model derived from the component assembly model, (2) reasons about cascading fault effects in the system, and (3) isolates the fault source component(s). Thereafter, the appropriate system-level mitigation action is taken. The main focus of this article is the description of the fault mitigation architecture that uses goal-based deliberative reasoning to determine the best mitigation actions for recovering the system from the identified failure mode.
@inproceedings{Shi2013,
author = {{Shi}, J. and {Amgai}, R. and {Abdelwahed}, S. and Dubey, Abhishek and {Humphreys}, J. and {Alattar}, M. and {Jia}, R.},
booktitle = {2013 IEEE Electric Ship Technologies Symposium (ESTS)},
title = {Generic modeling and analysis framework for shipboard system design},
year = {2013},
month = apr,
pages = {420--428},
category = {workshop},
contribution = {minor},
doi = {10.1109/ESTS.2013.6523770},
file = {:Shi2013-Generic_modeling_and_analysis_framework_for_shipboard_system_design.pdf:PDF},
keywords = {model integrated computing, shipboard systems, power systems design, cross-domain modeling, simulation, systems engineering},
tag = {platform,power},
what = {This paper proposes a generic modeling and analysis framework for shipboard power system design, enabling the integration of components from different domains (electrical, mechanical, thermal) into a unified simulation environment. The approach facilitates cross-domain analysis and design optimization through a Model Integrated Computing (MIC) paradigm.},
why = {Shipboard power systems are increasingly complex with multiple integrated subsystems, making traditional design and analysis approaches insufficient. This work is innovative in providing a generic modeling environment that supports multiple simulation tools and enables cross-platform operations while maintaining semantic consistency across different engineering domains.},
results = {The framework demonstrates integration with multiple simulation tools including Matlab, Simulink, PSCAD, and VTB, enabling comprehensive analysis of shipboard power system scenarios. The approach provides a meta-level abstraction that allows designers to capture system complexity across domains and perform holistic analysis and validation of system performance under various operating conditions.},
project_tags = {energy, CPS}
}
This paper proposes a novel modeling and simulation environment for ship design based on the principles of Model Integrated Computing (MIC). The proposed approach facilitates the design and analysis of shipboard power systems and similar systems that integrate components from different fields of expertise. The conventional simulation platforms such as Matlab®, Simulink®, PSCAD® and VTB® require the designers to have explicit knowledge of the syntactic and semantic information of the desired domain within the tools. This constraint, however, severely slows down the design and analysis process, and causes cross-domain or cross-platform operations remain error prone and expensive. Our approach focuses on the development of a modeling environment that provides generic support for a variety of application across different domains by capturing modeling concepts, composition principles and operation constraints. For the preliminary demonstration of the modeling concept, in this paper we limit the scope of design to cross-platform implementations of the proposed environment by developing an application model of a simplified shipboard power system and using Matlab engine and VTB solver separately to evaluate the performance with different respects. In the case studies a fault scenario is pre-specified and tested on the system model. The corresponding time domain bus voltage magnitude and angle profiles are generated via invoking external solver, displayed to users and then saved for future analysis.
@techreport{Mahadevan2013a,
author = {Mahadevan, Nagabhushan and Dubey, Abhishek and Balasubramanian, Daniel and Karsai, Gabor},
institution = {Institute for Software Integrated Systems, Vanderbilt University},
title = {Deliberative Reasoning in Software Health Management},
year = {2013},
month = apr,
number = {ISIS-13-101},
type = {Technical Report},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/TechReport2013.pdf},
contribution = {lead},
file = {:Mahadevan2013a-Deliberative_reasoning_in_software_health_management.pdf:PDF},
keywords = {software health management, ARINC-653, component models, fault diagnosis, timed failure propagation, aerospace systems},
tag = {platform},
what = {This technical report describes a component-level and system-level health management approach for ARINC-653 software systems in aerospace applications. The work presents a two-level health management architecture with Component-Level Health Manager (CLHM) for individual components and System-Level Health Manager (SLHM) for overall system health, using Timed Failure Propagation Graph models.},
why = {Rising software complexity in aerospace systems necessitates innovative runtime mechanisms that provide fault management services beyond traditional design-time approaches. This work is significant in applying Software Health Management concepts to component-based systems, enabling automated fault detection, diagnosis, and mitigation through formal models that capture temporal and causal system dependencies.},
results = {The architecture demonstrates automatic synthesis of health management infrastructure from component models, including monitoring code, diagnosis information, and mitigation strategies. The TFPG-based diagnosis engine can isolate fault sources and trigger appropriate system-level recovery actions, providing runtime dependability for safety-critical aerospace systems.},
project_tags = {CPS, ML for CPS}
}
Rising software complexity in aerospace systems makes them very difficult to analyze and prepare for all possible fault scenarios at design-time. Therefore, classical run-time fault-tolerance techniques, such as self-checking pairs and triple modular redundancy are used. However, several recent incidents have made it clear that existing software fault tolerance techniques alone are not sufficient. To improve system dependability, simpler, yet formally specified and verified run-time monitoring, diagnosis, and fault mitigation are needed. Such architectures are already in use for managing the health of vehicles and systems. Software health management is the application of adapting and applying these techniques to software. In this paper, we briefly describe the software health management technique and architecture developed by our research group. The foundation of the architecture is a real-time component framework (built upon ARINC-653 platform services) that defines a model of computation for software components. Dedicated architectural elements: the Component Level Health Manager (CLHM) and System Level Health Manager (SLHM) are providing health management services: anomaly detection, fault source isolation, and fault mitigation. The SLHM includes a diagnosis engine that uses a Timed Failure Propagation (TFPG) model derived from the component assembly model, and it reasons about cascading fault effects in the system and isolates the fault source component(s). Thereafter, the appropriate system level mitigation action is taken. The main focus of this article is the description of the fault mitigation architecture that uses goal-based deliberative reasoning to determine the best mitigation actions for recovering the system from the identified failure mode.
@inbook{Dubey2010,
author = {Dubey, Abhishek and Karsai, Gabor and Mahadevan, Nagabhushan},
editor = {de Lemos, Rog{\'e}rio and Giese, Holger and M{\"u}ller, Hausi A. and Shaw, Mary},
pages = {294--323},
publisher = {Springer Berlin Heidelberg},
title = {Fault-Adaptivity in Hard Real-Time Component-Based Software Systems},
year = {2013},
address = {Berlin, Heidelberg},
isbn = {978-3-642-35813-5},
booktitle = {Software Engineering for Self-Adaptive Systems II: International Seminar, Dagstuhl Castle, Germany, October 24-29, 2010 Revised Selected and Invited Papers},
contribution = {lead},
doi = {10.1007/978-3-642-35813-5_12},
file = {:Dubey2010-Fault-Adaptivity_in_Hard_Real-Time_Component-Based_Software_Systems.pdf:PDF},
keywords = {software health management, fault adaptivity, hard real-time systems, component-based software, runtime monitoring, fault diagnosis, fault mitigation},
project = {cps-middleware,cps-reliability},
tag = {platform},
url = {https://doi.org/10.1007/978-3-642-35813-5_12},
what = {This book chapter describes a two-level health management architecture for hard real-time component-based software systems. Borrowing concepts and principles from Systems Health Management for complex aerospace systems, the approach provides run-time monitoring, diagnosis, and fault mitigation services that can be applied in the context of a model-based software development process.},
why = {Complexity in embedded software systems has reached the point where testing and verification cannot cover all possible scenarios a system encounters, so design-time assurance alone is insufficient. This work is innovative because it provides a simpler, yet formally specified run-time monitoring, diagnosis, and fault mitigation architecture that increases the dependability of hard real-time component-based software systems.},
results = {The chapter presents a novel two-level health management architecture combining component-level and system-level health management. The architecture supports fault-adaptive behavior at run time and integrates with a model-based software development process, improving dependability of hard real-time component-based software systems.},
project_tags = {middleware, CPS}
}
Complexity in embedded software systems has reached the point where we need run-time mechanisms that provide fault management services. Testing and verification may not cover all possible scenarios that a system encounters, hence a simpler, yet formally specified run-time monitoring, diagnosis, and fault mitigation architecture is needed to increase the software system’s dependability. The approach described in this paper borrows concepts and principles from the field of ‘Systems Health Management’ for complex aerospace systems and implements a novel two level health management architecture that can be applied in the context of a model-based software development process.
@article{Dubey2013,
  author       = {Dubey, Abhishek and Karsai, Gabor},
  title        = {Software health management},
  journal      = {{Innovations in System and Software Engineering}},
  year         = {2013},
  volume       = {9},
  number       = {4},
  pages        = {217},
  doi          = {10.1007/s11334-013-0226-7},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
  biburl       = {https://dblp.org/rec/bib/journals/isse/DubeyK13},
  timestamp    = {Tue, 26 Jun 2018 01:00:00 +0200},
  contribution = {lead},
  file         = {:Dubey2013-Software_Health_Management.pdf:PDF},
  keywords     = {software health management, fault tolerance, aerospace systems, system dependability, fault detection, diagnosis, mitigation},
  project      = {cps-reliability,cps-middleware},
  tag          = {platform},
  what         = {This editorial introduces and contextualizes software health management as a discipline for complex aerospace systems. The work discusses the application of health management techniques to software systems and presents an overview of emerging approaches that combine detection, diagnosis, and mitigation for ensuring system dependability.},
  why          = {Software failures in aerospace systems can have serious consequences, making health management essential for safety-critical operations. This editorial is significant in promoting software health management as an established discipline and highlighting the gap between current practices and needed capabilities for managing software faults at runtime.},
  results      = {The editorial provides a comprehensive overview of software health management foundations and presents several papers addressing detection, diagnosis, and mitigation. It establishes the state-of-the-art in software health management and motivates future research directions for improving software system dependability through advanced health management techniques.},
  project_tags = {CPS}
}
@inproceedings{Dubey2013a,
author = {Dubey, Abhishek and Karsai, Gabor and Mahadevan, Nagabhushan and Srivastava, Anurag and Liu, Chen-Ching and Lukic, Srdjan},
booktitle = {NSF Energy Cyber Physical System Workshop, Washington DC},
title = {Understanding Failure Dynamics in the Smart Electric Grid},
year = {2013},
category = {workshop},
contribution = {lead},
file = {:Dubey2013a-Understanding_failture_dynamics_in_the_smart_electric_grid.pdf:PDF},
keywords = {smart electric grid, failure dynamics, fault propagation, cyber-physical systems, power systems, resilience},
tag = {platform,power},
what = {This workshop paper discusses the problem of understanding failure dynamics in the smart electric grid, an energy cyber-physical system in which the physical power network is coupled with the computation and communication infrastructure that monitors and controls it. The work examines how failures arise and evolve in this coupled setting.},
why = {The smart electric grid is a societal-scale, safety-critical cyber-physical system, and understanding how failures originate and propagate across its cyber and physical layers is a prerequisite for designing effective protection and fault management mechanisms. This workshop contribution frames that understanding as a key research challenge for energy cyber-physical systems.},
results = {The paper outlines the challenges involved in characterizing and modeling failure dynamics in the smart electric grid and identifies research directions for addressing them. As a workshop paper, it focuses on problem formulation and research agenda rather than experimental evaluation.},
project_tags = {energy, CPS}
}
@inproceedings{Dubey2013b,
author = {Dubey, Abhishek and Gokhale, Aniruddha and Karsai, Gabor and Otte, W and Willemsen, Johnny},
booktitle = {Proceedings of the 5th International Conference on Spacecraft Formation Flying Missions and Technologies (SFFMT)},
title = {A model-driven software component framework for fractionated spacecraft},
year = {2013},
organization = {IEEE Munich, Germany},
category = {workshop},
contribution = {lead},
file = {:Dubey2013b-A_model-driven_software_component_framework_for_fractionated_spacecraft.pdf:PDF},
keywords = {fractionated spacecraft, component frameworks, model-driven development, distributed systems, middleware, space systems},
tag = {platform},
what = {This paper describes a model-driven software component framework for fractionated spacecraft that addresses challenges of developing distributed real-time applications for resource-constrained space environments. The framework provides support for component-based design, deployment configuration, and runtime system integration with emphasis on fault management and security.},
why = {Fractionated spacecraft systems require sophisticated software architectures that can handle dynamic cluster composition, network constraints, and security isolation. This work is innovative in providing flexible, reusable component abstractions and a model-driven development environment that enable rapid development and deployment of space mission applications.},
results = {The paper presents the core principles and design of a software component framework for fractionated spacecraft as a special case of a distributed real-time embedded system. It describes how a model-driven development environment helps with the design and engineering of complex applications for this platform, enabling resource sharing across cluster members.},
project_tags = {middleware, CPS}
}
Fractionated spacecraft is a novel space architecture that uses a cluster of small spacecraft modules (with their own attitude control and propulsion systems) connected via wireless links to accomplish complex missions. Resources, such as sensors, persistent storage space, processing power, and downlink bandwidth can be shared among the members of the cluster thanks to the networking. Such spacecraft can serve as a cost effective, highly adaptable, and fault tolerant platform for running various distributed mission software applications that collect, process, and downlink data. Naturally, a key component in such a system is the software platform: the distributed operating system and software infrastructure that makes such applications possible. Existing operating systems are insufficient, and newer technologies like component frameworks do not address all the requirements of such flexible space architectures. The high degree of flexibility and the need for thorough planning and analysis of the resource management necessitates the use of advanced development techniques. This paper describes the core principles and design of a software component framework for fractionated spacecraft that is a special case of a distributed real-time embedded system. Additionally we describe how a model-driven development environment helps with the design and engineering of complex applications for this platform.
@inproceedings{Emfinger2013,
author = {Emfinger, William and Kumar, Pranav and Dubey, Abhishek and Otte, William and Gokhale, Aniruddha and Karsai, Gabor},
booktitle = {IEEE Real-time Systems Symposium},
title = {{DREMS}: A toolchain and platform for the rapid application development, integration, and deployment of managed distributed real-time embedded systems},
year = {2013},
category = {poster},
contribution = {lead},
file = {:Emfinger2013-DREMS_A_toolchain_and_platform_for_rapid.pdf:PDF},
keywords = {distributed embedded systems, model-driven development, toolchain, code generation, real-time systems, deployment},
tag = {platform},
what = {This paper presents DREMS, a toolchain and platform infrastructure for designing, implementing, configuring, deploying, and maintaining distributed real-time embedded systems. The work describes a complete development workflow from modeling tools through deployment, including support for reusable components and automated code generation.},
why = {Complex distributed embedded systems require comprehensive tooling that spans multiple development phases and provides consistency across design and runtime. This work is significant in integrating model-driven engineering with actual system deployment, enabling developers to reason about system properties at design time while maintaining flexibility for runtime adaptation.},
results = {The DREMS platform successfully integrates design-time modeling with code generation and runtime management, demonstrated through deployment of satellite cluster applications. The infrastructure provides automated synthesis of system configurations, constraint validation, and runtime management of application components across distributed nodes.},
project_tags = {middleware, CPS, scalable AI}
}
@inproceedings{Otte2013,
author = {Otte, William and Dubey, Abhishek and Pradhan, Subhav and Patil, Prithviraj and Gokhale, Aniruddha S. and Karsai, Gabor and Willemsen, Johnny},
booktitle = {16th {IEEE} International Symposium on Object/Component/Service-Oriented Real-Time Distributed Computing, {ISORC} 2013, Paderborn, Germany, June 19-21, 2013},
title = {{F6COM}: A component model for resource-constrained and dynamic space-based computing environments},
year = {2013},
pages = {1--8},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/isorc/OtteDPPGKW13},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/ISORC.2013.6913199},
file = {:Otte2013-F6COM_A_Component_Model.pdf:PDF},
keywords = {component models, space systems, fractionated spacecraft, cyber-physical systems, resource-constrained computing, fault management},
project = {cps-reliability,cps-middleware},
tag = {platform},
timestamp = {Wed, 16 Oct 2019 14:14:53 +0200},
url = {https://doi.org/10.1109/ISORC.2013.6913199},
what = {This paper presents F6COM, a component model for resource-constrained and dynamic space-based computing environments like fractionated spacecraft. The model provides abstractions for composing reusable software components while supporting fault management, security constraints, and dynamic reconfiguration in resource-limited platforms.},
why = {Space computing platforms present unique challenges with limited resources, dynamic network conditions, and critical security requirements. F6COM is innovative in designing a component model tailored specifically for space environments, providing fine-grained control over component interactions, security policies, and execution states while maintaining simplicity and efficiency.},
results = {The F6COM component model demonstrates support for multiple interaction patterns (synchronous, asynchronous, timed), contract-based programming with pre/post-conditions, and complex component lifecycle management. The architecture successfully balances robustness and flexibility needed for space mission applications while remaining resource-efficient.},
project_tags = {CPS, middleware}
}
Component-based programming models are well-suited to the design of large-scale, distributed applications because of the ease with which distributed functionality can be developed, deployed, and validated using the models’ compositional properties. Existing component models supported by standardized technologies, such as the OMG’s CORBA Component Model (CCM), however, incur a number of limitations in the context of cyber physical systems (CPS) that operate in highly dynamic, resource-constrained, and uncertain environments, such as space environments, yet require multiple quality of service (QoS) assurances, such as timeliness, reliability, and security. To overcome these limitations, this paper presents the design of a novel component model called F6COM that is developed for applications operating in the context of a cluster of fractionated spacecraft. Although F6COM leverages the compositional capabilities and port abstractions of existing component models, it provides several new features. Specifically, F6COM abstracts the component operations as tasks, which are scheduled sequentially based on a specified scheduling policy. The infrastructure ensures that at any time at most one task of a component can be active - eliminating race conditions and deadlocks without requiring complicated and error-prone synchronization logic to be written by the component developer. These tasks can be initiated due to (a) interactions with other components, (b) expiration of timers, both sporadic and periodic, and (c) interactions with input/output devices. Interactions with other components are facilitated by ports. To ensure secure information flows, every port of an F6COM component is associated with a security label such that all interactions are executed within a security context. Thus, all component interactions can be subjected to Mandatory Access Control checks by a Trusted Computing Base that facilitates the interactions. 
Finally, F6COM provides capabilities to monitor task execution deadlines and to configure component-specific fault mitigation actions.
@article{Pradhan2013,
author = {Pradhan, Subhav and Otte, William and Dubey, Abhishek and Gokhale, Aniruddha S. and Karsai, Gabor},
journal = {{SIGBED} Review},
title = {Towards a resilient deployment and configuration infrastructure for fractionated spacecraft},
year = {2013},
number = {4},
pages = {29--32},
volume = {10},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/sigbed/PradhanODGK13},
contribution = {lead},
doi = {10.1145/2583687.2583694},
file = {:Pradhan2013-Towards_a_resilient_deployment_and_configuration_infrastructure_for_fractionated_spacecraft.pdf:PDF},
keywords = {deployment and configuration, fractionated spacecraft, reconfiguration, resilience, SAT solving, autonomous systems},
project = {cps-reliability,cps-middleware},
tag = {platform},
timestamp = {Tue, 06 Nov 2018 00:00:00 +0100},
url = {https://doi.org/10.1145/2583687.2583694},
what = {This paper addresses resilient deployment and configuration for fractionated spacecraft systems, proposing key architectural ideas for enabling autonomous reconfiguration in response to faults. The work focuses on extending the D&C infrastructure to support component redeployment and system-level reconfiguration in highly dynamic space environments.},
why = {Fractionated spacecraft operate in resource-constrained environments where human intervention is infeasible, requiring autonomous adaptation to maintain mission functionality. This work is innovative in developing reconfiguration capabilities grounded in formal methods (SAT solving) while maintaining the ability to operate under resource constraints typical of space systems.},
results = {The paper presents a reconfiguration infrastructure that leverages SAT solvers to identify valid system configurations matching specified functional requirements. The approach enables dynamic redeployment of components and system-level adaptation, providing fractionated spacecraft with autonomous resilience capabilities.},
project_tags = {CPS, scalable AI}
}
Fractionated spacecraft are clusters of small, independent modules that interact wirelessly to realize the functionality of a traditional monolithic spacecraft. System F6 (F6 stands for Future, Fast, Flexible, Fractionated, Free-Flying spacecraft) is a DARPA program for fractionated spacecraft. Software applications in F6 are implemented in the context of the F6 Information Architecture Platform (IAP), which provides component-based abstractions for composing distributed applications. The lifecycle of these distributed applications must be managed autonomously by a deployment and configuration (D&C) infrastructure, which can redeploy and reconfigure the running applications in response to faults and other anomalies that may occur during system operation. Addressing these D&C requirements is hard due to the significant fluctuation in resource availabilities, constraints on resources, and safety and security concerns. This paper presents the key architectural ideas that are required in realizing such a D&C infrastructure.
@inproceedings{Chen2012,
author = {Chen, Qian and Mehrotra, R. and Dubey, Abhishek and Abdelwahed, S. and Rowland, K.},
booktitle = {2012 Proceedings of IEEE Southeastcon},
title = {On state of the art in virtual machine security},
year = {2012},
month = mar,
pages = {1--6},
category = {conference},
contribution = {minor},
doi = {10.1109/SECon.2012.6196905},
file = {:Chen2012-On_state_of_the_art_in_virtual_machine_security.pdf:PDF},
issn = {1091-0050},
keywords = {virtualization, security, virtual machines, data centers, web services, threat analysis, VMware, Xen},
what = {This paper surveys virtualization technologies and security vulnerabilities in virtual machine environments, focusing on data center and web services infrastructure. The work provides comprehensive coverage of current virtualization approaches including full virtualization, paravirtualization, and application virtualization, along with associated security challenges.},
why = {Virtualization has become ubiquitous in modern computing infrastructure, but security challenges remain inadequately addressed. This work is significant in systematically surveying security issues across multiple virtualization layers and providing practical guidance on threat mitigation for enterprise systems.},
results = {The survey identifies and categorizes security threats across virtualization layers including VM isolation, resource management, migration issues, and communication vulnerabilities. It provides recommendations for improving virtual machine security through access control policies, trusted platform modules, and careful configuration management.},
project_tags = {CPS, middleware}
}
Data centers and computing service providers are striving to improve the utilization of their computing resources. This is primarily due to the need of resources to be more economical and power efficient. Virtualization is one of the concepts that provide flexibility to host multiple operating system stacks on a single hardware. By effectively partitioning the computing resources, it reduces the total number of physical servers and consolidates several services on a single physical rack. Each virtual machine behaves like an independent machine (may be duplicate of the original one) while the scheduling of hardware resources among different virtual machines is performed with the help of a Virtual Machine Monitor (VMM). Proliferation of virtual machines in the enterprise architecture creates need for identification of potential security risks as well as appropriate solutions for the identified risks to ensure the integrity of the underlying applications hosted at the virtual machines. This paper describes available virtualization technologies, corresponding security vulnerabilities, and available solutions.
@inproceedings{Dubey2012,
author = {Dubey, Abhishek and Emfinger, W. and Gokhale, A. and Karsai, G. and Otte, W. R. and Parsons, J. and Szabo, C. and Coglio, A. and Smith, E. and Bose, P.},
booktitle = {2012 IEEE Aerospace Conference},
title = {A software platform for fractionated spacecraft},
year = {2012},
month = mar,
pages = {1--20},
category = {conference},
contribution = {lead},
doi = {10.1109/AERO.2012.6187334},
file = {:Dubey2012-A_software_platform_for_fractionated_spacecraft.pdf:PDF},
issn = {1095-323X},
keywords = {software platform architecture, fractionated spacecraft, operating systems, middleware, component models, fault management, security},
tag = {platform},
what = {This paper presents a layered software platform architecture for fractionated spacecraft, consisting of a novel operating system, a middleware layer, and component-structured applications. The operating system provides primitives for concurrency, synchronization, and secure information flows; the middleware supports request/response and publish/subscribe interactions; and the component model enables building distributed applications from modular, reusable components.},
why = {Fractionated spacecraft pose novel software challenges because the hardware platform is inherently distributed with highly fluctuating connectivity, and must support autonomous fault management, real-time performance, and multiple organizations operating in separate security domains. This work is innovative in addressing multi-level security and multi-layered fault management as cross-cutting aspects at all levels of the architecture.},
results = {The platform demonstrates a complete layered architecture in which applications are composed from reusable components that interact only through well-defined mechanisms. The complexity of application creation and system integration is mitigated through a domain-specific model-driven development process with a dedicated modeling language, graphical tools, software generators for infrastructure code, and model-based analysis for verification and validation.},
project_tags = {CPS, scalable AI}
}
A fractionated spacecraft is a cluster of independent modules that interact wirelessly to maintain cluster flight and realize the functions usually performed by a monolithic satellite. This spacecraft architecture poses novel software challenges because the hardware platform is inherently distributed, with highly fluctuating connectivity among the modules. It is critical for mission success to support autonomous fault management and to satisfy real-time performance requirements. It is also both critical and challenging to support multiple organizations and users whose diverse software applications have changing demands for computational and communication resources, while operating on different levels and in separate domains of security. The solution proposed in this paper is based on a layered architecture consisting of a novel operating system, a middleware layer, and component-structured applications. The operating system provides primitives for concurrency, synchronization, and secure information flows; it also enforces application separation and resource management policies. The middleware provides higher-level services supporting request/response and publish/subscribe interactions for distributed software. The component model facilitates the creation of software applications from modular and reusable components that are deployed in the distributed system and interact only through well-defined mechanisms. Two cross-cutting aspects - multi-level security and multi-layered fault management - are addressed at all levels of the architecture. The complexity of creating applications and performing system integration is mitigated through the use of a domain-specific model-driven development process that relies on a dedicated modeling language and its accompanying graphical modeling tools, software generators for synthesizing infrastructure code, and the extensive use of model-based analysis for verification and validation.
@techreport{Dubey2012b,
author = {Dubey, Abhishek and Karsai, Gabor and Mahadevan, Nagabhushan},
institution = {Institute for Software Integrated Systems, Vanderbilt University},
title = {Formalization of a Component Model for Real-time Systems},
year = {2012},
month = apr,
number = {ISIS-12-102},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/ISIS-12-102-TechReport.pdf},
contribution = {lead},
file = {:Dubey2012b-Formalization_of_a_Component_Model_for_Real-time_Systems.pdf:PDF},
issn = {ISIS-12-102},
keywords = {component models, formal semantics, ARINC-653, real-time systems, timed transition traces, safety-critical systems},
tag = {platform},
what = {This technical report presents a formalization of a component model for hard real-time systems that relies on the services of an ARINC-653 compliant real-time operating system platform. The model provides high-level abstractions of component interactions, both synchronous and asynchronous, formalized as timed transition traces.},
why = {Component-based software development for real-time systems necessitates a well-defined component model that allows compositional analysis and reasoning, since many problems in these systems arise from poorly understood and analyzed component interactions. This work is significant in providing formal semantics that enable derivation of system-level properties from models of component assemblies.},
results = {The formalization captures component interaction semantics through timed transition traces, covering both synchronous and asynchronous cases. The approach enables derivation of system-level properties such as fault propagation graphs from component assemblies, and the report discusses fault propagation templates for this component model.},
project_tags = {middleware, CPS}
}
Component-based software development for real-time systems necessitates a well-defined ‘component model’ that allows compositional analysis and reasoning about systems. Such a model defines what a component is, how it works, and how it interacts with other components. It is especially important for real-time systems to have such a component model, as many problems in these systems arise from poorly understood and analyzed component interactions. In this paper we describe a component model for hard real-time systems that relies on the services of an ARINC-653 compliant real-time operating system platform. The model provides high-level abstractions of component interactions, both for the synchronous and asynchronous case. We present a formalization of the component model in the form of timed transition traces. Such formalization is necessary to be able to derive interesting system level properties such as fault propagation graphs from models of component assemblies. We provide a brief discussion about such system level fault propagation templates for this component model.
@techreport{Dubey2012c,
author = {Dubey, Abhishek and Mahadevan, Nagabhushan and Karsai, Gabor},
institution = {Institute for Software Integrated Systems, Vanderbilt University},
title = {The Inertial Measurement Unit Example: A Software Health Management Case Study},
year = {2012},
month = feb,
number = {ISIS-12-101},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/TechReport_IMU.pdf},
contribution = {lead},
file = {:Dubey2012c-The_Inertial_Measurement_Unit_Example.pdf:PDF},
issn = {ISIS-12-101},
keywords = {software health management, inertial measurement unit, case study, fault detection, diagnosis, mitigation, ARINC-653},
tag = {platform},
what = {This technical report presents a detailed case study applying a two-level Software Health Management strategy to a real-life Inertial Measurement Unit subsystem. The report describes in detail the design of both the component-level and system-level health management strategies.},
why = {Design-time testing and verification cannot cover all fault scenarios a system may encounter at runtime, so runtime health management is needed to improve dependability. This case study is significant in demonstrating, on a realistic subsystem, how the monitor/detect/diagnose/mitigate approach to Software Health Management is concretely designed and exercised.},
results = {The report documents the successful adaptation of the monitor, detect, diagnose, and mitigate approach to Software Health Management, with results presented as relevant portions of detailed execution logs for the Inertial Measurement Unit example.},
project_tags = {CPS, scalable AI}
}
This report captures in detail a Two-level Software Health Management strategy on a real-life example of an Inertial Measurement Unit subsystem. We describe in detail the design of the component and system level health management strategy. Results are expressed as relevant portions of the detailed logs that shows the successful adaptation of the monitor/ detect/ diagnose/ mitigate approach to Software Health Management.
@techreport{4574,
author = {Monceaux, Weston P and Evans, Deland E and Rappold, Keith N and Butler, Cary D and Abdelwahed, Sherif and Mehrotra, Rajat and Dubey, Abhishek},
institution = {DTIC Document},
title = {Implementing Autonomic Computing Methods to Improve Attack Resilience in Web Services},
year = {2012},
contribution = {minor},
keywords = {performance},
pages = {422}
}
@inbook{Chalfant2012,
author = {Chalfant, Julie and Langland, Blake and Abdelwahed, Sherif and Chryssostomidis, Chryssostomos and Dougal, Roger and Dubey, Abhishek and El Mezyani, Touria and Herbst, JD and Kiehne, Thomas and Ordonez, Juan and others},
title = {A collaborative early-stage ship design environment},
year = {2012},
contribution = {minor},
file = {:Chalfant2012-A_collaborative_early-stage_ship_design_environment.pdf:PDF},
journal = {CEM Publications},
keywords = {ship design, collaborative engineering, multidisciplinary design, systems integration, design tools, data management},
what = {This paper describes the Smart Ship System Design (S3D) environment developed for collaborative early-stage ship design. The work presents tools and methodologies that enable multidisciplinary teams to collaborate on ship design while addressing challenges in systems integration, data exchange, and design optimization.},
why = {Modern naval ship design involves complex interactions between multiple engineering disciplines and requires integration of various specialized tools. The S3D environment is innovative in providing a collaborative framework that enables designers from different disciplines to work in parallel while maintaining design consistency across different views and tools.},
results = {The S3D environment demonstrates successful integration of multiple specialized design tools including ASSET, Paramarine, and domain-specific analysis tools. The platform enables collaborative design with shared equipment catalogs, automated data propagation across tools, and support for multidisciplinary design optimization.},
project_tags = {energy, planning}
}
Recent advances in sensor and weapons systems are significantly increasing the electrical power that is required and the thermal loads that must be dissipated onboard US Navy ships. Thus, design tools and methods must bring detailed consideration of all disciplines early in the design process, including electrical, thermal and controls in addition to the traditional naval architecture and marine engineering. Effective interface of the multiple disciplines demands a collaborative design process. The Electric Ship Research and Development Consortium (ESRDC) has developed the backbone structure of a collaborative design environment with the goal of bringing together many disciplines early in the ship design process. This design environment brings many innovations, especially in the arena of simultaneous collaborative design. This paper describes the Smart Ship System Design (S3D) environment as developed to date, along with overall and discipline-specific visions of implementation of the environment in ship design.
@inproceedings{Dabholkar2012,
author = {Dabholkar, Akshay and Dubey, Abhishek and Gokhale, Aniruddha S. and Karsai, Gabor and Mahadevan, Nagabhushan},
booktitle = {{IEEE} 31st Symposium on Reliable Distributed Systems, {SRDS} 2012, Irvine, CA, USA, October 8-11, 2012},
title = {Reliable Distributed Real-Time and Embedded Systems through Safe Middleware Adaptation},
year = {2012},
pages = {362--371},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/srds/DabholkarDGKM12},
category = {selectiveconference},
contribution = {lead},
acceptance = {25},
doi = {10.1109/SRDS.2012.59},
file = {:Dabholkar2012-Reliable_Distributed_Real-Time_and_Embedded_Systems_through_Safe_Middleware_Adaptation.pdf:PDF},
keywords = {middleware, fault tolerance, resource-aware systems, real-time systems, adaptive failure management, distributed systems},
project = {cps-reliability,cps-middleware},
tag = {platform},
timestamp = {Wed, 16 Oct 2019 14:14:49 +0200},
url = {https://doi.org/10.1109/SRDS.2012.59},
what = {This paper presents SafeMAT, a middleware-based fault tolerance solution for distributed real-time and embedded (DRE) systems that addresses resource constraints while maintaining fault tolerance capabilities. The work introduces distributed resource monitoring, adaptive failure management, and dynamic reconfiguration for safely adapting to faults.},
why = {DRE systems face unique challenges in providing fault tolerance while operating under strict resource constraints and real-time constraints. SafeMAT is innovative in leveraging middleware-based adaptation to provide resource-aware failure management, enabling systems to gracefully degrade and recover from different failure types without compromising safety properties.},
results = {SafeMAT demonstrates a three-component architecture with distributed resource monitoring for tracking system resource utilization, adaptive failure management for selecting appropriate recovery strategies, and dynamic reconfiguration for modifying system structure at runtime. The approach shows 9-15% overhead while providing safe and predictable failure adaptability.},
project_tags = {middleware, CPS}
}
Distributed real-time and embedded (DRE) systems are a class of real-time systems formed through a composition of predominantly legacy, closed and statically scheduled real-time subsystems, which comprise over-provisioned resources to deal with worst-case failure scenarios. The formation of the system-of-systems leads to a new range of faults that manifest at different granularities for which no statically defined fault tolerance scheme applies. Thus, dynamic and adaptive fault tolerance mechanisms are needed which must execute within the available resources without compromising the safety and timeliness of existing real-time tasks in the individual subsystems. To address these requirements, this paper describes a middleware solution called Safe Middleware Adaptation for Real-Time Fault Tolerance (SafeMAT), which opportunistically leverages the available slack in the over-provisioned resources of individual subsystems. SafeMAT comprises three primary artifacts: (1) a flexible and configurable distributed, runtime resource monitoring framework that can pinpoint in real-time the available slack in the system that is used in making dynamic and adaptive fault tolerance decisions, (2) a safe and resource aware dynamic failure adaptation algorithm that enables efficient recovery from different granularities of failures within the available slack in the execution schedule while ensuring real-time constraints are not violated and resources are not overloaded, and (3) a framework that empirically validates the correctness of the dynamic mechanisms and the safety of the DRE system. Experimental results evaluating SafeMAT on an avionics application indicates that SafeMAT incurs only 9-15% runtime fail over and 2-6% processor utilization overheads thereby providing safe and predictable failure adaptability in real-time.
@inproceedings{Dubey2012a,
author = {Dubey, Abhishek and Mahadevan, Nagabhushan and Karsai, Gabor},
booktitle = {The Eighth International Conference on Autonomic and Autonomous Systems},
title = {A deliberative reasoner for model-based software health management},
year = {2012},
pages = {86--92},
category = {selectiveconference},
contribution = {lead},
note = {Best Paper Award},
acceptance = {23},
file = {:Dubey2012a-A_Deliberative_Reasoner_for_Model-Based_Software_Health_Management.pdf:PDF},
keywords = {deliberative reasoning, software health management, fault diagnosis, component reconfiguration, SAT solvers, autonomous computing},
tag = {platform},
what = {This paper presents a deliberative reasoner for model-based software health management that uses goal-oriented reasoning to identify alternative component configurations for restoring system functionality. The work combines TFPG-based diagnosis with SAT-based reconfiguration planning to enable autonomous system recovery.},
why = {Complex systems often have multiple alternative configurations that can restore functionality after failures, but identifying valid alternatives under constraints is computationally challenging. This work is innovative in applying deliberative reasoning techniques to search the configuration space and identify recovery actions suitable for autonomous execution.},
results = {The deliberative mitigation approach demonstrates use of Boolean Satisfiability solvers to identify valid component reconfigurations that restore functionality. The system successfully handles complex fault scenarios in simulated Inertial Measurement Units and can be applied to various component assemblies through model specification.},
project_tags = {scalable AI, Explainable AI}
}
While traditional design-time and off-line approaches to testing and verification contribute significantly to improving and ensuring high dependability of software, they may not cover all possible fault scenarios that a system could encounter at runtime. Thus, runtime health management of complex embedded software systems is needed to improve their dependability. Our approach to Software Health Management uses concepts from the field of Systems Health Management: detection, diagnosis and mitigation. In earlier work we had shown how to use a reactive mitigation strategy specified using a timed state machine model for system health manager. This paper describes the algorithm and key concepts for an alternative approach to system mitigation using a deliberative strategy, which relies on a function-allocation model to identify alternative component-assembly configurations that can restore the functions needed for the goals of the system.
@inproceedings{Mahadevan2012,
author = {Mahadevan, Nagabhushan and Dubey, Abhishek and Karsai, Gabor},
booktitle = {15th {IEEE} International Symposium on Object/Component/Service-Oriented Real-Time Distributed Computing, {ISORC} 2012, Shenzhen, China, April 11-13, 2012},
title = {Architecting Health Management into Software Component Assemblies: Lessons Learned from the {ARINC-653} Component Model},
year = {2012},
pages = {79--86},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/isorc/MahadevanDK12},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/ISORC.2012.19},
file = {:Mahadevan2012-Architecting_Health_Management_into_Software_Component_Assemblies.pdf:PDF},
keywords = {software health management, component assemblies, ARINC-653, fault management, real-time systems, runtime mitigation},
project = {cps-reliability,cps-middleware},
tag = {platform},
timestamp = {Wed, 16 Oct 2019 14:14:53 +0200},
url = {https://doi.org/10.1109/ISORC.2012.19},
what = {This paper presents lessons learned from architecting a two-level health management strategy into assemblies of software components. The components interact via well-defined concurrency patterns and execute on a real-time component framework built upon ARINC-653 platform services, with health management applied at both the component and system levels.},
why = {Complex real-time software systems require an active fault management capability, because testing, verification, and validation alone cannot prevent all unacceptable behaviors at run-time. This work is significant in transferring experience from the field of Systems Health Management to component-based software systems and in reporting practical architectural lessons.},
results = {The paper distills lessons learned in architecting and applying a two-level (component-level and system-level) health management strategy to software component assemblies, showing how detection, diagnosis, and mitigation of unacceptable behaviors can be integrated into an ARINC-653-based component framework at run-time.},
project_tags = {CPS}
}
Complex real-time software systems require an active fault management capability. While testing, verification and validation schemes and their constant evolution help improve the dependability of these systems, an active fault management strategy is essential to potentially mitigate the unacceptable behaviors at run-time. In our work we have applied the experience gained from the field of Systems Health Management towards component-based software systems. The software components interact via well-defined concurrency patterns and are executed on a real-time component framework built upon ARINC-653 platform services. In this paper, we present the lessons learned in architecting and applying a two-level health management strategy to assemblies of software components.
@inbook{Mehrotra2012,
author = {Mehrotra, Rajat and Dubey, Abhishek and Abdelwahed, Sherif and Tantawi, Asser N.},
pages = {621--648},
publisher = {CRC Press},
title = {Power-Aware Modeling and Autonomic Management Framework for Distributed Computing Systems},
year = {2012},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/reference/crc/MehrotraDAT12},
booktitle = {Handbook of Energy-Aware and Green Computing - Two Volume Set},
contribution = {colab},
doi = {10.1201/b16631-34},
file = {:Mehrotra2012-Power-Aware_Modeling_and_Autonomic_Management_Framework_for_Distributed_Computing_Systems.pdf:PDF},
keywords = {power management, distributed systems, autonomic computing, predictive control, energy efficiency, QoS management},
project = {cps-middleware},
tag = {platform},
timestamp = {Wed, 12 Jul 2017 01:00:00 +0200},
url = {http://www.crcnetbase.com/doi/abs/10.1201/b16631-34},
what = {This paper presents a power-aware modeling and autonomic management framework for distributed computing systems. The work develops a predictive power management approach that combines power consumption modeling with control techniques to minimize power consumption while maintaining desired system response times.},
why = {Energy consumption is a critical concern in large-scale distributed computing systems, yet traditional approaches lack sophistication in managing power while maintaining quality of service. This work is innovative in applying control-theoretic approaches to autonomously manage power consumption, adapting system behavior to workload characteristics and environmental conditions.},
results = {The framework demonstrates development of power consumption models for multi-tier systems using offline regression and online Kalman filtering techniques. Predictive control approaches successfully minimize power consumption while maintaining response time constraints, showing significant potential for reducing operational costs and environmental impact.},
project_tags = {energy, scalable AI, CPS}
}
@inproceedings{Mehrotra2012a,
author = {Mehrotra, Rajat and Dubey, Abhishek and Abdelwahed, Sherif and Rowland, Krisa W.},
booktitle = {The 8th International Conference on Autonomic and Autonomous Systems {ICAS} 2012},
title = {{RFDMon}: A Real-time and Fault-tolerant Distributed System Monitoring Approach},
year = {2012},
category = {selectiveconference},
contribution = {lead},
acceptance = {23},
file = {:Mehrotra2012a-RFDMon_A_real-time_and_fault-tolerant_distributed_system_monitoring_approach.pdf:PDF},
keywords = {distributed monitoring, fault tolerance, quality of service, middleware, hierarchical management, ARINC-653, real-time systems},
tag = {platform},
what = {This work presents RFDMon, a real-time and fault-tolerant distributed system monitoring framework built on Data Distribution Services middleware. The framework measures system variables like CPU utilization, memory usage, network bandwidth, and application performance metrics across heterogeneous computing nodes. It organizes monitoring sensors into regions with local managers, regional leaders, and a global membership manager to enable hierarchical and scalable monitoring. The approach uses spatial and temporal partitioning to isolate monitoring data collection and ensure periodic updates.},
why = {As distributed systems grow in complexity and scale, monitoring becomes critical for identifying failures and performance bottlenecks before they impact users. RFDMon innovates by combining the flexibility of Data Distribution Services with fault-tolerant hierarchical management to achieve minimal latency monitoring without overwhelming system resources. The work addresses the gap between traditional client-server monitoring models and the needs of modern large-scale computing infrastructure by supporting dynamic node additions and automatic fault diagnosis.},
results = {The framework successfully monitors large clusters of 100 to 800 computing nodes with minimal overhead on computational resources. Experimental results demonstrate that the system identifies infrastructure faults in real-time with minimal delay and can reconfigure itself automatically to resume monitoring. The hierarchical architecture scales effectively while maintaining low communication overhead, and the framework integrates cleanly with fault diagnosis modules to support comprehensive system health management.},
project_tags = {middleware, CPS, scalable AI}
}
One of the main requirements for building an autonomic system is to have a robust monitoring framework. In this paper, a systematic distributed event based (DEB) system monitoring framework “RFDMon” is presented for measuring system variables (CPU utilization, memory utilization, disk utilization, network utilization, etc.), system health (temperature and voltage of Motherboard and CPU) application performance variables (application response time, queue size, and throughput), and scientific application data structures (PBS information and MPI variables) accurately with minimum latency at a specified rate and with controllable resource utilization. This framework is designed to be tolerant to faults in monitoring framework, self-configuring (can start and stop monitoring the nodes and configure monitors for threshold values/changes for publishing the measurements), aware of execution of the framework on multiple nodes through HEARTBEAT messages, extensive (monitors multiple parameters through periodic and aperiodic sensors), resource constrainable (computational resources can be limited for monitors), and expandable for adding extra monitors on the fly. Since RFDMon uses a Data Distribution Services (DDS) middleware, it can be used for deploying in systems with heterogeneous nodes. Additionally, it provides a functionality to limit the maximum cap on resources consumed by monitoring processes such that it reduces the effect on the availability of resources for the applications.
@inproceedings{Mehrotra2011,
author = {Mehrotra, Rajat and Dubey, Abhishek and Abdelwahed, Sherif and Monceaux, W.},
booktitle = {2011 Eighth IEEE International Conference and Workshops on Engineering of Autonomic and Autonomous Systems},
title = {Large Scale Monitoring and Online Analysis in a Distributed Virtualized Environment},
year = {2011},
month = apr,
pages = {1--9},
category = {conference},
contribution = {colab},
doi = {10.1109/EASe.2011.17},
file = {:Mehrotra2011-Large_Scale_Monitoring_and_Online_Analysis_in_a_Distributed_Virtualized_Environment.pdf:PDF},
issn = {2168-1872},
keywords = {virtualized systems, quality of service, system modeling, power management, Kalman filtering, event-based monitoring},
tag = {platform},
what = {This paper describes a large-scale event-based monitoring approach for distributed systems in virtualized environments using comprehensive measurement of system variables including CPU, memory, disk utilization, and application-level metrics. The framework measures both physical/virtual CPU utilization and application variables like queue waiting time and service time. It provides extensive data processing utilities including synchronization scripts, monitoring sensors, Xenmon integration, power consumption modeling, and Kalman filter-based system identification techniques.},
why = {Virtualization has created new challenges for performance monitoring since multiple applications share physical resources and dynamic migration changes system configuration. This work innovates by combining multiple monitoring utilities with advanced system modeling techniques including exponential Kalman filters to predict computational resource requirements and power consumption. The integration of online monitoring with system modeling enables dynamic control of virtualized environments to maintain QoS requirements while minimizing operational costs.},
results = {Experiments in virtualized environments demonstrate accurate prediction of system behavior and effective online analysis of monitoring data. The framework successfully tracks system state changes across multiple virtual machines with minimal latency, and the Kalman filter approach enables accurate estimation of service time and delay without additional performance burden. Integration with a feedback controller shows effective management of system resources to maintain predefined QoS parameters.},
project_tags = {middleware, CPS, ML for CPS}
}
Due to increase in number and complexity of the large scale systems, performance monitoring and multidimensional quality of service (QoS) management has become a difficult and error prone task for system administrators. Recently, the trend has been to use virtualization technology, which facilitates hosting of multiple distributed systems with minimum infrastructure cost via sharing of computational and memory resources among multiple instances, and allows dynamic creation of even bigger clusters. An effective monitoring technique should not only be fine grained with respect to the measured variables, but also should be able to provide a high level overview of the distributed systems to the administrator of all variables that can affect the QoS requirements. At the same time, the technique should not add performance burden to the system. Finally, it should be integrated with a control methodology that manages performance of the enterprise system. In this paper, a systematic distributed event based (DEB) performance monitoring approach is presented for distributed systems by measuring system variables (physical/virtual CPU utilization and memory utilization), application variables (application queue size, queue waiting time, and service time), and performance variables (response time, throughput, and power consumption) accurately with minimum latency at a specified rate. Furthermore, we have shown that proposed monitoring approach can be utilized to provide input to an application monitoring utility to understand the underlying performance model of the system for a successful on-line control of the distributed systems for achieving predefined QoS parameters.
@inproceedings{Dubey2011a,
author = {Dubey, Abhishek and Karsai, Gabor and Mahadevan, Nagabhushan},
booktitle = {2011 Aerospace Conference},
title = {Model-based software health management for real-time systems},
year = {2011},
month = mar,
pages = {1--18},
category = {conference},
contribution = {lead},
doi = {10.1109/AERO.2011.5747559},
file = {:Dubey2011a-Model-based_software_health_management_for_real-time_systems.pdf:PDF},
issn = {1095-323X},
keywords = {software health management, fault diagnosis, mitigation, model-based design, ARINC-653, component architecture},
tag = {platform},
what = {This paper presents model-based software health management techniques for real-time systems that detect, diagnose, and mitigate faults in complex software components. The approach applies traditional System Health Management from avionics to software using component-based architecture and the ARINC Component Model. It introduces a two-level hierarchy with Component-level Health Management detecting anomalies in individual components and System-level Health Management managing overall system health. The framework includes monitoring sensors, anomaly detection via runtime verification, and mitigating actions through timed fault propagation.},
why = {Software in cyber-physical systems like aircraft increasingly implements critical functionality, yet existing fault tolerance techniques rely on hardware redundancy and are inadequate for latent software defects. This work innovates by adapting aviation safety principles to software systems using model-based approaches and component architecture to enable fault containment and recovery. The hierarchical health management structure provides both localized quick recovery and global system-wide diagnosis without requiring system redesign.},
results = {The framework successfully applies model-based health management to the Boeing 777 Air Data Inertial Reference Unit case study, detecting and mitigating effects of component-level failures such as failed accelerometers. Experimental results demonstrate that the system can identify root failure sources using timed fault propagation and automatically execute mitigation strategies. The approach enables systems to recover from individual component failures while maintaining overall functionality and preventing failure cascade.},
project_tags = {CPS, middleware, Explainable AI}
}
Complexity of software systems has reached the point where we need run-time mechanisms that can be used to provide fault management services. Testing and verification may not cover all possible scenarios that a system will encounter, hence a simpler, yet formally specified run-time monitoring, diagnosis, and fault mitigation architecture is needed to increase the software system’s dependability. The approach described in this paper borrows concepts and principles from the field of “Systems Health Management” for complex systems and implements a two level health management strategy that can be applied through a model-based software development process. The Component-level Health Manager (CLHM) for software components provides a localized and limited functionality for managing the health of a component locally. It also reports to the higher-level System Health Manager (SHM) which manages the health of the overall system. SHM consists of a diagnosis engine that uses the timed fault propagation (TFPG) model based on the component assembly. It reasons about the anomalies reported by CLHM and hypothesizes about the possible fault sources. Thereafter, necessary system level mitigation action can be taken. System-level mitigation approaches are subject of on-going investigations and have not been included in this paper. We conclude the paper with case study and discussion.
@techreport{Mahadevan2011a,
author = {Mahadevan, Nagabhushan and Dubey, Abhishek and Karsai, Gabor},
institution = {Institute For Software Integrated Systems, Vanderbilt University},
title = {A Case Study On The Application of Software Health Management Techniques},
year = {2011},
address = {Nashville},
month = jan,
number = {ISIS-11-101},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/ADIRUTechReport.pdf},
contribution = {colab},
file = {:Mahadevan2011a-A_case_study_on_the_application_of_software_health_management_techniques.pdf:PDF},
tag = {platform},
what = {This work presents a comprehensive case study applying software health management techniques to the Boeing 777 Air Data Inertial Reference Unit using timed failure propagation graphs for real-time fault diagnosis. The authors develop models of the ADIRU software architecture using the ARINC Component Model and apply component-level health management to detect anomalies in individual processing modules. The framework integrates formal methods for specifying monitoring conditions and uses event-based anomaly detection with timed state machines to diagnose faults in real-time.},
why = {As commercial aircraft increasingly rely on integrated modular avionics where software implements safety-critical functions, the ability to diagnose and mitigate software faults in real-time becomes essential for maintaining aircraft safety. This work innovates by developing practical techniques to model complex software systems and automatically generate health managers that can detect subtle faults missed by traditional testing. The model-based approach enables diagnosis of latent software defects that may escape rigorous verification and testing during development.},
results = {The case study demonstrates successful detection and diagnosis of injected faults in an emulated ADIRU system using the component framework and monitoring-based approach. Results show that the system can identify specific component failures and characterize their effects on system output through timed fault propagation analysis. The code generation tools successfully produce runtime code that performs health monitoring with acceptable performance overhead while maintaining strict timing properties required for real-time avionics.},
keywords = {software health management, avionics, fault diagnosis, timed failure propagation, real-time systems, component models},
project_tags = {CPS, middleware, Explainable AI}
}
Ever increasing complexity of software used in large-scale, safety critical cyber-physical systems makes it increasingly difficult to expose and thence correct all potential bugs. There is a need to augment the existing fault tolerance methodologies with new approaches that address latent software bugs exposed at runtime. This paper describes an approach that borrows and adapts traditional ‘Systems Health Management’ techniques to improve software dependability through simple formal specification of runtime monitoring, diagnosis and mitigation strategies. The two-level approach of Health Management at Component and System level is demonstrated on a simulated case study of an Air Data Inertial Reference Unit (ADIRU). That subsystem was categorized as the primary failure source for the in-flight upset caused in the Malaysian Air flight 124 over Perth, Australia in August 2005.
@inbook{Abdelwahed2011,
author = {Abdelwahed, Sherif and Dubey, Abhishek and Karsai, Gabor and Mahadevan, Nagabhushan},
chapter = {9},
pages = {285},
publisher = {CRC Press},
title = {Model-based Tools and Techniques for Real-Time System and Software Health Management},
year = {2011},
booktitle = {Machine Learning and Knowledge Discovery for Engineering Systems Health Management},
contribution = {colab},
doi = {10.1201/b11580-15},
keywords = {distributed monitoring, autonomous systems, fault detection, hierarchical architecture, quality of service, middleware},
organization = {CRC Press},
tag = {platform},
url = {https://doi.org/10.1201/b11580},
internal-note = {NOTE(review): the what/why/results fields below appear to describe the RFDMon monitoring framework (cf. entry Mehrotra2012a), not this chapter's model-based health management content -- verify against the published chapter},
what = {This paper introduces RFDMon, a distributed event-based monitoring framework for autonomous systems that achieves real-time fault diagnosis and recovery through scalable hierarchical monitoring architecture. The system monitors infrastructure resources including CPU utilization, memory usage, network bandwidth, and hardware health across 100 to 800 computing nodes. It organizes monitoring into regions with local managers and global membership managers to enable efficient information dissemination while maintaining low communication overhead and minimal latency.},
why = {Autonomous computing infrastructure imposes stringent requirements for consistency, synchronization, and security across multiple nodes, and manual monitoring of large-scale systems is neither scalable nor reliable. RFDMon innovates by leveraging Data Distribution Services middleware to decouple publishers and subscribers, enabling flexible and extensible monitoring system architecture. The hierarchical organization with regional coordination provides both scalability for large systems and resilience through distributed fault management without central bottlenecks.},
results = {The framework successfully monitors large heterogeneous computing clusters with minimal resource consumption, automatically detecting infrastructure faults with low latency and minimal delay. Experimental deployments in production environments demonstrate effective scalability to hundreds of nodes with automatic self-reconfiguration when failures occur. The system enables diagnostic monitoring of infrastructure and can identify specific failed components while automatically adapting to infrastructure changes.},
project_tags = {middleware, scalable AI, CPS}
}
The ultimate challenge in system health management is the theory for and application of the technology to systems, for instance to an entire vehicle. The main problem the designer faces is complexity; simply the sheer size of the system, the number of data points, anomalies, and failure modes can be overwhelming. Furthermore, systems are heterogeneous and one has to have a systems engineer’s view to understand interactions among systems. Yet, system-level health management is crucial, as faults increasingly arise from system-level effects and interactions. While individual subsystems tend to have built-in redundancy or local anomaly detection, fault management, and prognostics features, the system integrators are required to provide the same capabilities for the entire vehicle, across different engineering subsystems and areas.
@article{Dubey2011,
author = {Dubey, Abhishek and Karsai, Gabor and Mahadevan, Nagabhushan},
journal = {Software: Practice and Experience},
title = {A component model for hard real-time systems: {CCM} with {ARINC-653}},
year = {2011},
number = {12},
pages = {1517--1550},
volume = {41},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/spe/DubeyKM11},
contribution = {lead},
doi = {10.1002/spe.1083},
file = {:Dubey2011-A_component_model_for_hard_real-time_systems_CCM_with_ARINC-653.pdf:PDF},
keywords = {component-based development, hard real-time systems, ARINC-653, CORBA, model-based design, middleware},
project = {cps-reliability,cps-middleware},
tag = {platform},
timestamp = {Sun, 28 May 2017 01:00:00 +0200},
url = {https://doi.org/10.1002/spe.1083},
what = {This paper presents a component model for hard real-time systems that extends the CORBA Component Model with ARINC-653 partitioning concepts to enable safe composition of components in critical systems. The work develops a middleware layer that bridges component-based software development with real-time operating system constraints through implementation of CORBA concepts on ARINC-653 partitions. It includes a modeling environment using Model Integrated Computing tools that enables developers to specify component assemblies and automatically generate deployment configurations and timing analysis.},
why = {The increasing complexity of real-time systems and the need for component reusability create tension with the strict isolation and timing requirements of safety-critical systems. This work innovates by demonstrating how standard component middleware concepts can be adapted to operate within hard real-time constraints while maintaining system safety properties. The model-based approach enables design-time verification of system properties while allowing developers to leverage familiar component-based development paradigms.},
results = {The framework successfully implements CORBA component semantics on top of ARINC-653 partitions using less than 15,000 lines of C++ code, demonstrating practical feasibility of the approach. Case studies with GPS example components show that the system can generate code that respects timing constraints while enabling component reuse. The modeling environment successfully captures real-time properties and enables design-time verification of system assemblies before deployment.},
project_tags = {middleware, CPS}
}
Size and complexity of software in safety critical system is increasing at a rapid pace. One technology that can be used to mitigate this complexity is component-based software development. However, in spite of the apparent benefits of a component-based approach to development, little work has been done in applying these concepts to hard real time systems. This paper improves the state of the art by making three contributions: (1) we present a component model for hard real time systems and define the semantics of different types of component interactions; (2) we present an implementation of a middleware that supports this component model. This middleware combines an open source CORBA Component Model (CCM) implementation (MICO) with ARINC-653: a state of the art RTOS standard, (3) finally; we describe a modeling environment that enables design, analysis, and deployment of component assemblies. We conclude with a discussion of lessons learned during this exercise. Our experiences point towards extending both the CCM as well as revising the ARINC-653.
@inproceedings{Mahadevan2011,
author = {Mahadevan, Nagabhushan and Dubey, Abhishek and Karsai, Gabor},
booktitle = {2011 {ICSE} Symposium on Software Engineering for Adaptive and Self-Managing Systems, {SEAMS} 2011, Waikiki, Honolulu, HI, USA, May 23-24, 2011},
title = {Application of software health management techniques},
year = {2011},
acceptance = {27},
pages = {1--10},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/icse/MahadevanDK11},
category = {selectiveconference},
contribution = {colab},
doi = {10.1145/1988008.1988010},
file = {:Mahadevan2011-Application_of_software_health_management_techniques.pdf:PDF},
keywords = {software health management, real-time systems, model-based design, component architecture, code generation, fault mitigation},
project = {cps-middleware,cps-reliability},
tag = {platform},
timestamp = {Tue, 06 Nov 2018 00:00:00 +0100},
url = {https://doi.org/10.1145/1988008.1988010},
what = {This paper applies software health management techniques to real-time systems using model-based development and the ARINC Component Model framework. The work extends traditional system health management from aerospace to software by developing component-level health managers that monitor individual components and system-level health managers that coordinate global system health. It demonstrates application of model-based design tools to automatically generate both functional code and health management code from specifications.},
why = {Software increasingly implements critical functionality in cyber-physical systems where faults can have safety consequences, yet traditional software testing and verification cannot guarantee correct behavior in all operational scenarios. This work innovates by applying proven aerospace health management principles to software systems through model-based development that automatically generates monitoring code. The hierarchical health management structure enables both localized component recovery and global system diagnosis.},
results = {The approach successfully demonstrates application of software health management to a real-time system case study with automatic code generation producing functional and monitoring code that respects timing constraints. Results show that component-level health managers can effectively detect and mitigate anomalies while system-level managers coordinate global recovery actions. The framework enables developers to specify health management requirements in models and automatically generate executable code with known timing properties.},
project_tags = {CPS, middleware, Explainable AI}
}
The growing complexity of software used in large-scale, safety critical cyber-physical systems makes it increasingly difficult to expose and hence correct all potential defects. There is a need to augment the existing fault tolerance methodologies with new approaches that address latent software defects exposed at runtime. This paper describes an approach that borrows and adapts traditional ‘System Health Management’ techniques to improve software dependability through simple formal specification of runtime monitoring, diagnosis, and mitigation strategies. The two-level approach to health management at the component and system level is demonstrated on a simulated case study of an Air Data Inertial Reference Unit (ADIRU). An ADIRU was categorized as the primary failure source for the in-flight upset caused in the Malaysian Air flight 124 over Perth, Australia in 2005.
@article{Nordstrom2011,
author = {Nordstrom, Steven and Dubey, Abhishek and Keskinpala, Turker and Neema, Sandeep and Bapty, Theodore},
journal = {Journal of Aerospace Computing, Information, and Communication},
title = {Autonomic Healing of Model-Based Systems},
year = {2011},
number = {4},
pages = {87--99},
volume = {8},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/jacic/NordstromDKNB11},
contribution = {minor},
doi = {10.2514/1.31940},
keywords = {reliability},
project = {cps-reliability},
tag = {platform},
timestamp = {Thu, 18 May 2017 01:00:00 +0200},
url = {https://doi.org/10.2514/1.31940}
}
@inproceedings{Roy2011a,
author = {Roy, Nilabja and Dubey, Abhishek and Gokhale, Aniruddha S.},
title = {Efficient Autoscaling in the Cloud Using Predictive Models for Workload Forecasting},
booktitle = {{IEEE} International Conference on Cloud Computing, {CLOUD} 2011, Washington, DC, USA, 4-9 July, 2011},
year = {2011},
pages = {500--507},
doi = {10.1109/CLOUD.2011.42},
url = {https://doi.org/10.1109/CLOUD.2011.42},
acceptance = {22.4},
category = {selectiveconference},
contribution = {colab},
file = {:Roy2011a-Efficient_Autoscaling_in_the_Cloud_Using_Predictive_Models_for_Workload_Forecasting.pdf:PDF},
keywords = {cloud computing, autoscaling, workload prediction, resource allocation, performance modeling, quality of service},
project = {cps-middleware},
tag = {platform},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/IEEEcloud/RoyDG11},
timestamp = {Wed, 16 Oct 2019 14:14:54 +0200},
what = {This paper presents efficient autoscaling algorithms for cloud environments that predict future workload and allocate resources to maintain quality of service while minimizing operational costs. The work develops a model predictive control approach that uses performance models to estimate required resources based on predicted workload. It combines multiple techniques including workload forecasting using autoregressive moving average models, response time analysis, and resource allocation optimization to determine optimal machine allocation.},
why = {Cloud computing promises cost-effective computing through dynamic resource allocation, yet static provisioning based on average load results in either performance degradation during peaks or resource waste during valleys. This work innovates by integrating workload prediction with control-theoretic optimization to make dynamic scaling decisions that satisfy performance requirements while minimizing operational costs. The predictive approach anticipates load changes rather than reacting to them, enabling faster and more efficient resource adjustments.},
results = {Experiments with realistic workload traces demonstrate that the predictive algorithm determines effective resource allocations that allocate extra machines only when predicted load increases. The approach successfully satisfies quality of service objectives while achieving significant cost savings compared to static or reactive approaches. The algorithm shows effective performance across different cost function configurations and workload patterns.},
project_tags = {scalable AI, ML for CPS}
}
Large-scale component-based enterprise applications that leverage Cloud resources expect Quality of Service (QoS) guarantees in accordance with service level agreements between the customer and service providers. In the context of Cloud computing, auto scaling mechanisms hold the promise of assuring QoS properties to the applications while simultaneously making efficient use of resources and keeping operational costs low for the service providers. Despite the perceived advantages of auto scaling, realizing the full potential of auto scaling is hard due to multiple challenges stemming from the need to precisely estimate resource usage in the face of significant variability in client workload patterns. This paper makes three contributions to overcome the general lack of effective techniques for workload forecasting and optimal resource allocation. First, it discusses the challenges involved in auto scaling in the cloud. Second, it develops a model-predictive algorithm for workload forecasting that is used for resource auto scaling. Finally, empirical results are provided that demonstrate that resources can be allocated and deallocated by our algorithm in a way that satisfies both the application QoS while keeping operational costs low.
@inproceedings{Roy2011b,
author = {Roy, Nilabja and Dubey, Abhishek and Gokhale, Aniruddha S. and Dowdy, Larry W.},
booktitle = {ICPE'11 - Second Joint {WOSP/SIPEW} International Conference on Performance Engineering, Karlsruhe, Germany, March 14-16, 2011},
title = {A Capacity Planning Process for Performance Assurance of Component-based Distributed Systems},
year = {2011},
pages = {259--270},
acceptance = {36},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/wosp/RoyDGD11},
category = {selectiveconference},
contribution = {colab},
doi = {10.1145/1958746.1958784},
file = {:Roy2011b-A_Capacity_Planning_Process_for_Performance_Assurance_of_Component-based_Distributed_Systems.pdf:PDF},
keywords = {capacity planning, multi-tier systems, performance modeling, quality of service, resource allocation, queuing models},
project = {cps-middleware},
tag = {platform},
timestamp = {Tue, 06 Nov 2018 00:00:00 +0100},
url = {https://doi.org/10.1145/1958746.1958784},
what = {This paper develops a capacity planning process for multi-tiered component-based distributed systems that enables service providers to guarantee performance and availability to customers without impacting revenue. The work presents a two-stage Modeling and Analysis using Queuing-Placement and Replication Optimization (MAQ-PRO) process that combines analytical modeling with profiling techniques to estimate performance requirements and determine optimal component placement. It addresses the challenge of high assurance of performance and availability while minimizing resource consumption.},
why = {Web-scale enterprise systems must deliver consistent performance and high availability to users while managing operational costs and maintaining revenue through service level agreements. This work innovates by developing a systematic capacity planning process that combines analytical performance estimation with component-level profiling to handle increased system activity from multiple processing stages. The two-stage approach enables both accurate performance prediction and efficient resource allocation for complex multi-tier systems.},
results = {The approach successfully models multi-tier web applications using queuing theory extended with regression models that account for increased system activity and context switching overhead. Experimental results show accurate performance estimation that enables determining required resource allocations to satisfy service level agreements. The framework demonstrates effective capacity planning for real-world systems while maintaining application functionality and performance characteristics.},
project_tags = {scalable AI, ML for CPS}
}
For service providers of multi-tiered component-based applications, such as web portals, assuring high performance and availability to their customers without impacting revenue requires effective and careful capacity planning that aims at minimizing the number of resources, and utilizing them efficiently while simultaneously supporting a large customer base and meeting their service level agreements. This paper presents a novel, hybrid capacity planning process that results from a systematic blending of 1) analytical modeling, where traditional modeling techniques are enhanced to overcome their limitations in providing accurate performance estimates; 2) profile-based techniques, which determine performance profiles of individual software components for use in resource allocation and balancing resource usage; and 3) allocation heuristics that determine minimum number of resources to allocate software components. Our results illustrate that using our technique, performance (i.e., bounded response time) can be assured while reducing operating costs by using 25% less resources and increasing revenues by handling 20% more clients compared to traditional approaches.
@techreport{Saxena2011,
author = {Saxena, Tripti and Dubey, Abhishek},
institution = {Institute for Software Integrated Systems, Vanderbilt University},
title = {Meta-Tools For Designing Scientific Workflow Management Systems: Part-I, Survey},
year = {2011},
number = {ISIS-11-105},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/Survey-report.pdf},
contribution = {colab},
file = {:Saxena2011-Meta-tools_for_Designing_Scientific_Workflow_Management_Systems_Survey.pdf:PDF},
what = {This paper presents a survey of scientific workflow management systems and their features for specifying, managing, and monitoring scientific computation workflows. The work compares workflow systems including Kepler, Pegasus, Chimera, and others across dimensions such as composition, representation, mapping, execution, fault tolerance, metadata handling, and provenance. It provides comprehensive analysis of how different systems handle the challenges of managing large-scale scientific computations across distributed infrastructure.},
why = {Scientific workflows enable large-scale computational analysis through specification of data processing activities and dependencies, yet diverse workflow systems with specialized requirements make it difficult to reconcile them. This work innovates by providing systematic comparison of workflow management approaches and identifying key features needed for effective scientific workflow execution. The survey highlights the importance of workflow lifecycle management including composition, execution tracking, fault tolerance, and data provenance for reproducible science.},
results = {The survey identifies key differences in how workflow systems handle composition and representation, with some supporting graphical interfaces while others use textual specifications. Results show variations in fault tolerance approaches and metadata handling, with most systems using ad-hoc techniques rather than integrated frameworks. The work demonstrates that effective workflow management requires addressing multiple interdependent concerns from specification through execution monitoring to result verification.},
keywords = {scientific workflows, workflow management, distributed computing, fault tolerance, data provenance, monitoring},
project_tags = {middleware, scalable AI}
}
Scientific workflows require the coordination of data processing activities, resulting in executions driven by data dependencies. Due to the scales involved and the repetition of analysis, typically workflows are analyzed in coordinated campaigns, each execution managed and controlled by the workflow management system. In this respect, a workflow management system is required to (1) provide facilities for specifying workflows: intermediate steps, inputs/outputs, and parameters, (2) manage the execution of the workflow based on specified parameters, (3) provide facilities for managing data provenance, and (4) provide facilities to monitor the progress of the workflow, including facilities to detect anomalies, isolate faults and provide recovery actions. In this paper, part-I of a two part series, we provide a comparison of some state of the art workflow management systems with respect to these four primary requirements.
@article{Piccoli2010,
author = {Piccoli, Luciano and Dubey, Abhishek and Simone, James N. and Kowalkowski, James B.},
journal = {Journal of Physics: Conference Series},
title = {{LQCD} workflow execution framework: Models, provenance and fault-tolerance},
year = {2010},
month = apr,
number = {7},
pages = {072047},
volume = {219},
contribution = {colab},
doi = {10.1088/1742-6596/219/7/072047},
file = {:Piccoli2010-LQCD_workflow_execution_framework.pdf:PDF},
keywords = {scientific workflows, fault tolerance, reliability, data provenance, workflow execution, distributed computing},
publisher = {{IOP} Publishing},
what = {This paper introduces a model-based, hierarchical, reliable execution framework for scientific workflows that integrates workflow and reliability subsystems to enable data provenance tracking, execution monitoring, and online fault tolerance. The work proposes parametrized abstract workflow templates instantiated with specific input parameters to define concrete workflows executed in a distributed environment. The framework supports both configuration generation and analysis campaign workflows with execution tracking and monitoring of vital health parameters allocated on compute nodes.},
why = {Large computing clusters used for scientific processing suffer from intermittent faults when operated over long periods, and workflow execution can fail due to single job failures, communication delays, or synchronization issues. This work innovates by integrating workflow specification with formal reliability verification methods to provide explicit fault isolation and mitigation capabilities. The model-based approach enables recovery from failures without manual intervention while maintaining data provenance necessary for scientific reproducibility.},
results = {The framework successfully executes LQCD workflows on computing clusters with reliable recovery from job failures and performance tracking across multiple participants. Results demonstrate effective fault isolation and mitigation using reflex engine architecture with pre-specified mitigating actions. The system maintains data provenance enabling workflow result verification and enables recovery from failure points using stored intermediate results.},
project_tags = {middleware, scalable AI, emergency}
}
Large computing clusters used for scientific processing suffer from systemic failures when operated over long continuous periods for executing workflows. Diagnosing job problems and faults leading to eventual failures in this complex environment is difficult, specifically when the success of an entire workflow might be affected by a single job failure. In this paper, we introduce a model-based, hierarchical, reliable execution framework that encompasses workflow specification, data provenance, execution tracking and online monitoring of each workflow task, also referred to as participants. The sequence of participants is described in an abstract parameterized view, which is translated into a concrete data dependency based sequence of participants with defined arguments. As participants belonging to a workflow are mapped onto machines and executed, periodic and on-demand monitoring of vital health parameters on allocated nodes is enabled according to pre-specified rules. These rules specify conditions that must be true pre-execution, during execution and post-execution. Monitoring information for each participant is propagated upwards through the reflex and healing architecture, which consists of a hierarchical network of decentralized fault management entities, called reflex engines. They are instantiated as state machines or timed automata that change state and initiate reflexive mitigation action(s) upon occurrence of certain faults. We describe how this cluster reliability framework is combined with the workflow execution framework using formal rules and actions specified within a structure of first order predicate logic that enables a dynamic management design that reduces manual administrative workload, and increases cluster-productivity.
@inproceedings{Pan2010,
author = {{Pan}, P. and Dubey, Abhishek and {Piccoli}, L.},
booktitle = {2010 Seventh IEEE International Conference and Workshops on Engineering of Autonomic and Autonomous Systems},
title = {Dynamic Workflow Management and Monitoring Using DDS},
year = {2010},
month = mar,
pages = {20--29},
category = {conference},
contribution = {lead},
doi = {10.1109/EASe.2010.12},
file = {:Pan2010-Dynamic_workflow_management_and_monitoring_using_dds.pdf:PDF},
issn = {2168-1872},
keywords = {workflow management, monitoring, fault tolerance, distributed systems, publish-subscribe middleware, scientific computing},
what = {This paper extends previous workflow management frameworks with dynamic workflow management and monitoring using Data Distribution Services middleware. The work presents a hierarchical framework for managing scientific workflows through workflow managers coordinating job execution and participant managers tracking individual task execution. It integrates monitoring of infrastructure resources and workflow status to enable both fault detection and workflow recovery through stopping and restarting failed workflow portions.},
why = {Scientific workflows executing over long periods on unreliable computing infrastructure need both fault tolerance and recovery capabilities with minimal manual intervention. This work innovates by integrating workflow management with distributed monitoring infrastructure using publish-subscribe middleware to decouple workflow components. The hierarchical management structure enables both local rapid response to failures and global coordination of recovery actions across the distributed system.},
results = {The framework successfully manages scientific workflows with distributed monitoring that tracks both infrastructure status and workflow execution. Results demonstrate effective fault detection and workflow recovery through stopping failed portions and restarting from known checkpoints. The integration with Data Distribution Services enables scalable monitoring across large computing clusters without centralized bottlenecks.},
project_tags = {middleware, scalable AI}
}
Large scientific computing data-centers require a distributed dependability subsystem that can provide fault isolation and recovery and is capable of learning and predicting failures to improve the reliability of scientific workflows. This paper extends our previous work on the autonomic scientific workflow management systems by presenting a hierarchical dynamic workflow management system that tracks the state of job execution using timed state machines. Workflow monitoring is achieved using a reliable distributed monitoring framework, which employs publish-subscribe middleware built upon OMG Data Distribution Service standard. Failure recovery is achieved by stopping and restarting the failed portions of workflow directed acyclic graph.
@techreport{4201,
author = {Neema, Himanshu and Dubey, Abhishek and Karsai, Gabor},
institution = {Institute for Software Integrated Systems},
title = {A Report On Simulating External Applications With SOAMANET in the Loop},
year = {2010},
address = {Nashville},
month = aug,
number = {ISIS-10-108},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/report.pdf},
contribution = {colab}
}
@techreport{4196,
author = {Dubey, Abhishek and Karsai, Gabor and Mahadevan, Nagabhushan},
institution = {Institute for Software Integrated Systems},
title = {Towards Model-based Software Health Management for Real-Time Systems},
year = {2010},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/Report_0.pdf , http://www.isis.vanderbilt.edu/sites/default/files/Report.pdf},
contribution = {lead},
number = {ISIS-10-106}
}
The complexity of software systems has reached the point where we need run-time mechanisms that can be used to provide fault management services. Testing and verification may not cover all possible scenarios that a system can encounter, hence a simpler, yet formally specified run-time monitoring, diagnosis, and fault mitigation architecture is needed to increase the software system’s dependability. The approach described in this paper borrows concepts and principles from the field of ‘Systems Health Management’ for complex systems. The paper introduces the fundamental ideas for software health management, and then illustrates how these can be implemented in a model-based software development process, including a case study and related work.
@inproceedings{Balasubramanian2010,
author = {Balasubramanian, Jaiganesh and Gokhale, Aniruddha S. and Dubey, Abhishek and Wolf, Friedhelm and Lu, Chenyang and Gill, Christopher D. and Schmidt, Douglas C.},
booktitle = {16th {IEEE} Real-Time and Embedded Technology and Applications Symposium, {RTAS} 2010, Stockholm, Sweden, April 12-15, 2010},
title = {Middleware for Resource-Aware Deployment and Configuration of Fault-Tolerant Real-time Systems},
year = {2010},
pages = {69--78},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/rtas/BalasubramanianGDWLGS10},
category = {selectiveconference},
contribution = {colab},
doi = {10.1109/RTAS.2010.30},
file = {:Balasubramanian2010-Middleware_for_Resource-Aware_Deployment_and_Configuration.pdf:PDF},
keywords = {fault tolerance, real-time systems, passive replication, task allocation, embedded systems, resource management},
project = {cps-middleware,cps-reliability},
tag = {platform},
timestamp = {Tue, 05 Nov 2019 00:00:00 +0100},
url = {https://doi.org/10.1109/RTAS.2010.30},
what = {This paper presents DeCoRAM, a deployment and configuration middleware for distributed real-time and embedded systems that supports passive replication and task allocation while minimizing resource consumption. The framework provides resource-aware task allocation algorithms that improve current state-of-the-art in passive replication and real-time task allocation, along with a deployment and configuration reasoning engine that automatically deploys tasks and configures middleware. The work addresses challenges of supporting multiple processor failures while maintaining real-time requirements.},
why = {Distributed real-time embedded systems require both fault tolerance and strict real-time guarantees, yet supporting passive replication for fault tolerance while maintaining real-time constraints and minimizing resource consumption is challenging. This work innovates by developing resource-aware task allocation algorithms that consider multiple processor failures and fault-tolerant replication patterns while respecting timing constraints. The automated deployment and configuration engine eliminates manual configuration errors and enables seamless handling of system changes.},
results = {The framework demonstrates successful deployment of passively replicated real-time systems with resource-efficient task allocation that handles multiple processor failures. Experimental results show effective resource reduction through passive replication compared to active approaches while maintaining fault tolerance and real-time properties. The automated configuration engine successfully deploys and configures complex systems with minimal operator intervention.},
project_tags = {CPS, middleware, scalable AI}
}
Developing large-scale distributed real-time and embedded (DRE) systems is hard in part due to complex deployment and configuration issues involved in satisfying multiple quality of service (QoS) properties, such as real-timeliness and fault tolerance. This paper makes three contributions to the study of deployment and configuration middleware for DRE systems that satisfy multiple QoS properties. First, it describes a novel task allocation algorithm for passively replicated DRE systems to meet their real-time and fault-tolerance QoS properties while consuming significantly less resources. Second, it presents the design of a strategizable allocation engine that enables application developers to evaluate different allocation algorithms. Third, it presents the design of a middleware agnostic configuration framework that uses allocation decisions to deploy application components/replicas and configure the underlying middleware automatically on the chosen nodes. These contributions are realized in the DeCoRAM (Deployment and Configuration Reasoning and Analysis via Modeling) middleware. Empirical results on a distributed testbed demonstrate DeCoRAM’s ability to handle multiple failures and provide efficient and predictable real-time performance.
@inproceedings{Dubey2010a,
author = {Dubey, Abhishek and Karsai, Gabor and Keresk{\'{e}}nyi, R{\'{o}}bert and Mahadevan, Nagabhushan},
booktitle = {13th {IEEE} International Symposium on Object/Component/Service-Oriented Real-Time Distributed Computing, {ISORC} 2010, Carmona, Sevilla, Spain, 5-6 May 2010},
title = {A Real-Time Component Framework: Experience with {CCM} and {ARINC-653}},
year = {2010},
pages = {143--150},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/isorc/DubeyKKM10},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/ISORC.2010.39},
file = {:Dubey2010a-A_Real-Time_Component_Framework_Experience_with_CCM_and_ARINC-653.pdf:PDF},
keywords = {component-based development, hard real-time systems, ARINC-653, middleware, timing constraints, system integration},
project = {cps-middleware,cps-reliability},
tag = {platform},
timestamp = {Wed, 16 Oct 2019 14:14:53 +0200},
url = {https://doi.org/10.1109/ISORC.2010.39},
what = {This paper presents a real-time component framework combining component-based software construction with hard real-time operating system services through implementation of CORBA Component Model concepts on ARINC-653 partitions. The work demonstrates practical feasibility of the approach through a GPS example component assembly that integrates multiple real-time components within strict timing constraints. The framework supports both synchronous and asynchronous component interactions while maintaining real-time properties and enabling component reuse across different systems.},
why = {Component-based development offers significant benefits for complex systems through reusability and modularity, yet applying these concepts to hard real-time systems requires careful attention to timing properties and resource isolation. This work innovates by showing how standard component middleware can be adapted to operate within ARINC-653 constraints while preserving component semantics and enabling reusable component development. The practical implementation demonstrates feasibility of the approach for real-world hard real-time systems.},
results = {The framework successfully implements GPS component assembly that respects strict timing constraints while enabling component reuse and dynamic configuration. Results show practical feasibility of layering component middleware on ARINC-653 partitions with acceptable overhead. The approach enables developers to leverage component-based development for real-time systems while maintaining all required timing and safety properties.},
project_tags = {CPS, middleware}
}
The complexity of software in systems like aerospace vehicles has reached the point where new techniques are needed to ensure system dependability while improving the productivity of developers. One possible approach is to use precisely defined software execution platforms that (1) enable the system to be composed from separate components, (2) restrict component interactions and prevent fault propagation, and (3) whose compositional properties are well-known. In this paper we describe the initial steps towards building a platform that combines component-based software construction with hard real-time operating system services. Specifically, the paper discusses how the CORBA Component Model (CCM) could be combined with the ARINC-653 platform services and the lessons learned from this experiment. The results point towards both extending the CCM as well as revising the ARINC-653.
@inproceedings{Mehrotra2010,
author = {Mehrotra, Rajat and Dubey, Abhishek and Abdelwahed, Sherif and Tantawi, Asser N.},
booktitle = {{MASCOTS} 2010, 18th Annual {IEEE/ACM} International Symposium on Modeling, Analysis and Simulation of Computer and Telecommunication Systems, Miami, Florida, USA, August 17-19, 2010},
title = {Integrated Monitoring and Control for Performance Management of Distributed Enterprise Systems},
year = {2010},
pages = {424--426},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/mascots/MehrotraDAT10},
category = {poster},
contribution = {lead},
doi = {10.1109/MASCOTS.2010.57},
file = {:Mehrotra2010-Integrated_Monitoring_and_Control.pdf:PDF},
keywords = {performance management, feedback control, system modeling, quality of service, monitoring, enterprise systems},
project = {cps-middleware},
timestamp = {Wed, 16 Oct 2019 14:14:53 +0200},
url = {https://doi.org/10.1109/MASCOTS.2010.57},
what = {This paper presents an integrated monitoring and control framework for distributed enterprise systems that estimates system state from limited measurements and applies feedback control to achieve multiple quality of service objectives. The work develops system models that characterize performance behavior, applies exponential Kalman filtering to estimate service times and delays, and implements a feedback controller that optimizes multiple performance objectives. The approach demonstrates integration of monitoring, modeling, and control for managing distributed system performance.},
why = {Large enterprise systems must simultaneously achieve multiple quality of service objectives including response time, throughput, and power consumption under varying workload conditions. This work innovates by integrating state estimation with model predictive control to adjust system parameters and maintain performance objectives. The approach accounts for complex system dynamics including bottleneck resources and context switching overhead that traditional queuing models overlook.},
results = {Experiments demonstrate effective integration of monitoring and control for multi-tier systems achieving multiple quality of service objectives. Results show accurate system state estimation using Kalman filtering and effective parameter adjustment through feedback control. The framework achieves 18% power savings while maintaining response time constraints through integrated management of CPU frequency and resource allocation.},
project_tags = {scalable AI, ML for CPS}
}
This paper describes an integrated monitoring and control framework for managing performance of distributed enterprise systems.
@inproceedings{Saxena2010,
author = {Saxena, Tripti and Dubey, Abhishek and Balasubramanian, Daniel and Karsai, Gabor},
booktitle = {2010 Seventh IEEE International Conference and Workshops on Engineering of Autonomic and Autonomous Systems},
title = {Enabling self-management by using model-based design space exploration},
year = {2010},
organization = {IEEE},
pages = {137--144},
category = {conference},
contribution = {lead},
file = {:Saxena2010-Enabling_Self-Management_by_Using_Model-based_Design_Space_Exploration.pdf:PDF},
keywords = {self-management, design space exploration, reconfiguration, constraint satisfaction, autonomous systems, model-based design},
what = {This paper presents an approach for enabling self-management in systems through model-based design space exploration that symbolically encodes the set of valid system configurations and asserts the current system state and goals as symbolic constraints. The work addresses the challenges that the state space of configurations can grow very large, making explicit enumeration infeasible, and that component failures and evolving system goals must be encoded in the system configuration model. The approach allows the system to choose its current configuration online based on its state, including environmental parameters and goals.},
why = {Reconfiguration and self-management are important properties for systems that operate in hazardous and uncontrolled environments, such as inter-planetary space, which need recovery from individual component failures as well as dynamic adaptation to evolving mission goals. This work innovates by defining a model of alternative system configurations and using symbolic encoding so that online design-space exploration remains feasible despite very large configuration spaces. The constraint-based formulation captures both failures and goals directly in the configuration model.},
results = {Initial results indicate that the online reconfiguration method based on model-based design-space exploration scales and is capable of providing effective online dynamic reconfiguration. The symbolic encoding supports asserting the current system state and goals as constraints during exploration without explicit enumeration of configurations. The approach provides recovery from individual component failures and adaptation to evolving mission goals.},
project_tags = {middleware, scalable AI}
}
Reconfiguration and self-management are important properties for systems that operate in hazardous and uncontrolled environments, such as inter-planetary space. These systems need a reconfiguration mechanism that provides recovery from individual component failures as well as the ability to dynamically adapt to evolving mission goals. One way to provide this functionality is to define a model of alternative system configurations and allow the system to choose the current configuration based on its current state, including environmental parameters and goals. The primary difficulties with this approach are (1) the state space of configurations can grow very large, which can make explicit enumeration infeasible, and (2) the component failures and evolving system goals must be somehow encoded in the system configuration model. This paper describes an online reconfiguration method based on model-based designspace exploration. We symbolically encode the set of valid system configurations and assert the current system state and goals as symbolic constraints. Our initial work indicates that this method scales and is capable of providing effective online dynamic reconfiguration.
@techreport{4181,
author = {Mehrotra, Rajat and Dubey, Abhishek and Abdelwahed, Sherif and Tantawi, Asser},
institution = {Institute for Software Integrated Systems},
title = {Model Identification for Performance Management of Distributed Enterprise Systems},
year = {2010},
address = {Nashville},
number = {ISIS-10-104},
type = {Technical Report},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/Paper_4.pdf},
contribution = {colab},
issn = {ISIS-10-104}
}
@inproceedings{Dubey2009d,
author = {Dubey, Abhishek and {Piccoli}, L. and {Kowalkowski}, J. B. and {Simone}, J. N. and {Sun}, X. and {Karsai}, G. and {Neema}, S.},
booktitle = {2009 Sixth IEEE Conference and Workshops on Engineering of Autonomic and Autonomous Systems},
title = {Using Runtime Verification to Design a Reliable Execution Framework for Scientific Workflows},
year = {2009},
month = apr,
pages = {87--96},
category = {conference},
contribution = {lead},
doi = {10.1109/EASe.2009.13},
file = {:Dubey2009d-Using_runtime_verification_to_design_a_reliable_execution_framework_for_scientific_workflows.pdf:PDF},
issn = {2168-1872},
keywords = {scientific workflows, runtime verification, data provenance, execution tracking, online monitoring, fault management},
what = {This paper describes the design of a scientific workflow execution framework that integrates runtime verification to monitor workflow execution and check it against formal specifications. The framework provides data provenance, execution tracking, and online monitoring of each workflow task, also referred to as a participant. The sequence of participants is described in an abstract parameterized view, which is used to generate a concrete data-dependency-based sequence of participants with defined arguments.},
why = {Scientific workflows execute over long periods on shared computing infrastructure where faults can compromise the computation, so their execution must be checked against formal specifications rather than assumed to be correct. This work innovates by integrating runtime verification into the workflow execution framework so that pre-specified invariant conditions are monitored as participants execute. Invariant violations trigger pre-specified actions, enabling reliable execution with reduced manual intervention.},
results = {As participants belonging to a workflow are mapped onto machines and executed, the framework enables periodic and on-demand monitoring of vital health parameters on allocated nodes according to pre-specified invariant conditions. Execution tracking and data provenance support checking workflow execution against its formal specification. The design demonstrates that runtime verification can be combined with workflow management to improve the reliability of scientific workflow execution.},
project_tags = {CPS, middleware, scalable AI}
}
In this paper, we describe the design of a scientific workflow execution framework that integrates runtime verification to monitor its execution and checking it against the formal specifications. For controlling workflow execution, this framework provides for data provenance, execution tracking and online monitoring of each work flow task, also referred to as participants. The sequence of participants is described in an abstract parameterized view, which is used to generate concrete data dependency based sequence of participants with defined arguments. As participants belonging to a workflow are mapped onto machines and executed, periodic and on-demand monitoring of vital health parameters on allocated nodes is enabled according to pre-specified invariant conditions with actions to be taken upon violation of invariants.
@techreport{4121,
author = {Balasubramanian, Jaiganesh and Gokhale, Aniruddha and Wolf, Friedhelm and Dubey, Abhishek and Lu, Chenyang and Gill, Chris and Schmidt, Douglas C.},
institution = {Institute for Software Integrated Systems, Vanderbilt University},
title = {Resource-Aware Deployment and Configuration of Fault-tolerant Real-time Systems},
year = {2009},
month = oct,
number = {ISIS-09-109},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/decoram_tr09_0.pdf},
contribution = {colab}
}
@techreport{4136,
author = {Dubey, Abhishek},
institution = {Institute for Software Integrated Systems},
title = {A Discussion on Supervisory Control Theory in Real-Time Discrete Event Systems},
year = {2009},
month = nov,
number = {ISIS-09-112},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/TechReport.pdf},
contribution = {lead}
}
@techreport{4137,
author = {Dubey, Abhishek and Karsai, Gabor and Kereskenyi, Robert and Mahadevan, Nagabhushan},
institution = {Institute for Software Integrated Systems},
title = {Towards a Real-time Component Framework for Software Health Management},
year = {2009},
address = {Nashville},
month = nov,
number = {ISIS-09-111},
type = {Technical Report},
attachments = {http://www.isis.vanderbilt.edu/sites/default/files/TechReport2009.pdf},
contribution = {lead}
}
The complexity of software in systems like aerospace vehicles has reached the point where new techniques are needed to ensure system dependability. Such techniques include a novel direction called ‘Software Health Management’ (SHM) that extends classic software fault tolerance with techniques borrowed from System Health Management. In this paper the initial steps towards building a SHM approach are described that combine component-based software construction with hard real-time operating system platforms. Specifically, the paper discusses how the CORBA Component Model could be combined with the ARINC-653 platform services and the lessons learned from this experiment. The results point towards both extending the CCM as well as revising the ARINC-653 standard.
@article{Dubey2009,
author = {Dubey, Abhishek and Mehrotra, Rajat and Abdelwahed, Sherif and Tantawi, Asser N.},
journal = {{SIGMETRICS} Performance Evaluation Review},
title = {Performance modeling of distributed multi-tier enterprise systems},
year = {2009},
number = {2},
pages = {9--11},
volume = {37},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/sigmetrics/DubeyMAT09},
contribution = {lead},
doi = {10.1145/1639562.1639566},
file = {:Dubey2009-Performance_modeling_of_distributed_multi-tier_enterprise_systems.pdf:PDF},
keywords = {runtime verification, workflow execution, fault tolerance, formal methods, scientific computing, monitoring},
internal-note = {NOTE(review): keywords, what, why, and results describe runtime verification of scientific workflows, which does not match this paper's title on performance modeling of multi-tier enterprise systems; they appear to belong to a different entry -- verify against DOI 10.1145/1639562.1639566},
project = {cps-middleware},
tag = {platform},
timestamp = {Tue, 06 Nov 2018 00:00:00 +0100},
url = {https://doi.org/10.1145/1639562.1639566},
what = {This paper introduces using runtime verification to design reliable execution frameworks for scientific workflows that integrate execution tracking with online fault checking. The work describes integration of runtime verification with workflow execution enabling conditions to be periodically verified during workflow execution. The framework provides detection of anomalies through monitoring of vital health parameters and provides participants with pre-specified conditions for fault tolerance.},
why = {Scientific workflows executing on distributed infrastructure face intermittent failures from hardware, network, and software faults that can compromise experiment reproducibility and validity. This work innovates by integrating formal verification methods with workflow execution to detect problems early and enable automatic recovery. The formal specification of workflow properties enables rigorous checking against implementation avoiding manual error-prone monitoring.},
results = {The framework successfully integrates runtime verification with workflow execution demonstrating detection of anomalies and enforcement of timing constraints. Results show effective monitoring of workflow properties during execution with minimal performance overhead. The approach enables reliable scientific workflow execution with automatic detection of timing violations and anomalies.},
project_tags = {middleware, scalable AI}
}
@inproceedings{Dubey2009a,
author = {Dubey, Abhishek and Riley, Derek and Abdelwahed, Sherif and Bapty, Ted},
booktitle = {16th Annual {IEEE} International Conference and Workshop on the Engineering of Computer Based Systems, {ECBS} 2009, San Francisco, California, USA, 14-16 April 2009},
title = {Modeling and Analysis of Probabilistic Timed Systems},
year = {2009},
month = apr,
pages = {69--78},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/ecbs/DubeyRAB09},
category = {conference},
contribution = {lead},
doi = {10.1109/ECBS.2009.44},
file = {:Dubey2009a-Modeling_and_Analysis_of_Probabilistic_Timed_Systems.pdf:PDF},
keywords = {probabilistic systems, timed automata, verification, safety, reachability, Markov processes},
project = {cps-reliability},
timestamp = {Wed, 16 Oct 2019 14:14:51 +0200},
url = {https://doi.org/10.1109/ECBS.2009.44},
what = {This paper presents modeling and analysis of probabilistic timed systems using approximation with Markov Decision Processes to enable verification of safety, liveness, and bounded time reachability properties. The work develops techniques to convert probabilistic timed automata into equivalent Markov decision processes enabling analysis of probabilistic real-time systems. The approach supports analysis of systems with both stochastic and timed behaviors enabling verification of temporal properties of probabilistic systems.},
why = {Real-time systems exhibiting probabilistic behavior due to uncertainty in timing or failure rates require sophisticated analysis techniques beyond traditional deterministic or purely probabilistic approaches. This work innovates by developing methods to bridge probabilistic and timed models enabling rigorous analysis of systems with both characteristics. The approximation approach enables practical analysis of realistic systems while maintaining verification guarantees.},
results = {The approach successfully analyzes probabilistic timed systems including reachability, safety, and bounded time properties through conversion to Markov decision processes. Results demonstrate effective approximation of probabilistic timed automata enabling practical verification. The framework enables analysis of complex systems with both timed and probabilistic behaviors.},
project_tags = {CPS, scalable AI}
}
Probabilistic models are useful for analyzing systems which operate under the presence of uncertainty. In this paper, we present a technique for verifying safety and liveness properties for probabilistic timed automata. The proposed technique is an extension of a technique used to verify stochastic hybrid automata using an approximation with Markov Decision Processes. A case study of the CSMA/CD protocol is used to showcase the methodology of our technique.
@inproceedings{Dubey2009b,
author = {Dubey, Abhishek},
booktitle = {16th Annual {IEEE} International Conference and Workshop on the Engineering of Computer Based Systems, {ECBS} 2009, San Francisco, California, USA, 14-16 April 2009},
title = {Algorithms for Synthesizing Safe Sets of Operation for Embedded Systems},
year = {2009},
month = apr,
pages = {149--155},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/ecbs/Dubey09},
category = {conference},
contribution = {lead},
doi = {10.1109/ECBS.2009.43},
file = {:Dubey2009b-Algorithms_for_Synthesizing_Safe_Sets_of_Operation_for_Embedded_Systems.pdf:PDF},
keywords = {embedded systems, formal methods, reachable sets, safety, temporal properties, continuous dynamics},
project = {cps-reliability},
timestamp = {Wed, 16 Oct 2019 14:14:51 +0200},
url = {https://doi.org/10.1109/ECBS.2009.43},
what = {This paper develops algorithms for synthesizing safe sets of operations for embedded systems by exploring the infinite state space of continuous dynamical systems to find initial states satisfying temporal properties. The work presents symbolic algorithms for computing forward and backward reachable sets of continuous systems and identifies initial conditions enabling systems to satisfy safety and liveness properties. The approach uses level set methods to represent continuous state spaces and symbolic computation for efficient exploration.},
why = {Embedded systems must provide safe operation across the full range of possible operating conditions, yet manual specification of safe operating regions is error-prone and incomplete. This work innovates by automating the discovery of safe initial conditions through formal analysis of system dynamics. The symbolic approach enables handling continuous state spaces and complex temporal properties infeasible with manual analysis.},
results = {The algorithms successfully identify safe sets of initial conditions for continuous dynamical systems satisfying both safety and liveness properties. Results demonstrate effective computation of constrained reachable sets using level set methods and symbolic algorithms. The approach enables systematic identification of operating regions ensuring system safety.},
project_tags = {CPS, scalable AI}
}
A large number of embedded computing systems are modeled as hybrid systems with both discrete and continuous dynamics. In this paper, we present algorithms for analyzing nonlinear time-invariant continuous-time systems by employing reachability algorithms. We propose synthesis algorithms for finding sets of initial states for the continuous dynamical systems so that temporal properties, such as safety and liveness properties, are satisfied. The initial sets produced by the algorithms are related to some classical concepts for continuous dynamical systems, such as invariant sets and domains of attraction.
@inproceedings{Dubey2009c,
author = {Dubey, Abhishek and Karsai, Gabor and Abdelwahed, Sherif},
booktitle = {2009 {IEEE} International Symposium on Object/Component/Service-Oriented Real-Time Distributed Computing, {ISORC} 2009, Tokyo, Japan, 17-20 March 2009},
title = {Compensating for Timing Jitter in Computing Systems with General-Purpose Operating Systems},
year = {2009},
month = mar,
pages = {55--62},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/isorc/DubeyKA09},
category = {selectiveconference},
contribution = {lead},
doi = {10.1109/ISORC.2009.28},
file = {:Dubey2009c-Compensating_for_Timing_Jitter_in_Computing_Systems_with_General-Purpose_Operating_Systems.pdf:PDF},
keywords = {timing jitter, periodic task scheduling, feedback control, PID controller, real-time systems, Linux kernel, sensor scheduling},
project = {cps-middleware,cps-reliability},
tag = {platform},
timestamp = {Wed, 16 Oct 2019 14:14:53 +0200},
url = {https://doi.org/10.1109/ISORC.2009.28},
what = {This paper addresses timing jitter in periodic task execution on commodity operating systems like Linux by proposing a user-space scheduling approach that uses periodic sleep cycles to compensate for jitter. The authors develop a feedback controller based on a proportional-integral derivative (PID) scheme to maintain bounded jitter in sensor tasks without modifying the OS kernel. They present both theoretical analysis and empirical validation on Linux and Windows systems, showing that the approach significantly reduces jitter accumulation compared to uncompensated execution.},
why = {Timing jitter in periodic tasks is critical for reliable distributed systems and sensor-based applications, particularly in computing clusters where general-purpose operating systems like Linux provide best-effort rather than real-time guarantees. The innovation lies in providing jitter compensation at the application level without requiring kernel modifications, making it practical and deployable on existing systems. This work enables deterministic scheduling for monitoring and control tasks on commercial hardware.},
results = {Experimental results demonstrate that the proposed feedback controller reduces timing jitter to less than 1% of the sampling period when running a task with a 1-second period on Linux. The approach maintains a total jitter bounded across multiple hours of execution, with an average CPU utilization overhead of less than 1%. The method is shown to be effective across different operating systems and scales well with system load variation.},
project_tags = {CPS, middleware, ML for CPS}
}
Fault-tolerant frameworks for large scale computing clusters require sensor programs, which are executed periodically to facilitate performance and fault management. By construction, these clusters use general purpose operating systems such as Linux that are built for best average case performance and do not provide deterministic scheduling guarantees. Consequently, periodic applications show jitter in execution times relative to the expected execution time. Obtaining a deterministic schedule for periodic tasks in general purpose operating systems is difficult without using kernel-level modifications such as RTAI and RTLinux. However, due to performance and administrative issues kernel modification cannot be used in all scenarios. In this paper, we address the problem of jitter compensation for periodic tasks that cannot rely on modifying the operating system kernel. Towards that, (a) we present motivating examples; (b) we present a feedback controller based approach that runs in the user space and actively compensates the periodic schedule based on past jitter — this approach is platform-agnostic, i.e., it can be used in different operating systems without modification; and (c) we show through analysis and experiments that this approach maintains a stable system with bounded total jitter.
@inproceedings{Dubey2009e,
author = {Dubey, Abhishek and Mahadevan, Nagabhushan and Kereskenyi, Robert},
booktitle = {International Workshop on Software Health Management, {IEEE} Conference on Space Mission Challenges for Information Technology},
title = {Reflex and healing architecture for software health management},
year = {2009},
category = {workshop},
contribution = {lead},
file = {:Dubey2009e-Reflex_and_healing_architecture_for_software_health_management.pdf:PDF},
keywords = {software health management, fault detection, fault diagnosis, reflex engines, hierarchical architecture, mitigation strategies, real-time systems},
what = {This paper presents a reflex and healing architecture for software health management in complex embedded systems such as those used in space missions. The framework employs hierarchical reflex engines that detect discrepancies through monitoring, diagnose faults using Timed Fault Propagation Graphs, and execute fault mitigation strategies using state machines. The architecture supports both reactive and proactive healing actions coordinated across multiple hierarchical levels.},
why = {Large-scale embedded systems require autonomous fault detection and recovery mechanisms since manual intervention is often infeasible, particularly in space missions where communication latency and system inaccessibility constrain responses. The reflex and healing approach provides a systematic framework for integrating fault management into system architecture, enabling systems to maintain reliable operation despite failures. This work is innovative in demonstrating hierarchical fault management that can scale to complex distributed systems.},
results = {The paper demonstrates a three-level hierarchical management structure (global, regional, and local) with reflex engines at each level that can detect and respond to faults autonomously. The framework successfully integrates with the ARINC-653 avionics standard, enabling applicability to safety-critical real-time systems. Case studies show how the architecture enables both fault isolation and coordinated recovery actions across system components.},
project_tags = {CPS, emergency, middleware}
}
@techreport{SWHM31,
author = {Dubey, Abhishek},
institution = {IBM Research},
title = {Towards Dynamic CPU Demand Estimation in Multi-Tiered Web Setup},
year = {2009},
contribution = {lead},
internal-note = {NOTE(review): original entry carried issn = ISIS-09-111, which is the report number of a different ISIS technical report (entry 4137) and not an ISSN; the url also points to the ISIS wiki copy of that report -- verify the correct report number and attachment for this IBM Research report},
owner = {abhishek},
timestamp = {2010.09.24},
url = {https://wiki.isis.vanderbilt.edu/mbshm/images/3/3e/TechReport2009.pdf}
}
The complexity of software in systems like aerospace vehicles has reached the point where new techniques are needed to ensure system dependability. Such techniques include a novel direction called Software Health Management (SHM) that extends classic software fault tolerance with techniques borrowed from System Health Management. In this paper, the initial steps towards building a SHM approach are described that combine component-based software construction with hard real-time operating system platforms. Specifically, the paper discusses how the CORBA Component Model could be combined with the ARINC-653 platform services and the lessons learned from this experiment. The results point towards both extending the CCM as well as revising the ARINC-653.
@inproceedings{Dubey2008a,
author = {Dubey, Abhishek and Nordstrom, Steven and Keskinpala, Turker and Neema, Sandeep and Bapty, Ted and Karsai, Gabor},
booktitle = {Fifth IEEE Workshop on Engineering of Autonomic and Autonomous Systems ({EASe} 2008)},
title = {Towards A Model-Based Autonomic Reliability Framework for Computing Clusters},
year = {2008},
month = mar,
pages = {75--85},
category = {conference},
contribution = {lead},
doi = {10.1109/EASe.2008.15},
file = {:Dubey2008a-Towards_a_model-based_autonomic_reliability_framework_for_computing_clusters.pdf:PDF},
issn = {2168-1872},
keywords = {scientific computing, cluster reliability, fault mitigation, distributed monitoring, hierarchical management, reflex engines, job reallocation},
what = {This paper introduces the Scientific Computing Autonomic Reliability Framework (SCARF) designed for large computing clusters used in scientific applications like the Large Hadron Collider. The framework provides hierarchical fault mitigation using reflex engines and includes components for monitoring system health parameters, diagnosing faults through distributed reasoning, and allocating resources optimally during failures. SCARF enables coordinated fault management across cluster nodes with mechanisms for workflow reallocation and predictive failure detection.},
why = {Large scientific computing clusters require fault tolerance mechanisms that maintain productivity despite frequent component failures, as downtime directly impacts expensive research campaigns. The innovation lies in applying hierarchical reflex and healing architectures specifically to scientific computing environments, providing model-based fault mitigation that enables both fault prediction and autonomous recovery. This addresses a critical gap between theoretical reliability frameworks and practical deployment needs in high-performance computing.},
results = {SCARF successfully implements distributed monitoring on LQCD computing clusters at Fermi National Accelerator Laboratory, enabling detection of multiple fault classes including communication errors, storage failures, and CPU issues. The framework provides automated mitigation strategies that can reallocate jobs and resources, with experimental data showing improvements in cluster reliability and job completion rates. The hierarchical organization allows scalability from individual nodes to multiple regional managers.},
project_tags = {scalable AI, CPS, emergency}
}
One of the primary problems with computing clusters is to ensure that they maintain a reliable working state most of the time to justify economics of operation. In this paper, we introduce a model-based hierarchical reliability framework that enables periodic monitoring of vital health parameters across the cluster and provides for autonomic fault mitigation. We also discuss some of the challenges faced by autonomic reliability frameworks in cluster environments such as non-determinism in task scheduling in standard operating systems such as Linux and need for synchronized execution of monitoring sensors across the cluster. Additionally, we present a solution to these problems in the context of our framework, which utilizes a feedback controller based approach to compensate for the scheduling jitter in non real-time operating systems. Finally, we present experimental data that illustrates the effectiveness of our approach.
@inproceedings{Dubey2008,
author = {Dubey, Abhishek and Neema, Sandeep and Kowalkowski, Jim and Singh, Amitoj},
booktitle = {Fourth International Conference on e-Science, e-Science 2008, 7-12 December 2008, Indianapolis, IN, {USA}},
title = {Scientific Computing Autonomic Reliability Framework},
year = {2008},
month = dec,
pages = {352--353},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/eScience/DubeyNKS08},
category = {poster},
contribution = {lead},
doi = {10.1109/eScience.2008.113},
file = {:Dubey2008-Scientific_Computing_Autonomic_Reliability_Framework.pdf:PDF},
keywords = {cluster reliability, autonomous systems, fault tolerance, model-based design, monitoring, mitigation strategies, distributed computing},
project = {cps-middleware,cps-reliability},
timestamp = {Wed, 16 Oct 2019 14:14:49 +0200},
url = {https://doi.org/10.1109/eScience.2008.113},
what = {This paper presents a model-based autonomic reliability framework for computing clusters that enables periodic monitoring of vital health parameters and provides automated fault mitigation. The work develops techniques for fault prediction by analyzing correlations between system parameters like CPU utilization and temperature, enabling proactive mitigation before failures occur. The framework includes both discrete event-based scheduling and feedback control mechanisms for managing sensor execution across cluster nodes.},
why = {Computing clusters in scientific computing environments suffer from both transient and persistent failures that impact application performance and result in significant economic losses. The innovation of this work is applying model-based design to cluster reliability, enabling specification of mitigation policies that can be verified for correctness. This systematic approach provides a bridge between autonomous fault management concepts and practical implementation on distributed systems.},
results = {The framework demonstrates model-based reliability engineering for LQCD clusters with 127-600 computing nodes across multiple systems. Monitoring and mitigation components show the ability to detect and respond to various failure modes including power outages, hardware failures, and non-responsive jobs. Experimental data illustrates the effectiveness of the approach in maintaining cluster productivity despite hardware and software faults.},
project_tags = {CPS, scalable AI, emergency}
}
Large scientific computing clusters require a distributed dependability subsystem that can provide fault isolation and recovery and is capable of learning and predicting failures, to improve the reliability of scientific workflows. In this paper, we outline the key ideas in the design of a Scientific Computing Autonomic Reliability Framework (SCARF) for large computing clusters used in the Lattice Quantum Chromodynamics project at Fermilab.
@inproceedings{Nordstrom2007,
author = {Nordstrom, Steven and Dubey, Abhishek and Keskinpala, Turker and Datta, R. and Neema, Sandeep and Bapty, Ted},
booktitle = {Fourth IEEE International Workshop on Engineering of Autonomic and Autonomous Systems (EASe'07)},
title = {Model Predictive Analysis for Autonomic Workflow Management in Large-scale Scientific Computing Environments},
year = {2007},
month = mar,
pages = {37--42},
category = {conference},
contribution = {colab},
doi = {10.1109/EASE.2007.18},
file = {:Nordstrom2007-Model_predictive_analysis_for_autonomicworkflow_management_in_large-scale_scientific_computing_environments.pdf:PDF},
keywords = {workflow modeling, predictive analysis, scientific computing, job dependencies, fault prediction, workflow management, lookahead algorithm},
what = {This paper develops model predictive analysis techniques for autonomous workflow management in large-scale scientific computing environments. The authors present a WorkflowML modeling language for specifying job dependencies, data flows, and synchronization constraints, and develop a lookahead algorithm that can predict workflow execution failures. The approach enables proactive workflow modification to avoid stalled states by predicting which jobs cannot complete based on current failure conditions.},
why = {Scientific computing workflows often consist of hundreds of interdependent tasks that must be executed across shared cluster resources, and failures in single nodes can cause entire workflow stalls. The innovation lies in applying model-based predictive analysis to dynamically determine workflow feasibility and guide reallocation decisions. This enables systems to make informed decisions about which workflows should continue execution versus those that are predicted to fail.},
results = {The paper demonstrates model-driven workflow analysis using a simplified workflow model with synchronization, sequence, and data dependencies. The lookahead algorithm successfully predicts workflow stall conditions and can identify alternative execution paths that avoid predicted failures. Experimental simulations show that the approach can improve overall workflow completion compared to executing all workflows regardless of failure predictions.},
project_tags = {scalable AI, planning}
}
In large scale scientific computing, proper planning and management of computational resources lead to higher system utilizations and increased scientific productivity. Scientists are increasingly leveraging the use of business process management techniques and workflow management tools to balance the needs of the scientific analyses with the availability of computational resources. However, the advancements in productivity from execution of workflows in large-scale computing environments are often thwarted by runtime resource failures. This paper presents our initial work toward autonomic, model-based fault analysis in workflow-based environments.
@article{Dubey2007,
author = {Dubey, Abhishek and Nordstrom, Steven and Keskinpala, Turker and Neema, Sandeep and Bapty, Ted and Karsai, Gabor},
journal = {Innovations in Systems and Software Engineering},
title = {Towards a verifiable real-time, autonomic, fault mitigation framework for large scale real-time systems},
year = {2007},
number = {1},
pages = {33--52},
volume = {3},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/isse/DubeyNKNBK07},
contribution = {lead},
doi = {10.1007/s11334-006-0015-7},
file = {:Dubey2007-Towards_a_verifiable_real-time_autonomic_fault_mitigation_framework.pdf:PDF},
project = {cps-middleware,cps-reliability},
tag = {platform},
timestamp = {Sun, 28 May 2017 01:00:00 +0200},
url = {https://doi.org/10.1007/s11334-006-0015-7},
what = {This paper presents a comprehensive real-time fault mitigation framework for large-scale real-time systems that combines reflex and healing architectures with formal verification. The framework enables autonomous fault diagnosis through discrete event modeling and uses hierarchical reflex engines to execute mitigation actions that respond to both time-based and event-based triggers. The work introduces timed automata models to formally specify and verify the fault mitigation strategies deployed in real-time systems.},
why = {Large real-time systems require fault responses that are not only functionally correct but also satisfy temporal properties like liveness, safety, and bounded responsiveness. The innovation of this work is the rigorous integration of formal verification techniques into autonomous fault mitigation, enabling designers to prove that mitigation actions will execute correctly and meet timing deadlines. This bridges the gap between theoretical fault management concepts and practical deployment in safety-critical systems.},
results = {The paper demonstrates transformation of a reflex and healing framework into a network of timed automata models that can be verified using model checking tools like UPPAAL. Case studies show how the framework can formally specify and verify properties such as liveness and bounded time responsiveness for fault mitigation strategies. The approach enables verification of both individual reflex engines and complex multi-engine fault management hierarchies.},
keywords = {fault mitigation, timed automata, formal verification, real-time systems, reflex engines, model checking, autonomous computing},
project_tags = {CPS, emergency, Explainable AI}
}
Designing autonomic fault responses is difficult, particularly in large-scale systems, as there is no single ‘perfect’ fault mitigation response to a given failure. The design of appropriate mitigation actions depends upon the goals and state of the application and environment. Strict time deadlines in real-time systems further exacerbate this problem. Any autonomic behavior in such systems must not only be functionally correct but should also conform to properties of liveness, safety and bounded time responsiveness. This paper details a real-time fault-tolerant framework, which uses a reflex and healing architecture to provide fault mitigation capabilities for large-scale real-time systems. At the heart of this architecture is a real-time reflex engine, which has a state-based failure management logic that can respond to both event- and time-based triggers. We also present a semantic domain for verifying properties of systems, which use this framework of real-time reflex engines. Lastly, a case study, which examines the details of such an approach, is presented.
@inproceedings{Nordstrom2016,
author = {Nordstrom, Steven and Bapty, Ted and Neema, Sandeep and Dubey, Abhishek and Keskinpala, Turker},
booktitle = {Second IEEE conference on Space Mission Challenges for Information Technology (SMC-IT)},
title = {A Guided Explorative Approach for Autonomic Healing of Model-Based Systems},
year = {2006},
address = {Pasadena, CA},
month = jul,
category = {conference},
contribution = {minor},
file = {:Nordstrom2006-A_guided_explorative_approach_for_autonomic_healing_of_model_based_systems.pdf:PDF},
internal-note = {NOTE(review): citation key says 2016 but the paper is from 2006 (consistent with year, venue, and file name); key retained to avoid breaking existing citations -- consider renaming with a repository-wide search},
keywords = {autonomic, guided, healing, reflex-healing, model-based, model integrated, embedded}
}
Embedded computing is an area in which many of the Self-* properties of autonomic systems are desirable. Model based tools for designing embedded systems, while proven successful in many applications, are not yet applicable toward building autonomic or self-sustaining embedded systems. This paper reports on the progress made by our group in developing a model based toolset which specifically targets the creation of autonomic embedded systems.
@inproceedings{Dubey2006,
author = {Dubey, Abhishek and Nordstrom, Steven and Keskinpala, Turker and Neema, Sandeep and Bapty, Ted},
booktitle = {Third IEEE International Workshop on Engineering of Autonomic Autonomous Systems (EASE'06)},
title = {Verifying Autonomic Fault Mitigation Strategies in Large Scale Real-Time Systems},
year = {2006},
month = mar,
pages = {129--140},
category = {conference},
contribution = {lead},
doi = {10.1109/EASE.2006.24},
file = {:Dubey2006-Verifying_autonomic_fault_mitigation_strategies_in_large_scale_real-time_systems.pdf:PDF},
issn = {2168-1872},
internal-note = {NOTE(review): keywords, what, why, and results describe the GHOST healing-and-optimization paper (entry Nordstrom2006a), not this paper on verifying fault mitigation strategies; annotation fields appear offset by one entry -- verify and restore the correct text},
keywords = {system healing, resource adaptation, fault recovery, optimization, hierarchical systems, autonomous healing, embedded systems},
what = {This paper presents the GHOST (Guided Healing and Optimization Search Technique) algorithm for automated healing of large-scale embedded system structures. When a resource fault is detected, GHOST applies iterative transformation criteria to find alternative system models that can continue operation with reduced resources. The approach defines performance criteria including throughput, robustness, and similarity to the original system, and uses weighted optimization to balance competing objectives during the healing process.},
why = {When hardware failures occur in large-scale embedded systems, manual reconfiguration is infeasible and systems must autonomously adapt their task allocations to maintain operation despite diminished resources. The innovation of GHOST lies in providing a systematic optimization process for finding appropriate healing actions that satisfy both system designer and user requirements. This enables autonomous systems to make intelligent recovery decisions rather than simply failing or restarting.},
results = {GHOST successfully handles both resource reduction cases (where available resources are reduced) and resource failure cases (where persistent faults block certain resources). The algorithm demonstrates healing transformations on hierarchical system structures, showing how it can redistribute tasks and management responsibilities to maintain system functionality. Examples illustrate how the optimization process balances throughput maximization against maintaining structural similarity to the original design.},
project_tags = {CPS, scalable AI, emergency}
}
In large scale real-time systems many problems associated with self-management are exacerbated by the addition of time deadlines. In these systems any autonomic behavior must not only be functionally correct but must also not violate properties of liveness, safety and bounded time responsiveness. In this paper we present and analyze a real-time reflex engine for providing fault mitigation capability to large scale real-time systems. We also present a semantic domain for analyzing and verifying the properties of such systems along with the framework of real-time reflex engines.
@inproceedings{Nordstrom2006a,
author = {Nordstrom, Steve and Dubey, Abhishek and Keskinpala, Turker and Neema, Sandeep and Bapty, Ted},
booktitle = {Third IEEE International Workshop on Engineering of Autonomic Autonomous Systems (EASE'06)},
title = {{GHOST}: Guided Healing and Optimization Search Technique for Healing Large-Scale Embedded Systems},
year = {2006},
month = mar,
pages = {54--60},
category = {conference},
contribution = {minor},
doi = {10.1109/EASE.2006.8},
file = {:Nordstrom2006a-Ghost_Guided_Healing_and_Optimization_Search_Techniques.pdf:PDF},
issn = {2168-1872},
keywords = {self-healing, embedded systems, guided healing, optimization, fault recovery, autonomic computing},
internal-note = {Annotations were shifted by one entry in the source file; what/why/results restored to match this paper. Keywords reconstructed from the annotation text -- verify against the published paper.},
what = {This paper presents the GHOST (Guided Healing and Optimization Search Technique) algorithm for automated healing of large-scale embedded system structures. When a resource fault is detected, GHOST applies iterative transformation criteria to find alternative system models that can continue operation with reduced resources. The approach defines performance criteria including throughput, robustness, and similarity to the original system, and uses weighted optimization to balance competing objectives during the healing process.},
why = {When hardware failures occur in large-scale embedded systems, manual reconfiguration is infeasible and systems must autonomously adapt their task allocations to maintain operation despite diminished resources. The innovation of GHOST lies in providing a systematic optimization process for finding appropriate healing actions that satisfy both system designer and user requirements. This enables autonomous systems to make intelligent recovery decisions rather than simply failing or restarting.},
results = {GHOST successfully handles both resource reduction cases (where available resources are reduced) and resource failure cases (where persistent faults block certain resources). The algorithm demonstrates healing transformations on hierarchical system structures, showing how it can redistribute tasks and management responsibilities to maintain system functionality. Examples illustrate how the optimization process balances throughput maximization against maintaining structural similarity to the original design.},
project_tags = {CPS, scalable AI, emergency}
}
Reflex and healing architectures have been shown to provide adequate user-defined initial failure mitigation behaviors in the presence of system faults. What is lacking, however, is a user-guided means of healing the system after the initial reflexes have been enacted. This process should be autonomic in the sense that new system configurations can be achieved by defining a priori only a small set of criteria to which the healed system should conform. What follows is an explanation of this technique for guided healing, which allows system designers to direct the healing process from a higher level in such a way that the resulting system configurations satisfy their particular needs. A brief example outlining the application of this approach is given.
@inproceedings{Keskinpala2006,
author = {Keskinpala, Turker and Dubey, Abhishek and Nordstrom, Steve and Bapty, Ted and Neema, Sandeep},
booktitle = {Systems Testing and Validation},
title = {A Model Driven Tool for Automated System Level Testing of Middleware},
year = {2006},
pages = {19},
category = {conference},
contribution = {minor},
file = {:Keskinpala2006-A_Model_Driven_Tool_for_Automated_System_Level_Testing_of_Middleware.pdf:PDF},
keywords = {model-driven testing, middleware testing, test configuration, domain-specific modeling, distributed systems, test generation, Model Integrated Computing},
internal-note = {Annotations were shifted by one entry in the source file; keywords/what/why/results restored to match this paper (they describe the MIC-based testing tool for XDAQ, matching this title and abstract).},
what = {This paper develops a model-driven tool for automated system-level testing of distributed middleware frameworks like XDAQ. The approach uses Model Integrated Computing (MIC) principles to automatically generate test configurations and deployment scenarios from high-level system specifications. The tool combines behavioral modeling with resource and deployment specifications to create comprehensive test suites that exercise different system configurations without manual effort.},
why = {Testing distributed middleware frameworks requires creating many different configuration combinations to exercise various component interactions and resource allocations. Manual creation of these test configurations is error-prone, time-consuming, and cannot scale to cover all meaningful combinations. The innovation lies in applying model-driven techniques to automatically derive test cases from system models, improving test coverage and reducing manual testing effort.},
results = {The paper presents a testing tool infrastructure that uses Model Integrated Computing to generate test configuration models from domain-specific specifications. The tool successfully generates XML configuration files and test scripts for the XDAQ middleware framework used in high-energy physics experiments. The approach demonstrates how model-driven techniques can automate test generation for complex distributed systems.},
project_tags = {middleware, CPS}
}
This paper presents a contribution to the challenges of manually creating test configurations and deployments for high performance distributed middleware frameworks. We present our testing tool based on the Model Integrated Computing (MIC) paradigm and describe and discuss its generative abilities that can be used to generate many test configurations and deployment scenarios from high-level system specifications through model replication.
@inproceedings{Dubey2005,
author       = {Dubey, Abhishek and Wu, X. and Su, H. and Koo, T. J.},
title        = {Computation Platform for Automatic Analysis of Embedded Software Systems Using Model Based Approach},
booktitle    = {Automated Technology for Verification and Analysis},
editor       = {Peled, Doron A. and Tsay, Yih-Kuen},
publisher    = {Springer Berlin Heidelberg},
address      = {Berlin, Heidelberg},
pages        = {114--128},
year         = {2005},
isbn         = {978-3-540-31969-6},
category     = {selectiveconference},
contribution = {lead},
file         = {:Dubey2005-Computation_Platform_for_Automatic_Analysis_of_Embedded_Software_Systems_Using_Model_Based_Approach.pdf:PDF},
keywords     = {hybrid systems, embedded systems analysis, model-integrated computing, domain-specific language, algorithmic specifications, reachability analysis},
project      = {cps-reliability},
tag          = {platform},
what         = {This paper presents ReachLab, a computation platform for automatic analysis of embedded software systems using model-based approach. The platform uses the Hybrid System Analysis and Design Language (HADL) as a meta-model to enable design of analysis algorithms that can be reused across different system domains. ReachLab separates algorithm design from implementation details through domain-specific modeling, allowing researchers to specify analysis algorithms that are automatically executed using various computational kernels.},
why          = {Analyzing hybrid embedded systems requires techniques that combine continuous and discrete dynamics analysis, but implementing these algorithms for different system models is time-consuming and error-prone. The innovation of ReachLab is applying Model Integrated Computing principles to enable algorithm designers to work with abstract system models while automatic code generation handles translation to specific computational tools. This enables broader adoption of advanced analysis techniques.},
results      = {ReachLab successfully implements a platform where analysis algorithms specified in HADL can be executed against different hybrid system models using both symbolic and reachability analysis methods. The platform demonstrates support for multiple analysis techniques including d/dt kernels for computing reachable sets and Level Set methods for symbolic analysis. Experimental examples show how the platform separates algorithm design from specific implementation details.},
project_tags = {CPS, ML for CPS, scalable AI}
}
In this paper, we describe a computation platform called ReachLab, which enables automatic analysis of embedded software systems that interact with a continuous environment. Algorithms are used to specify how the state space of the system model should be explored in order to perform analysis. In ReachLab, both system models and analysis algorithm models are specified in the same framework using Hybrid System Analysis and Design Language (HADL), which is a meta-model based language. The platform allows the models of algorithms to be constructed hierarchically and promotes their reuse in constructing more complex algorithms. Moreover, the platform is designed in such a way that the concerns of design and implementation of analysis algorithms are separated. On one hand, the models of analysis algorithms are abstract and therefore the design of algorithms can be made independent of implementation details. On the other hand, translators are provided to automatically generate implementations from the models for computing analysis results based on computation kernels. Multiple computation kernels, which are based on specific computation tools such as d/dt and the Level Set toolbox, are supported and can be chosen to enable hybrid state space exploration. An example is provided to illustrate the design and implementation process in ReachLab.