% PhD thesis record. The degree-granting institution is stored in `school`,
% the field @phdthesis requires; the citation key is unchanged.
@phdthesis{17376,
  author    = {Xu, Qinghua},
  title     = {Traversing the Data Spectrum: Path to Dependable Cyber-Physical Systems through Digital Twins},
  school    = {University of Oslo},
  address   = {Oslo, Norway},
  publisher = {University of Oslo},
  year      = {2023},
  abstract  = {Cyber-physical Systems (CPSs) play an essential role in realizing the Industry 4.0 vision. Intensive studies from academia and industry, along with the advance of hardware and software, spur the burgeoning development of various CPSs, such as smart cities, autonomous driving systems, and water treatment plants. Nowadays, CPSs have evolved to be complex, heterogeneous, and integrated to provide rich functionalities, which inevitably exposes CPSs to various threats. Most traditional techniques become obsolete when facing new threats; therefore, it is paramount for researchers and practitioners to develop novel techniques for CPSs. One key disadvantage of most existing techniques is the necessity of interacting with real CPSs, which might interfere with their normal operations. To reduce such interactions, Digital Twins (DTs) have emerged as an effective solution for enhancing CPS dependability. The key idea of DTs is to simulate CPSs in real-time, enabling interactions with simulations instead of the real CPSs. Following this research line, this thesis aims to explore the concept of DT and to construct data-driven DTs to improve the dependability of CPS. The effectiveness of a data-driven DT hinges on the quality of the training data. Given a specific task, we consider a spectrum of data, which spans from domain-specific data to domain-related data, and finally to domain-agnostic data, signifying the decreasing data quality. Ideally, a data-driven DT can be well established with data collected specifically for the target task (domain-specific data). However, sufficient domain-specific data cannot always be guaranteed, e.g., only limited data can be collected from a newly deployed CPS.
Under such circumstances, data collected from related domains (domain-related data, e.g., data collected from previous versions of CPSs) can provide additional information for DT construction. In addition to domain-specific and domain-related data, generic data not directly related to the target task (domain-agnostic, e.g., log data of CPS normal operations) can also provide noisy yet useful information for DT construction. In this thesis, we conducted three works (six papers, i.e., Papers I, II, III, IV, V, and VI) to construct DTs with domain-specific, domain-related, and domain-agnostic data, respectively. In Work I, we first proposed and implemented a digital twin framework with domain-specific data (Paper I). To optimize the training process of DT, we utilize the concept of curriculum learning to mimic a human learning process by arranging the data in an easy-to-difficult curriculum (Paper II). In Work II, we aim to construct DTs with domain-related data. We utilize transfer learning to transfer knowledge from related domains to the target domain. Specifically, we cover both classification (Paper III) and regression tasks (Paper IV) in CPS dependability. Furthermore, we take inspiration from the success of large language models and improve the regression DT with prompt learning (Paper V). In Work III, we aim to further release DT from domain-specific data by utilizing knowledge distillation to extract knowledge from domain-agnostic data (Paper VI). Evaluation results in the three works demonstrate our proposed DT framework can enhance the dependability of CPS by taking advantage of domain-specific, domain-related, and domain-agnostic data. Papers I and II studied the anomaly detection task in public CPS testbeds (domain-specific data) and offered a neural network-based DT solution with state-of-the-art effectiveness.
With Work II, we successfully applied our DT framework to open source datasets, i.e., DeepScenario autonomous driving systems dataset, and industrial case studies, i.e., Orona elevators (Papers III and IV) and a cancer registry system from Cancer Registry of Norway (CRN) (Paper V). Notice that the cancer registry system is software rather than a CPS. With this software system, we aspire to apply our method beyond CPSs. We found that transfer learning effectively improves the performance of DT in both classification (i.e., predicting the validation results of the cancer registry system) and regression tasks (i.e., predicting passenger waiting time for Orona elevator). In Work III, we evaluated our approach with an industrial case study from Alstom and discovered that knowledge distillation is effective in extracting knowledge from domain-agnostic data (Paper VI).},
}