时间轴

2025-11-23

  1. init

环境

1
2
3
4
5
6
# Fetch and unpack the QEMU 10.1.2 source tarball.
wget https://download.qemu.org/qemu-10.1.2.tar.xz
tar xvJf qemu-10.1.2.tar.xz
cd qemu-10.1.2
mkdir -p output
# Quote the prefix so configure still works when the path contains spaces.
./configure --prefix="$PWD/output" --target-list=aarch64-softmmu,riscv64-softmmu --enable-debug
# Run the build under bear so the compiler invocations are recorded for clangd.
bear -- make -j"$(nproc)"

创建.clangd

1
2
3
# .clangd: Add and Remove must be nested under CompileFlags.
CompileFlags:
  Add: -Wno-unknown-warning-option
  Remove: [-m*, -f*]

gdb

1
$ gdb -args ./build/qemu-system-riscv64 -M virt -device edu,id=edu1 -nographic

Machine建模

QEMU 创建虚拟机的核心流程可分为 6 个阶段,每个阶段都涉及特定的组件和操作

命令行解析

入口函数:main() 关键操作:

  • 解析 -M 参数指定机器类型(如 virt)
  • 初始化 QOM(QEMU 对象模型)类型系统
  • 创建全局状态机 MachineState

system/main.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#include "qemu/osdep.h"
#include "qemu-main.h"
#include "qemu/main-loop.h"
#include "system/replay.h"
#include "system/system.h"


/*
 * Default main-loop body.
 *
 * Takes the replay mutex and then the BQL, runs qemu_main_loop() until it
 * returns, passes the resulting status to qemu_cleanup(), releases both locks
 * in reverse order and finally terminates the process with that status.
 * The opaque argument is not used. The function never returns; its
 * void *(void *) shape lets main() also use it as a thread start routine.
 */
static void *qemu_default_main(void *opaque)
{
int status;

replay_mutex_lock();
bql_lock();
status = qemu_main_loop();
qemu_cleanup(status);
bql_unlock();
replay_mutex_unlock();

exit(status);
}

/*
 * Optional platform hook. When it is non-NULL, main() starts the default
 * main loop on a separate thread and calls this function on the initial
 * thread instead.
 */
int (*qemu_main)(void);

#ifdef CONFIG_DARWIN
/* Darwin hook: runs CFRunLoopRun(), which is not expected to return. */
static int os_darwin_cfrunloop_main(void)
{
CFRunLoopRun();
g_assert_not_reached();
}
int (*qemu_main)(void) = os_darwin_cfrunloop_main;
#endif


/*
 * Program entry point.
 *
 * Runs qemu_init() on the command line, then drops the BQL and the replay
 * mutex (presumably still held when qemu_init() returns - qemu_default_main()
 * re-acquires both). If a platform hook is installed in qemu_main, the main
 * loop runs on a detached "qemu_main" thread and the hook's return value
 * becomes the exit code; otherwise the main loop runs on this thread and
 * never returns.
 */
int main(int argc, char **argv)
{
qemu_init(argc, argv);

bql_unlock();
replay_mutex_unlock();

if (qemu_main) {
QemuThread main_loop_thread;
qemu_thread_create(&main_loop_thread, "qemu_main",
qemu_default_main, NULL, QEMU_THREAD_DETACHED);
return qemu_main();
} else {
qemu_default_main(NULL);
g_assert_not_reached();
}
}

system/vl.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
 * Abridged excerpt of qemu_init(): only the -M / -machine handling is shown.
 * Elided code is marked with "..." and is not valid C.
 */
void qemu_init(int argc, char **argv) {
...
machine_opts_dict = qdict_new(); // dictionary that collects the machine options
...
case QEMU_OPTION_M: // handle the -M option
case QEMU_OPTION_machine:
{
bool help;

/* Parse the option string into the dictionary; a bare value becomes "type". */
keyval_parse_into(machine_opts_dict, optarg, "type", &help, &error_fatal);
if (help) {
machine_help_func(machine_opts_dict);
exit(EXIT_SUCCESS);
}
break;
}
..
qemu_create_machine(machine_opts_dict); // create the machine instance
}

Machine类型选择与注册

核心组件:MachineClass(机器类描述)

关键操作:

  • 通过 select_machine() 匹配用户指定的机器类型
  • 从全局类型表 type_table 查找对应的 TypeImpl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
// system/vl.c
/*
 * Pick the MachineClass to instantiate.
 *
 * If qdict has a "type" entry, the matching machine is looked up and the
 * entry is removed from the dictionary; otherwise the default machine is
 * used. Returns NULL with *errp set (plus a "-machine help" hint) when no
 * class is found.
 */
static MachineClass *select_machine(QDict *qdict, Error **errp)
{
ERRP_GUARD();
const char *machine_type = qdict_get_try_str(qdict, "type");
g_autoptr(GSList) machines = object_class_get_list(TYPE_MACHINE, false);
MachineClass *machine_class = NULL;

if (machine_type) {
machine_class = find_machine(machine_type, machines);
if (!machine_class) {
error_setg(errp, "unsupported machine type: \"%s\"", machine_type);
}
/* "type" is consumed here, whether or not the lookup succeeded. */
qdict_del(qdict, "type");
} else {
machine_class = find_default_machine(machines);
if (!machine_class) {
error_setg(errp, "No machine specified, and there is no default");
}
}

if (!machine_class) {
error_append_hint(errp,
"Use -machine help to list supported machines\n");
}
return machine_class;
}


// qom/object.c
GSList *object_class_get_list(const char *implements_type) {
g_hash_table_foreach(type_table, object_class_foreach_tramp, &data);
// 遍历所有已注册的机器类型
}

Machine对象实例化

核心组件:MachineState(机器运行时状态)

关键操作:

  • 通过 QOM 的 object_new_with_class() 实例化机器
  • 初始化内存、CPU 拓扑等基础属性
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// system/vl.c
/*
 * Instantiate the machine chosen by select_machine() and publish it as
 * current_machine.
 *
 * The new object is attached to the QOM root as "machine", its containers are
 * created, and the default sysbus is parented under the "unattached"
 * container. Any failure in select_machine() is fatal (&error_fatal).
 */
static void qemu_create_machine(QDict *qdict)
{
MachineClass *machine_class = select_machine(qdict, &error_fatal);
object_set_machine_compat_props(machine_class->compat_props);

current_machine = MACHINE(object_new_with_class(OBJECT_CLASS(machine_class)));
object_property_add_child(object_get_root(), "machine",
OBJECT(current_machine));
qemu_create_machine_containers(OBJECT(current_machine));
object_property_add_child(machine_get_container("unattached"),
"sysbus", OBJECT(sysbus_get_default()));

if (machine_class->minimum_page_bits) {
if (!set_preferred_target_page_bits(machine_class->minimum_page_bits)) {
/* This would be a board error: specifying a minimum smaller than
* a target's compile-time fixed setting.
*/
g_assert_not_reached();
}
}

cpu_exec_init_all();

if (machine_class->hw_version) {
qemu_set_hw_version(machine_class->hw_version);
}

/*
* Get the default machine options from the machine if it is not already
* specified either by the configuration file or by the command line.
*/
if (machine_class->default_machine_opts) {
QDict *default_opts =
keyval_parse(machine_class->default_machine_opts, NULL, NULL,
&error_abort);
qemu_apply_legacy_machine_options(default_opts);
object_set_properties_from_keyval(OBJECT(current_machine), default_opts,
false, &error_abort);
qobject_unref(default_opts);
}
}


// hw/core/machine.c

/*
 * QOM instance initializer for MachineState.
 *
 * Fills in per-instance defaults taken from the MachineClass (RAM size, CPU
 * count), sets every SMP topology level to 1, and registers the optional
 * "nvdimm", "nvdimm-persistence", "hmat" and "spcr" properties.
 */
static void machine_initfn(Object *obj)
{
MachineState *ms = MACHINE(obj);
MachineClass *mc = MACHINE_GET_CLASS(obj);

ms->dump_guest_core = true;
ms->mem_merge = (QEMU_MADV_MERGEABLE != QEMU_MADV_INVALID);
ms->enable_graphics = true;
ms->kernel_cmdline = g_strdup("");
ms->ram_size = mc->default_ram_size; // default RAM size
ms->maxram_size = mc->default_ram_size;

if (mc->nvdimm_supported) {
ms->nvdimms_state = g_new0(NVDIMMState, 1);
object_property_add_bool(obj, "nvdimm",
machine_get_nvdimm, machine_set_nvdimm);
object_property_set_description(obj, "nvdimm",
"Set on/off to enable/disable "
"NVDIMM instantiation");

object_property_add_str(obj, "nvdimm-persistence",
machine_get_nvdimm_persistence,
machine_set_nvdimm_persistence);
/*
 * NOTE(review): the two literals below are concatenated without a
 * separator, so the description reads "...persistenceValid values...".
 */
object_property_set_description(obj, "nvdimm-persistence",
"Set NVDIMM persistence"
"Valid values are cpu, mem-ctrl");
}

/* NUMA state (and the "hmat" property) exist only when the class provides both CPU/node hooks. */
if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) {
ms->numa_state = g_new0(NumaState, 1);
object_property_add_bool(obj, "hmat",
machine_get_hmat, machine_set_hmat);
object_property_set_description(obj, "hmat",
"Set on/off to enable/disable "
"ACPI Heterogeneous Memory Attribute "
"Table (HMAT)");
}

/* SPCR */
ms->acpi_spcr_enabled = true;
object_property_add_bool(obj, "spcr", machine_get_spcr, machine_set_spcr);
object_property_set_description(obj, "spcr",
"Set on/off to enable/disable "
"ACPI Serial Port Console Redirection "
"Table (spcr)");

/* default to mc->default_cpus */
ms->smp.cpus = mc->default_cpus; // default CPU count
ms->smp.max_cpus = mc->default_cpus;
ms->smp.drawers = 1;
ms->smp.books = 1;
ms->smp.sockets = 1;
ms->smp.dies = 1;
ms->smp.clusters = 1;
ms->smp.modules = 1;
ms->smp.cores = 1;
ms->smp.threads = 1;

/* Every cache level/type starts at the default topology level. */
for (int i = 0; i < CACHE_LEVEL_AND_TYPE__MAX; i++) {
ms->smp_cache.props[i].cache = (CacheLevelAndType)i;
ms->smp_cache.props[i].topology = CPU_TOPOLOGY_LEVEL_DEFAULT;
}

machine_copy_boot_config(ms, &(BootConfiguration){ 0 });
}

硬件设备初始化

核心阶段:machine_run_board_init()

关键操作:

  • 调用机器专属的 init() 方法(如 RISC-V virt 的 virt_machine_init())
  • 初始化 CPU、内存控制器、总线等核心组件
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// hw/core/machine.c
/*
 * Final machine bring-up step: validate the memory configuration, finish
 * NUMA setup, check the CPU type, apply confidential-guest defaults, then
 * call the board's init() hook and advance to PHASE_MACHINE_INITIALIZED.
 *
 * On a validation failure *errp is set and the function returns before the
 * board init hook runs.
 */
void machine_run_board_init(MachineState *machine, const char *mem_path, Error **errp)
{
ERRP_GUARD();
MachineClass *machine_class = MACHINE_GET_CLASS(machine);

/* This checkpoint is required by replay to separate prior clock
reading from the other reads, because timer polling functions query
clock values from the log. */
replay_checkpoint(CHECKPOINT_INIT);

if (!xen_enabled()) {
/* On 32-bit hosts, QEMU is limited by virtual address space */
if (machine->ram_size > (2047 << 20) && HOST_LONG_BITS == 32) {
error_setg(errp, "at most 2047 MB RAM can be simulated");
return;
}
}

if (machine->memdev) {
/* An explicit memory backend must match the configured RAM size exactly. */
ram_addr_t backend_size = object_property_get_uint(OBJECT(machine->memdev),
"size", &error_abort);
if (backend_size != machine->ram_size) {
error_setg(errp, "Machine memory size does not match the size of the memory backend");
return;
}
} else if (machine_class->default_ram_id && machine->ram_size &&
numa_uses_legacy_mem()) {
/* No backend given: create the default one, unless its id is already taken. */
if (object_property_find(object_get_objects_root(),
machine_class->default_ram_id)) {
error_setg(errp, "object's id '%s' is reserved for the default"
" RAM backend, it can't be used for any other purposes",
machine_class->default_ram_id);
error_append_hint(errp,
"Change the object's 'id' to something else or disable"
" automatic creation of the default RAM backend by setting"
" 'memory-backend=%s' with '-machine'.\n",
machine_class->default_ram_id);
return;
}

if (!machine_class->create_default_memdev(current_machine, mem_path,
errp)) {
return;
}
}

if (machine->numa_state) {
numa_complete_configuration(machine);
if (machine->numa_state->num_nodes) {
machine_numa_finish_cpu_init(machine);
if (machine_class->cpu_cluster_has_numa_boundary) {
validate_cpu_cluster_to_numa_boundary(machine);
}
}
}

if (!machine->ram && machine->memdev) {
machine->ram = machine_consume_memdev(machine, machine->memdev);
}

/* Check if the CPU type is supported */
if (machine->cpu_type && !is_cpu_type_supported(machine, errp)) {
return;
}

if (machine->cgs) {
/*
* With confidential guests, the host can't see the real
* contents of RAM, so there's no point in it trying to merge
* areas.
*/
machine_set_mem_merge(OBJECT(machine), false, &error_abort);

/*
* Virtio devices can't count on directly accessing guest
* memory, so they need iommu_platform=on to use normal DMA
* mechanisms. That requires also disabling legacy virtio
* support for those virtio pci devices which allow it.
*/
object_register_sugar_prop(TYPE_VIRTIO_PCI, "disable-legacy",
"on", true);
object_register_sugar_prop(TYPE_VIRTIO_DEVICE, "iommu_platform",
"on", false);
}

accel_init_interfaces(ACCEL_GET_CLASS(machine->accelerator));
machine_class->init(machine); // call the board-specific init function
phase_advance(PHASE_MACHINE_INITIALIZED);
}



// hw/riscv/virt.c (RISC-V example)
/*
 * Schematic outline of the board init hook, not literal code:
 * s, smp_cpus, ram_size and CPU_TYPE are not declared in this excerpt.
 * NOTE(review): every loop iteration uses the same child name "cpu", and the
 * memory_region_init_ram() call is shorter than the real API - confirm
 * against the actual source before copying.
 */
static void virt_machine_init(MachineState *machine) {
// ...
// 1. Initialize the CPU topology
for (int i = 0; i < smp_cpus; i++) {
object_initialize_child(OBJECT(machine), "cpu", &s->soc[i], CPU_TYPE);
}
//...
// 2. Initialize the memory system
memory_region_init_ram(&s->ram, "riscv_virt_board.ram", ram_size);
//...
// 3. Initialize the peripheral bus
sysbus_init_mmio(SYS_BUS_DEVICE(&s->plic), &s->plic_mmio);
//...
// 4. Create the device tree (FDT)
create_fdt(s);
//...
}

加速器与 CPU 初始化

核心组件:AccelState 和 CPUState

关键操作:

  • 根据 -accel 参数选择 KVM/TCG 加速器
  • 为每个 vCPU 创建对应的宿主线程(下文的 qemu_init_vcpu() 只负责创建线程并等待其就绪,并不会把线程绑定到物理 CPU)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
// accel/accel-system.c
/*
 * Attach accelerator accel to machine ms and run its init_machine() hook.
 *
 * The accelerator is marked as allowed before the hook runs. If the hook
 * returns a negative value, the assignment is rolled back and the accelerator
 * object is unreferenced; otherwise its compat properties are applied.
 * Returns the hook's return value.
 */
int accel_init_machine(AccelState *accel, MachineState *ms)
{
AccelClass *acc = ACCEL_GET_CLASS(accel);
int ret;
ms->accelerator = accel;
*(acc->allowed) = true;
ret = acc->init_machine(accel, ms);// initialize the accelerator
if (ret < 0) {
ms->accelerator = NULL;
*(acc->allowed) = false;
object_unref(OBJECT(accel));
} else {
object_set_accelerator_compat_props(acc->compat_props);
}
return ret;
}


// accel/kvm/kvm-all.c (KVM example, heavily simplified)
/*
 * Sketch of the two system calls at the heart of KVM start-up.
 * Error handling is left out for brevity.
 */
static void kvm_init(MachineState *ms) {
    /* Open the KVM control device. */
    kvm_state->fd = open("/dev/kvm", O_RDWR);
    /*
     * Create the VM. KVM_CREATE_VM takes the machine type as its argument;
     * 0 selects the default type.
     */
    kvm_state->vmfd = ioctl(kvm_state->fd, KVM_CREATE_VM, 0);
}

// system/cpus.c
/*
 * Prepare a CPUState for execution and start its vCPU thread.
 *
 * The CPU starts in the stopped state. If the target has not set up an
 * address space, a single default one named "cpu-memory" is created. The
 * thread itself is created by the accelerator's create_vcpu_thread() hook,
 * and this function then waits on qemu_cpu_cond until cpu->created is set.
 */
void qemu_init_vcpu(CPUState *cpu)
{
MachineState *ms = MACHINE(qdev_get_machine());

cpu->nr_threads = ms->smp.threads;
cpu->stopped = true;
cpu->random_seed = qemu_guest_random_seed_thread_part1();

if (!cpu->as) {
/* If the target cpu hasn't set up any address spaces itself,
* give it the default one.
*/
cpu->num_ases = 1;
cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
}

/* accelerators all implement the AccelOpsClass */
g_assert(cpus_accel != NULL && cpus_accel->create_vcpu_thread != NULL);
cpus_accel->create_vcpu_thread(cpu);

/* The wait releases the BQL (passed as the mutex) while blocked. */
while (!cpu->created) {
qemu_cond_wait(&qemu_cpu_cond, &bql);
}
}

启动虚拟机

最终阶段:qemu_main_loop()

qemu_init中加载固件

进入最终阶段qemu_main_loop

system/runstate.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/*
 * Run main-loop iterations until main_loop_should_exit() reports that the
 * process should terminate. Returns the exit status, which stays
 * EXIT_SUCCESS unless main_loop_should_exit() overwrites it.
 */
int qemu_main_loop(void)
{
int status = EXIT_SUCCESS;

while (!main_loop_should_exit(&status)) {
main_loop_wait(false);
}

return status;
}


/*
 * Service pending debug, suspend, shutdown, reset, wakeup, powerdown and
 * vmstop requests, in that order.
 *
 * Returns true only on the shutdown path, and only when shutdown_action is
 * not SHUTDOWN_ACTION_PAUSE. In that case *status is overwritten with a
 * non-zero shutdown_exit_code, or with EXIT_FAILURE for a guest panic under
 * PANIC_ACTION_EXIT_FAILURE; otherwise *status is left untouched.
 */
static bool main_loop_should_exit(int *status)
{
RunState r;
ShutdownCause request;

if (qemu_debug_requested()) {
vm_stop(RUN_STATE_DEBUG);
}
if (qemu_suspend_requested()) {
qemu_system_suspend();
}
request = qemu_shutdown_requested();
if (request) {
qemu_kill_report();
qemu_system_shutdown(request);
if (shutdown_action == SHUTDOWN_ACTION_PAUSE) {
/* Keep the process alive with the VM stopped. */
vm_stop(RUN_STATE_SHUTDOWN);
} else {
if (shutdown_exit_code != EXIT_SUCCESS) {
*status = shutdown_exit_code;
} else if (request == SHUTDOWN_CAUSE_GUEST_PANIC &&
panic_action == PANIC_ACTION_EXIT_FAILURE) {
*status = EXIT_FAILURE;
}
return true;
}
}
request = qemu_reset_requested();
if (request) {
pause_all_vcpus();
qemu_system_reset(request);
resume_all_vcpus();
/*
* runstate can change in pause_all_vcpus()
* as iothread mutex is unlocked
*/
if (!runstate_check(RUN_STATE_RUNNING) &&
!runstate_check(RUN_STATE_INMIGRATE) &&
!runstate_check(RUN_STATE_FINISH_MIGRATE)) {
runstate_set(RUN_STATE_PRELAUNCH);
}
}
if (qemu_wakeup_requested()) {
pause_all_vcpus();
qemu_system_wakeup();
notifier_list_notify(&wakeup_notifiers, &wakeup_reason);
wakeup_reason = QEMU_WAKEUP_REASON_NONE;
resume_all_vcpus();
qapi_event_send_wakeup();
}
if (qemu_powerdown_requested()) {
qemu_system_powerdown();
}
if (qemu_vmstop_requested(&r)) {
vm_stop(r);
}
return false;
}

外设建模(PL011串口)

PL011 串口在 QEMU 中的建模完整展示了硬件外设模拟的核心流程:

  • 类型系统集成:通过 QOM 实现设备类的继承体系
  • 状态精确建模:寄存器、FIFO、中断状态的全周期管理
  • 硬件接口实现:MMIO 回调处理 CPU 访问,字符后端对接宿主 I/O
  • 中断系统集成:与平台中断控制器 (PLIC) 协同工作
  • 设备生命周期:实现重置、迁移、销毁等完整生命周期

串口

设备类型定义

核心机制:QEMU Object Model (QOM)

关键操作:

  • 定义 TypeInfo 结构体描述设备类型
  • 实现类初始化 (class_init) 和实例初始化 (instance_init)
  • 注册到全局类型系统
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
 // hw/char/pl011.c
/*
 * QOM type description for the ARM PL011: a sysbus device whose instances
 * are PL011State objects, set up by pl011_init() and pl011_class_init().
 */
static const TypeInfo pl011_arm_info = {
.name = TYPE_PL011,
.parent = TYPE_SYS_BUS_DEVICE,
.instance_size = sizeof(PL011State),
.instance_init = pl011_init,
.class_init = pl011_class_init,
};

/* Register both PL011 variants with the QOM type system. */
static void pl011_register_types(void)
{
type_register_static(&pl011_arm_info);
type_register_static(&pl011_luminary_info);// Luminary Micro variant of the PL011 (Luminary, an early ARM SoC vendor, was later acquired by TI)
}

/* Have pl011_register_types() run during type initialization. */
type_init(pl011_register_types)

设备状态结构设计

核心组件:PL011State

关键字段:

  • 寄存器状态 (lcr, imsc, 等)
  • FIFO 缓冲区
  • 中断信号线
  • 字符后端 (CharBackend)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
// hw/char/pl011.c
/* Per-instance state of the PL011 UART model. */
struct PL011State {
SysBusDevice parent_obj;

MemoryRegion iomem;
uint32_t flags;
uint32_t lcr;// line control register (UARTLCR_H)
uint32_t rsr;// receive status register (UARTRSR/UARTECR)
uint32_t cr;// control register (UARTCR)
uint32_t dmacr;
uint32_t int_enabled;// interrupt mask, written through UARTIMSC
uint32_t int_level;
uint32_t read_fifo[PL011_FIFO_DEPTH]; /* receive FIFO storage */
uint32_t ilpr;
uint32_t ibrd;
uint32_t fbrd;
uint32_t ifl;
int read_pos;
int read_count;
int read_trigger;
CharBackend chr; // character backend, bound through the "chardev" property
qemu_irq irq[6];// outgoing interrupt lines
Clock *clk;
bool migrate_clk;
const unsigned char *id;
/*
* Since some users embed this struct directly, we must
* ensure that the C struct is at least as big as the Rust one.
*/
uint8_t padding_for_rust[16];
};

设备类初始化

核心任务:

  • 设置设备属性 (properties)

  • 绑定 realize 和 reset 方法

  • 定义设备迁移支持

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// hw/char/pl011.c
/*
 * Class initializer: installs the realize and legacy reset handlers, the
 * migration description (vmstate_pl011) and the property table.
 */
static void pl011_class_init(ObjectClass *oc, const void *data)
{
DeviceClass *dc = DEVICE_CLASS(oc);

dc->realize = pl011_realize;
device_class_set_legacy_reset(dc, pl011_reset);
dc->vmsd = &vmstate_pl011;
device_class_set_props(dc, pl011_properties);
}

/*
 * User-settable properties: "chardev" selects the character backend stored
 * in chr; "migrate-clk" is a boolean that defaults to true.
 */
static const Property pl011_properties[] = {
DEFINE_PROP_CHR("chardev", PL011State, chr),
DEFINE_PROP_BOOL("migrate-clk", PL011State, migrate_clk, true),
};

设备实例化与硬件连接

核心操作:

  • 初始化内存区域 (MemoryRegion)
  • 注册 MMIO 回调函数
  • 连接中断信号
  • 绑定字符设备后端
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
// hw/char/pl011.c

/*
 * Instance initializer: exposes a 0x1000-byte MMIO region backed by
 * pl011_ops, one sysbus IRQ per entry of s->irq, and an input clock named
 * "clk", then selects the ARM identification table.
 */
static void pl011_init(Object *obj)
{
SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
PL011State *s = PL011(obj);
int i;
/* set up the MMIO region */
memory_region_init_io(&s->iomem, OBJECT(s), &pl011_ops, s, "pl011", 0x1000);
sysbus_init_mmio(sbd, &s->iomem);
for (i = 0; i < ARRAY_SIZE(s->irq); i++) {
/* export each interrupt line */
sysbus_init_irq(sbd, &s->irq[i]);
}

s->clk = qdev_init_clock_in(DEVICE(obj), "clk", pl011_clock_update, s,
ClockUpdate);

s->id = pl011_id_arm;
}
// hw/char/pl011.c
/*
 * Realize handler: registers the can-receive, receive and event callbacks
 * on the character backend, with the device state as their opaque argument.
 */
static void pl011_realize(DeviceState *dev, Error **errp)
{
PL011State *s = PL011(dev);

/* hook up the character backend */
qemu_chr_fe_set_handlers(&s->chr, pl011_can_receive, pl011_receive,
pl011_event, NULL, s, NULL, true);
}

实现设备功能逻辑

核心组件:

  • MMIO 读写回调
  • 中断生成逻辑
  • FIFO 管理
  • 字符设备通信
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
/*
 * MMIO write handler. Registers are selected by word index (offset >> 2);
 * writes to unknown offsets are logged as guest errors.
 */
static void pl011_write(void *opaque, hwaddr offset,
uint64_t value, unsigned size)
{
PL011State *s = (PL011State *)opaque;
unsigned char ch;

trace_pl011_write(offset, value, pl011_regname(offset));

switch (offset >> 2) {
case 0: /* UARTDR */ // data register
/* Only the low byte of the written value is transmitted. */
ch = value;
pl011_write_txdata(s, ch);
break;
case 1: /* UARTRSR/UARTECR */
/* Any write clears the receive status. */
s->rsr = 0;
break;
case 6: /* UARTFR */
/* Writes to Flag register are ignored. */
break;
case 8: /* UARTILPR */
s->ilpr = value;
break;
case 9: /* UARTIBRD */
s->ibrd = value & IBRD_MASK;
pl011_trace_baudrate_change(s);
break;
case 10: /* UARTFBRD */
s->fbrd = value & FBRD_MASK;
pl011_trace_baudrate_change(s);
break;
case 11: /* UARTLCR_H */
/* Reset the FIFO state on FIFO enable or disable */
if ((s->lcr ^ value) & LCR_FEN) {
pl011_reset_rx_fifo(s);
pl011_reset_tx_fifo(s);
}
/* Forward a change of the break bit to the backend and the loopback path. */
if ((s->lcr ^ value) & LCR_BRK) {
int break_enable = value & LCR_BRK;
qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_BREAK,
&break_enable);
pl011_loopback_break(s, break_enable);
}
s->lcr = value;
pl011_set_read_trigger(s);
break;
case 12: /* UARTCR */
/* ??? Need to implement the enable bit. */
s->cr = value;
pl011_loopback_mdmctrl(s);
break;
case 13: /* UARTIFLS */
s->ifl = value;
pl011_set_read_trigger(s);
break;
case 14: /* UARTIMSC */
s->int_enabled = value;
pl011_update(s);
break;
case 17: /* UARTICR */
/* Writing 1 to a bit clears the corresponding pending interrupt. */
s->int_level &= ~value;
pl011_update(s);
break;
case 18: /* UARTDMACR */
s->dmacr = value;
if (value & 3) {
qemu_log_mask(LOG_UNIMP, "pl011: DMA not implemented\n");
}
break;
default:
qemu_log_mask(LOG_GUEST_ERROR,
"pl011_write: Bad offset 0x%x\n", (int)offset);
}
}


/*
 * Character-backend receive callback: pushes each of the size bytes in buf
 * into the receive FIFO, unless loopback mode is enabled.
 */
static void pl011_receive(void *opaque, const uint8_t *buf, int size)
{
trace_pl011_receive(size);
/*
* In loopback mode, the RX input signal is internally disconnected
* from the entire receiving logics; thus, all inputs are ignored,
* and BREAK detection on RX input signal is also not performed.
*/
if (pl011_loopback_enabled(opaque)) {
return;
}

for (int i = 0; i < size; i++) {
pl011_fifo_rx_put(opaque, buf[i]);
}
}

中断处理与状态更新

1
2
3
4
5
6
7
8
9
10
11
/*
 * Recompute all outgoing interrupt lines. A source is active when it is
 * both pending (int_level) and enabled (int_enabled); line i is driven high
 * when any active source is selected by irqmask[i], and low otherwise.
 */
static void pl011_update(PL011State *s)
{
uint32_t flags;
int i;

flags = s->int_level & s->int_enabled;
trace_pl011_irq_state(flags != 0);
for (i = 0; i < ARRAY_SIZE(s->irq); i++) {
qemu_set_irq(s->irq[i], (flags & irqmask[i]) != 0); /* update the interrupt line */
}
}

一个简单 PL011 设备建模的代码示例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/* hw/char/pl011.c - PL011 UART 完整实现 */

#define TYPE_PL011 "pl011"
#define PL011(obj) OBJECT_CHECK(PL011State, (obj), TYPE_PL011)

/* Register byte offsets within the MMIO region */
enum {
PL011_DR = 0x00, // data register
PL011_FR = 0x18, // flag register
PL011_ILPR = 0x20, // low-power counter register
PL011_IBRD = 0x24, // integer baud-rate divisor
PL011_FBRD = 0x28, // fractional baud-rate divisor
PL011_LCRH = 0x2C, // line control
PL011_CR = 0x30, // control register
PL011_IMSC = 0x38, // interrupt mask
};

/* Device state of the simplified PL011 model */
typedef struct PL011State {
SysBusDevice parent_obj;
MemoryRegion iomem;
CharBackend chr;
qemu_irq irq;

/* Register state */
uint32_t readbuff;
uint32_t flags;
uint32_t lcr;
uint32_t cr;
uint32_t imsc;
uint32_t int_level;

/* Receive FIFO state: a ring buffer indexed by read_pos and read_count */
uint8_t read_fifo[PL011_FIFO_DEPTH];
uint32_t read_pos;
uint32_t read_count;
uint32_t read_trigger;
} PL011State;

/* MMIO 操作回调 */
static const MemoryRegionOps pl011_ops = {
.read = pl011_read,
.write = pl011_write,
.endianness = DEVICE_NATIVE_ENDIAN,
.impl.min_access_size = 4,
.impl.max_access_size = 4,
};

/*
 * MMIO read handler.
 *
 * FR and IMSC return the stored register value. DR pops the oldest byte
 * from the receive FIFO and refreshes the interrupt line; it reads as 0
 * when the FIFO is empty. Every other offset reads as 0.
 */
static uint64_t pl011_read(void *opaque, hwaddr offset, unsigned size) {
    PL011State *uart = opaque;

    if (offset == PL011_FR) {
        return uart->flags;
    }
    if (offset == PL011_IMSC) {
        return uart->imsc;
    }
    if (offset != PL011_DR || uart->read_count == 0) {
        return 0;
    }

    /* Pop the head of the ring buffer. */
    uint32_t head = uart->read_pos;
    uint32_t data = uart->read_fifo[head];

    uart->read_pos = (head + 1) % PL011_FIFO_DEPTH;
    uart->read_count--;
    pl011_update(uart); /* the FIFO level changed: refresh the interrupt state */
    return data;
}

/*
 * MMIO write handler.
 *
 * DR sends the low byte of value to the character backend, LCRH and IMSC
 * store the value (IMSC also refreshes the interrupt line). Writes to any
 * other offset are ignored.
 */
static void pl011_write(void *opaque, hwaddr offset, uint64_t value, unsigned size) {
    PL011State *s = opaque;
    uint8_t ch;

    switch (offset) {
    case PL011_DR:
        /*
         * Truncate explicitly. Reading the first byte of value through a
         * pointer cast would pick the most significant byte on a big-endian
         * host.
         */
        ch = (uint8_t)value;
        qemu_chr_fe_write(&s->chr, &ch, 1);
        break;
    case PL011_LCRH:
        s->lcr = value;
        break;
    case PL011_IMSC:
        s->imsc = value;
        pl011_update(s);
        break;
    default:
        /* Register not modelled in this simplified example. */
        break;
    }
}

/* 字符设备接收回调 */
static void pl011_receive(void *opaque, const uint8_t *buf, int size) {
PL011State *s = opaque;

if (s->read_count < PL011_FIFO_DEPTH) {
int slot = (s->read_pos + s->read_count) % PL011_FIFO_DEPTH;
s->read_fifo[slot] = *buf;
s->read_count++;

if (s->read_count >= s->read_trigger) {
s->int_level |= INT_RX;
qemu_set_irq(s->irq, 1);
}
}
}

/*
 * Recompute the single interrupt line from the mask register and the
 * current FIFO state.
 */
static void pl011_update(PL011State *s) {
    uint32_t int_level = 0;

    /* Receive interrupt (RX): unmasked and data waiting in the FIFO. */
    if ((s->imsc & INT_RX) && (s->read_count > 0)) {
        int_level |= INT_RX;
    }

    /*
     * Transmit interrupt (TX): this model hands every byte to the backend
     * immediately in pl011_write() and keeps no transmit FIFO (PL011State
     * has no xmit_count field), so there is always room to transmit and
     * only the mask bit matters.
     */
    if (s->imsc & INT_TX) {
        int_level |= INT_TX;
    }

    qemu_set_irq(s->irq, int_level ? 1 : 0);
}

/*
 * Device reset: empties the receive FIFO, restores the modelled registers
 * to their reset values and lowers the interrupt line.
 */
static void pl011_reset(DeviceState *dev) {
    PL011State *uart = PL011(dev);

    /* Receive FIFO back to empty. */
    uart->read_pos = 0;
    uart->read_count = 0;

    /* No interrupt enabled, none pending. */
    uart->imsc = 0;
    uart->int_level = 0;

    /* Line control and control registers. */
    uart->lcr = 0;
    uart->cr = 0x300;

    qemu_set_irq(uart->irq, 0);
}

/*
 * Device realize: creates the 0x1000-byte MMIO region, exports it and the
 * single IRQ on the system bus, and registers the receive callbacks on the
 * character backend (no event callback is installed).
 * NOTE(review): pl011_can_receive is not defined in this sample.
 */
static void pl011_realize(DeviceState *dev, Error **errp) {
PL011State *s = PL011(dev);
SysBusDevice *sbd = SYS_BUS_DEVICE(dev);

memory_region_init_io(&s->iomem, OBJECT(s), &pl011_ops, s, "pl011", 0x1000);
sysbus_init_mmio(sbd, &s->iomem);
sysbus_init_irq(sbd, &s->irq);

qemu_chr_fe_set_handlers(&s->chr, pl011_can_receive, pl011_receive,
NULL, NULL, s, NULL, true);
}

/*
 * QOM type description: a sysbus device whose instances are PL011State.
 * NOTE(review): pl011_class_init is not defined in this sample.
 */
static const TypeInfo pl011_info = {
.name = TYPE_PL011,
.parent = TYPE_SYS_BUS_DEVICE,
.instance_size = sizeof(PL011State),
.class_init = pl011_class_init,
};

/* Register the sample PL011 type; type_init() arranges for this to run during type initialization. */
static void pl011_register_types(void) {
type_register_static(&pl011_info);
}
type_init(pl011_register_types)

在 RISC-V virt 机器中实例化 PL011:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/* hw/riscv/virt.c - machine initialization (illustrative sketch) */

/*
 * Shows how a board would create, map and wire up a PL011 and describe it
 * in the device tree. fdt and plic_irq are not declared in this excerpt.
 * NOTE(review): the "reg" property is written as two cells; that is only
 * correct if the parent node uses one address cell and one size cell -
 * confirm against the board's device tree.
 */
static void virt_machine_init(MachineState *machine) {
// ...

/* Create the PL011 UART */
DeviceState *dev = qdev_new(TYPE_PL011);
qdev_prop_set_chr(dev, "chardev", serial_hd(0));

sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, 0x10000000); // map MMIO region 0 at 0x10000000
sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, plic_irq[UART0_IRQ]); // connect the interrupt line

// Add the device-tree node
qemu_fdt_add_subnode(fdt, "/soc/serial");
qemu_fdt_setprop_string(fdt, "/soc/serial", "compatible", "arm,pl011");
qemu_fdt_setprop_cells(fdt, "/soc/serial", "reg", 0x10000000, 0x1000);
qemu_fdt_setprop_cells(fdt, "/soc/serial", "interrupts", UART0_IRQ);
}

关键实现要点

寄存器精确建模:

  • 实现所有 32 个寄存器的精确行为
  • 处理特殊位(如 FIFO 使能位、中断屏蔽位)
  • 支持波特率计算(IBRD/FBRD)

FIFO 状态机:

1
2
3
4
5
6
7
8
9
10
11
12
13
/*
 * Append value to the receive FIFO (a ring buffer addressed by read_pos and
 * read_count). The value is dropped when the FIFO is full. Once the fill
 * level reaches read_trigger, the RX interrupt is marked pending and the
 * IRQ line is raised.
 */
static void fifo_push(PL011State *s, uint32_t value) {
    if (s->read_count < PL011_FIFO_DEPTH) {
        uint32_t slot = (s->read_pos + s->read_count) % PL011_FIFO_DEPTH;
        s->read_fifo[slot] = value;
        s->read_count++;

        /* Interrupt trigger condition */
        if (s->read_count >= s->read_trigger) {
            s->int_level |= INT_RX;
            qemu_set_irq(s->irq, 1);
        }
    }
}

时钟域处理:

  • 实现 pl011_clock_update 回调
  • 处理波特率变化事件
  • 支持动态时钟调整

迁移支持:

1
2
3
4
5
6
7
8
9
10
11
/*
 * Migration description: the fields listed here are saved and restored
 * across migration.
 * NOTE(review): each VMSTATE_* macro must match the C type of its field
 * (UINT32 array for read_fifo, UINT32 for read_pos and read_count) -
 * confirm against the PL011State definition this table is used with.
 */
static const VMStateDescription vmstate_pl011 = {
    .name = "pl011",
    .version_id = 2,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(lcr, PL011State),
        VMSTATE_UINT32_ARRAY(read_fifo, PL011State, PL011_FIFO_DEPTH),
        VMSTATE_UINT32(read_pos, PL011State),
        VMSTATE_UINT32(read_count, PL011State),
        VMSTATE_END_OF_LIST()
    }
};

性能优化:

  • 使用位操作代替除法计算波特率
  • FIFO 操作用掩码替代取模运算
  • 中断状态缓存减少计算次数

这种建模方式使 PL011 在 QEMU 中能精确模拟真实硬件行为,支持从裸机程序到 Linux 内核的全栈开发。

外设建模(edu设备)

在计算机系统中,CPU 与外设的交互本质是 地址访问 和 中断通知:

  • 地址访问:CPU 通过 MMIO(内存映射 I/O)或 PIO(端口 I/O)读写设备寄存器
  • 中断通知:外设通过中断控制器向 CPU 发送事件信号
1
2
3
4
5
6
7
8
// 伪代码:CPU 与设备交互流程
while (1) {
if (中断触发) {
读取设备状态寄存器;
处理数据;
}
写入控制寄存器启动操作;
}

edu 设备介绍

edu 是 QEMU 官方提供的教学用虚拟设备,其功能设计精简却完整:

  • 基础功能:实现阶乘计算(fact 寄存器)
  • 高级功能:DMA 传输、中断触发
  • 典型应用:演示 PCI 设备全生命周期管理

定义设备属性

关键点:通过继承 PCIDevice 自动获得 PCI 配置空间管理能力。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* Per-instance state of the edu PCI device. */
struct EduState {
PCIDevice pdev; // parent object: the PCI device base class
MemoryRegion mmio;// MMIO region

QemuThread thread; // background factorial thread
QemuMutex thr_mutex;// protects fact and the thread's wait condition
QemuCond thr_cond;
bool stopping; // asks the thread to exit

uint32_t addr4;
uint32_t fact;
#define EDU_STATUS_COMPUTING 0x01
#define EDU_STATUS_IRQFACT 0x80
uint32_t status; // status register (bit0 = computing, bit7 = raise an interrupt when the factorial is done)

uint32_t irq_status;

#define EDU_DMA_RUN 0x1
#define EDU_DMA_DIR(cmd) (((cmd) & 0x2) >> 1)
# define EDU_DMA_FROM_PCI 0
# define EDU_DMA_TO_PCI 1
#define EDU_DMA_IRQ 0x4
/* DMA registers: source, destination, byte count and command word. */
struct dma_state {
dma_addr_t src;
dma_addr_t dst;
dma_addr_t cnt;
dma_addr_t cmd;
} dma;
QEMUTimer dma_timer;
char dma_buf[DMA_SIZE];
uint64_t dma_mask;
};

初始化 PCI 配置空间

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Class initializer: installs the realize and exit handlers, fills in the
 * PCI identification fields and puts the device in the MISC category.
 */
static void edu_class_init(ObjectClass *class, const void *data)
{
DeviceClass *dc = DEVICE_CLASS(class);
PCIDeviceClass *k = PCI_DEVICE_CLASS(class);

k->realize = pci_edu_realize;
k->exit = pci_edu_uninit;
k->vendor_id = PCI_VENDOR_ID_QEMU;// vendor ID
k->device_id = 0x11e8;// device ID
k->revision = 0x10;
k->class_id = PCI_CLASS_OTHERS; // device class
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
}

此步骤让操作系统能识别设备为 “Edu Device”

映射 MMIO 区域

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * PCI realize handler: sets interrupt pin 1, initializes MSI (returning
 * early if that fails), sets up the DMA timer, starts the joinable "edu"
 * factorial thread and registers a 1 MiB MMIO region as BAR 0.
 */
static void pci_edu_realize(PCIDevice *pdev, Error **errp)
{
EduState *edu = EDU(pdev);
uint8_t *pci_conf = pdev->config;

pci_config_set_interrupt_pin(pci_conf, 1);

if (msi_init(pdev, 0, 1, true, false, errp)) {
return;
}

timer_init_ms(&edu->dma_timer, QEMU_CLOCK_VIRTUAL, edu_dma_timer, edu);

/* The mutex and condition variable must exist before the thread starts. */
qemu_mutex_init(&edu->thr_mutex);
qemu_cond_init(&edu->thr_cond);
qemu_thread_create(&edu->thread, "edu", edu_fact_thread,
edu, QEMU_THREAD_JOINABLE);

memory_region_init_io(&edu->mmio, OBJECT(edu), &edu_mmio_ops, edu,
"edu-mmio", 1 * MiB);// 1 MiB of address space
pci_register_bar(pdev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &edu->mmio);
}

地址布局:

1
2
3
4
0x00 : 设备标识寄存器 (只读,返回 0x010000ed)
0x04 : 存活检测寄存器 (写入 val,读回 ~val)
0x08 : 阶乘寄存器 (写入输入值启动计算,读取得到结果)
0x20 : 状态寄存器 (bit0=计算中,bit7=计算完成后触发中断)
0x80 : DMA 源地址寄存器
0x88 : DMA 目标地址寄存器

实现 MMIO 回调函数

以阶乘计算为例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/*
 * MMIO read handler.
 *
 * Registers below 0x80 accept only 4-byte accesses; those at 0x80 and above
 * accept 4- or 8-byte accesses. A rejected access, or a read from an
 * unknown offset, returns all ones.
 */
static uint64_t edu_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
EduState *edu = opaque;
uint64_t val = ~0ULL;

if (addr < 0x80 && size != 4) {
return val;
}

if (addr >= 0x80 && size != 4 && size != 8) {
return val;
}

switch (addr) {
case 0x00:
/* Fixed identification value. */
val = 0x010000edu;
break;
case 0x04:// liveness-check register: returns the value stored in addr4
val = edu->addr4;
break;
case 0x08:
/* Factorial register, read under the thread mutex. */
qemu_mutex_lock(&edu->thr_mutex);
val = edu->fact;
qemu_mutex_unlock(&edu->thr_mutex);
break;
case 0x20:
/* Status register. */
val = qatomic_read(&edu->status);
break;
case 0x24:
/* Interrupt status register. */
val = edu->irq_status;
break;
case 0x80:
dma_rw(edu, false, &val, &edu->dma.src, false);
break;
case 0x88:
dma_rw(edu, false, &val, &edu->dma.dst, false);
break;
case 0x90:
dma_rw(edu, false, &val, &edu->dma.cnt, false);
break;
case 0x98:
dma_rw(edu, false, &val, &edu->dma.cmd, false);
break;
}

return val;
}


/*
 * MMIO write handler.
 *
 * The access-size rules match the read handler: 4 bytes below 0x80, 4 or 8
 * bytes at 0x80 and above. Rejected accesses and writes to unknown offsets
 * are ignored.
 */
static void edu_mmio_write(void *opaque, hwaddr addr, uint64_t val,
unsigned size)
{
EduState *edu = opaque;

if (addr < 0x80 && size != 4) {
return;
}

if (addr >= 0x80 && size != 4 && size != 8) {
return;
}

switch (addr) {
case 0x04:
/* Liveness check: store the bitwise inverse of the written value. */
edu->addr4 = ~val;
break;
case 0x08:
/* A write while a computation is in progress is dropped. */
if (qatomic_read(&edu->status) & EDU_STATUS_COMPUTING) {
break;
}
/* EDU_STATUS_COMPUTING cannot go 0->1 concurrently, because it is only
* set in this function and it is under the iothread mutex.
*/
qemu_mutex_lock(&edu->thr_mutex);
edu->fact = val;
qatomic_or(&edu->status, EDU_STATUS_COMPUTING);
qemu_cond_signal(&edu->thr_cond);
qemu_mutex_unlock(&edu->thr_mutex);
break;
case 0x20:
/* Only the IRQFACT bit of the status register is writable here. */
if (val & EDU_STATUS_IRQFACT) {
qatomic_or(&edu->status, EDU_STATUS_IRQFACT);
/* Order check of the COMPUTING flag after setting IRQFACT. */
smp_mb__after_rmw();
} else {
qatomic_and(&edu->status, ~EDU_STATUS_IRQFACT);
}
break;
case 0x60:
edu_raise_irq(edu, val);
break;
case 0x64:
edu_lower_irq(edu, val);
break;
case 0x80:
dma_rw(edu, true, &val, &edu->dma.src, false);
break;
case 0x88:
dma_rw(edu, true, &val, &edu->dma.dst, false);
break;
case 0x90:
dma_rw(edu, true, &val, &edu->dma.cnt, false);
break;
case 0x98:
/* A command without the RUN bit is ignored. */
if (!(val & EDU_DMA_RUN)) {
break;
}
/* NOTE(review): the last argument is true only here - presumably it starts the transfer; confirm in dma_rw(). */
dma_rw(edu, true, &val, &edu->dma.cmd, true);
break;
}
}

实现后台计算线程

关键设计:避免阻塞主线程,通过条件变量实现异步计算。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/*
* We purposely use a thread, so that users are forced to wait for the status
* register.
*
* Thread body: sleeps on thr_cond until the COMPUTING flag is set or a stop
* is requested, computes the factorial of edu->fact in 32-bit arithmetic,
* stores the result back into edu->fact, clears COMPUTING, and raises
* FACT_IRQ when IRQFACT is set. Returns NULL once stopping is observed.
*/
static void *edu_fact_thread(void *opaque)
{
EduState *edu = opaque;

while (1) {
uint32_t val, ret = 1;

qemu_mutex_lock(&edu->thr_mutex);
while ((qatomic_read(&edu->status) & EDU_STATUS_COMPUTING) == 0 &&
!edu->stopping) {
qemu_cond_wait(&edu->thr_cond, &edu->thr_mutex);
}

if (edu->stopping) {
qemu_mutex_unlock(&edu->thr_mutex);
break;
}

val = edu->fact;
qemu_mutex_unlock(&edu->thr_mutex);

/* The multiplication runs without the mutex held. */
while (val > 0) {
ret *= val--;
}

/*
* We should sleep for a random period here, so that students are
* forced to check the status properly.
*/

qemu_mutex_lock(&edu->thr_mutex);
edu->fact = ret;
qemu_mutex_unlock(&edu->thr_mutex);
qatomic_and(&edu->status, ~EDU_STATUS_COMPUTING);

/* Clear COMPUTING flag before checking IRQFACT. */
smp_mb__after_rmw();

if (qatomic_read(&edu->status) & EDU_STATUS_IRQFACT) {
bql_lock();
edu_raise_irq(edu, FACT_IRQ);// raise the interrupt
bql_unlock();
}
}

return NULL;
}

中断触发机制

1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Mark the interrupt sources in val as pending and, if any source is now
 * pending, signal the guest through MSI or the legacy INTx line.
 */
static void edu_raise_irq(EduState *edu, uint32_t val)
{
edu->irq_status |= val;
if (edu->irq_status) {
if (edu_msi_enabled(edu)) {// MSI is enabled: send a message-signalled interrupt
msi_notify(&edu->pdev, 0);
} else {// otherwise assert the legacy INTx line
pci_set_irq(&edu->pdev, 1);
}
}
}

DMA 传输实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * DMA timer callback: performs the transfer described by edu->dma if the
 * RUN bit is set, clears RUN, and raises DMA_IRQ when the command requested
 * an interrupt.
 */
static void edu_dma_timer(void *opaque)
{
EduState *edu = opaque;
bool raise_irq = false;

if (!(edu->dma.cmd & EDU_DMA_RUN)) {
return;
}
// Perform the DMA copy
if (EDU_DMA_DIR(edu->dma.cmd) == EDU_DMA_FROM_PCI) {
/* Guest memory -> device buffer; dst is checked against the DMA window and made buffer-relative. */
uint64_t dst = edu->dma.dst;
edu_check_range(dst, edu->dma.cnt, DMA_START, DMA_SIZE);
dst -= DMA_START;
pci_dma_read(&edu->pdev, edu_clamp_addr(edu, edu->dma.src),
edu->dma_buf + dst, edu->dma.cnt);
} else {
/* Device buffer -> guest memory; src is checked against the DMA window and made buffer-relative. */
uint64_t src = edu->dma.src;
edu_check_range(src, edu->dma.cnt, DMA_START, DMA_SIZE);
src -= DMA_START;
pci_dma_write(&edu->pdev, edu_clamp_addr(edu, edu->dma.dst),
edu->dma_buf + src, edu->dma.cnt);
}

edu->dma.cmd &= ~EDU_DMA_RUN;
if (edu->dma.cmd & EDU_DMA_IRQ) {
raise_irq = true;
}

if (raise_irq) {
// Raise the interrupt
edu_raise_irq(edu, DMA_IRQ);
}
}

客户机视角的设备交互

当客户机程序的驱动访问 edu 设备时:

  1. 探测设备:通过 PCI ID 1234:11e8(vendor 0x1234,device 0x11e8)识别
  2. 映射 MMIO:ioremap() 获取寄存器虚拟地址
  3. 计算阶乘:向阶乘寄存器写入操作数,待状态寄存器的 COMPUTING 位清零(或收到 FACT_IRQ 中断)后读回结果
  4. 处理中断:在中断服务例程中清除状态标志

要点总结

edu 浓缩了外设建模的 5 大核心要素:

1
寄存器操作 → 中断通知 → DMA 传输 → 多线程协同 → PCI 规范

主要涉及:

  • 状态机模型:通过寄存器位表示设备状态(如计算中/完成)
  • 异步处理:耗时操作移交后台线程,避免阻塞 VCPU
  • 地址隔离:每个设备拥有独立 MMIO 空间
  • 中断抽象:支持传统 INTx 和现代 MSI 两种模式
  • DMA 安全:地址校验防止虚拟机逃逸

中断建模

PL011 中断在 QEMU 中的全链路处理流程

  • 设备层:PL011 根据硬件状态触发中断信号
  • 中断控制器层:PLIC 实现优先级仲裁和中断分发
  • CPU 层:RISC-V 架构响应外部中断并切换上下文
  • Guest 层:客户机程序或者操作系统中断处理程序完成设备服务

下面我们按照流程进行分析:

PL011 触发中断

1
2
3
4
5
6
7
8
9
10
11
12
13
// hw/char/pl011.c
/*
 * Recompute every PL011 output interrupt line from the current
 * raised (int_level) and enabled (int_enabled) source bits.
 */
static void pl011_update(PL011State *s)
{
    /* Only sources that are both raised and enabled count. */
    const uint32_t active = s->int_level & s->int_enabled;
    int line = 0;

    trace_pl011_irq_state(active != 0);

    while (line < ARRAY_SIZE(s->irq)) {
        /* An output is high iff at least one of its mask bits is active. */
        qemu_set_irq(s->irq[line], (active & irqmask[line]) != 0);
        line++;
    }
}

s->irq 连接到 PLIC 的特定中断线(如 UART0_IRQ=10),PLIC 再将中断路由到 CPU。

PLIC 接收中断

PLIC 的 GPIO 输入处理函数更新中断状态:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// hw/intc/sifive_plic.c
/*
 * Input-line handler: @opaque is the SiFivePLICState, @irq the source
 * number and @level the new line level. Only a raised line (level > 0)
 * has any effect; other levels are ignored by this function.
 */
static void sifive_plic_irq_request(void *opaque, int irq, int level)
{
    SiFivePLICState *plic = opaque;

    if (level <= 0) {
        return;
    }

    /* Mark the source pending, then re-evaluate the outputs. */
    sifive_plic_set_pending(plic, irq, true);
    sifive_plic_update(plic);
}


/*
 * Set or clear the pending bit of source @irq.
 * The pending state is an array of 32-bit words: irq / 32 selects the
 * word and irq % 32 selects the bit within it.
 */
static void sifive_plic_set_pending(SiFivePLICState *plic, int irq, bool level)
{
    const int word = irq >> 5;
    const int bit = 1 << (irq & 31);

    /* -!!level is all-ones when level is true and 0 when it is false. */
    atomic_set_masked(&plic->pending[word], bit, -!!level);
}

PLIC 仲裁逻辑

在 sifive_plic_update() 中实现优先级仲裁:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * Refresh the external-interrupt output of every configured PLIC context.
 * For each addr_config entry the line level is whatever
 * sifive_plic_claimed() reports as non-zero, and it is driven on the
 * M-mode or S-mode line of the entry's hart. Other modes drive nothing.
 */
static void sifive_plic_update(SiFivePLICState *plic)
{
    /* raise irq on harts where this irq is enabled */
    for (int idx = 0; idx < plic->num_addrs; idx++) {
        const uint32_t hartid = plic->addr_config[idx].hartid;
        const PLICMode mode = plic->addr_config[idx].mode;
        const bool level = sifive_plic_claimed(plic, idx) != 0;

        if (mode == PLICMode_M) {
            qemu_set_irq(plic->m_external_irqs[hartid - plic->hartid_base], level);
        } else if (mode == PLICMode_S) {
            qemu_set_irq(plic->s_external_irqs[hartid - plic->hartid_base], level);
        }
    }
}

CPU 中断注入

通过 cpu_interrupt 设置 CPU 中断请求标志:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// system/cpus.c
/*
 * Request the interrupt(s) in @mask on @cpu.
 * The caller must hold the BQL (asserted below); the actual work is
 * delegated to the active accelerator's handle_interrupt hook.
 */
void cpu_interrupt(CPUState *cpu, int mask)
{
g_assert(bql_locked());

cpus_accel->handle_interrupt(cpu, mask);
}
// accel/tcg/tcg-accel-ops.c
/*
 * TCG implementation of the accelerator handle_interrupt hook:
 * record @mask in cpu->interrupt_request and make sure @cpu notices it.
 */
void tcg_handle_interrupt(CPUState *cpu, int mask)
{
    cpu->interrupt_request |= mask;

    if (qemu_cpu_is_self(cpu)) {
        /* Running on the target vCPU's own thread: flag icount_decr. */
        qatomic_set(&cpu->neg.icount_decr.u16.high, -1);
        return;
    }

    /*
     * Called from another thread (e.g. iothread context): wake the
     * target cpu in case it is halted.
     */
    qemu_cpu_kick(cpu);
}

  • 设置 CPU_INTERRUPT_HARD 标志
  • 修改 icount 强制退出当前 TB

CPU 中断检查

在 vCPU 主循环中检测中断请求:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106

/*
 * Check and process pending interrupt/exit requests for @cpu.
 *
 * Returns false immediately when the next TB was requested with CF_NOIRQ.
 * Returns true after setting cpu->exception_index (EXCP_DEBUG, EXCP_HLT,
 * EXCP_INTERRUPT) or after a CPU reset; returns false otherwise.
 * *last_tb is set to NULL whenever the program flow was changed.
 * The BQL is taken while cpu->interrupt_request is examined and is
 * released on every return path of that section.
 */
static inline bool cpu_handle_interrupt(CPUState *cpu,
TranslationBlock **last_tb)
{
/*
* If we have requested custom cflags with CF_NOIRQ we should
* skip checking here. Any pending interrupts will get picked up
* by the next TB we execute under normal cflags.
*/
if (cpu->cflags_next_tb != -1 && cpu->cflags_next_tb & CF_NOIRQ) {
return false;
}

/* Clear the interrupt flag now since we're processing
* cpu->interrupt_request and cpu->exit_request.
* Ensure zeroing happens before reading cpu->exit_request or
* cpu->interrupt_request (see also smp_wmb in cpu_exit())
*/
qatomic_set_mb(&cpu->neg.icount_decr.u16.high, 0);

if (unlikely(qatomic_read(&cpu->interrupt_request))) {
int interrupt_request;
bql_lock();
/* Work on a local copy so that masking below does not alter the CPU state. */
interrupt_request = cpu->interrupt_request;
if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
/* Mask out external interrupts for this step. */
interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
}
/* Debug requests are handled first. */
if (interrupt_request & CPU_INTERRUPT_DEBUG) {
cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG;
cpu->exception_index = EXCP_DEBUG;
bql_unlock();
return true;
}
#if !defined(CONFIG_USER_ONLY)
if (replay_mode == REPLAY_MODE_PLAY && !replay_has_interrupt()) {
/* Do nothing */
} else if (interrupt_request & CPU_INTERRUPT_HALT) {
replay_interrupt();
cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
cpu->halted = 1;
cpu->exception_index = EXCP_HLT;
bql_unlock();
return true;
} else {
const TCGCPUOps *tcg_ops = cpu->cc->tcg_ops;

if (interrupt_request & CPU_INTERRUPT_RESET) {
replay_interrupt();
tcg_ops->cpu_exec_reset(cpu);
bql_unlock();
return true;
}

/*
* The target hook has 3 exit conditions:
* False when the interrupt isn't processed,
* True when it is, and we should restart on a new TB,
* and via longjmp via cpu_loop_exit.
*/
if (tcg_ops->cpu_exec_interrupt(cpu, interrupt_request)) { /* call the target-specific interrupt hook */
if (!tcg_ops->need_replay_interrupt ||
tcg_ops->need_replay_interrupt(interrupt_request)) {
replay_interrupt();
}
/*
* After processing the interrupt, ensure an EXCP_DEBUG is
* raised when single-stepping so that GDB doesn't miss the
* next instruction.
*/
if (unlikely(cpu->singlestep_enabled)) {
cpu->exception_index = EXCP_DEBUG;
bql_unlock();
return true;
}
cpu->exception_index = -1;
*last_tb = NULL;
}
/* The target hook may have updated the 'cpu->interrupt_request';
* reload the 'interrupt_request' value */
interrupt_request = cpu->interrupt_request;
}
#endif /* !CONFIG_USER_ONLY */
if (interrupt_request & CPU_INTERRUPT_EXITTB) {
cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB;
/* ensure that no TB jump will be modified as
the program flow was changed */
*last_tb = NULL;
}

/* If we exit via cpu_loop_exit/longjmp it is reset in cpu_exec */
bql_unlock();
}

/* Finally, check if we need to exit to the main loop. */
if (unlikely(qatomic_read(&cpu->exit_request)) || icount_exit_request(cpu)) {
qatomic_set(&cpu->exit_request, 0);
/* Do not overwrite an exception that is already pending. */
if (cpu->exception_index == -1) {
cpu->exception_index = EXCP_INTERRUPT;
}
return true;
}

return false;
}

RISC-V 中断处理

riscv_cpu_do_interrupt 实现中断上下文切换:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
// target/riscv/cpu_helper.c
/*
 * Simplified illustration of machine-mode external interrupt entry:
 * save the return PC, record the cause, jump to the trap vector and
 * switch to M-mode.
 */
void riscv_cpu_do_interrupt(CPUState *cs)
{
    RISCVCPU *cpu = RISCV_CPU(cs);
    CPURISCVState *env = &cpu->env;
    // Cause code of the interrupt being taken; this simplified sketch
    // only models the machine external interrupt.
    const int cause = IRQ_M_EXT;

    // 1. Save the interrupted PC in mepc
    env->mepc = env->pc;

    // 2. Record the interrupt cause
    env->mcause = MCAUSE_INTR | (cause << MCAUSE_CODE_SHIFT);

    // 3. Jump to the trap vector
    if (env->mtvec & MTVEC_MODE_MASK) {
        // Vectored mode: base + 4 * cause
        env->pc = (env->mtvec & ~MTVEC_MODE_MASK) + 4 * cause;
    } else {
        // Direct mode: every trap enters at base
        env->pc = env->mtvec & ~MTVEC_MODE_MASK;
    }

    // 4. Switch to machine privilege level
    env->priv = PRV_M;
}

Guest OS 中断处理

以 Linux 中断处理流程为例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// arch/riscv/kernel/entry.S
/*
 * Low-level interrupt entry: hands mcause, mepc and the stack pointer
 * to the C handler in a0..a2.
 * NOTE(review): SAVE_ALL / RESTORE_ALL are not shown here; presumably
 * they save and restore the interrupted register context. Confirm.
 */
ENTRY(handle_arch_irq)
SAVE_ALL
csrr a0, mcause /* a0 = trap cause */
csrr a1, mepc /* a1 = interrupted PC */
mv a2, sp /* a2 = current stack pointer */
call riscv_intc_irq /* enter the C-level handler */
RESTORE_ALL
END(handle_arch_irq)

// drivers/irqchip/irq-sifive-plic.c
/*
 * Chained handler for the PLIC: fetch a hardware IRQ number from the
 * PLIC registers, dispatch it through the IRQ domain mapping, then write
 * the number back to signal completion.
 * NOTE(review): `plic` and `domain` are not declared in this snippet;
 * presumably they come from surrounding driver state. Confirm.
 */
static void plic_irq_handler(struct irq_desc *desc)
{
/*
 * NOTE(review): hwirq is read from the PRIORITY_THRESHOLD offset here.
 * In the PLIC register model the pending interrupt ID is normally
 * obtained from the claim register; verify this offset.
 */
u32 hwirq = readl(plic->regs + PRIORITY_THRESHOLD);
struct irq_chip *chip = irq_desc_get_chip(desc);

chained_irq_enter(chip, desc);
generic_handle_irq(irq_find_mapping(domain, hwirq));
chained_irq_exit(chip, desc);

// Signal completion of this interrupt to the PLIC
writel(hwirq, plic->regs + INTERRUPT_COMPLETE);
}

调试与跟踪

开启 PLIC 调试:

1
qemu-system-riscv64 -d int,guest_errors -D plic.log

关键跟踪点 (trace-event)

1
# --trace matches trace *event* names, which do not carry the trace_ prefix
# of the generated C functions.
qemu-system-riscv64 --trace "sifive_plic_*"

相关函数

1
2
3
// hw/intc/sifive_plic.c
trace_sifive_plic_set_irq(irq, level);
trace_sifive_plic_update(cpu_index, max_irq, max_prio);

参考: