时间轴

2025-11-10

init


驱动一般都需要调用module_init函数来向内核注册驱动。

编译进内核的驱动

module_init

定义在include/linux/module.h中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#ifndef MODULE
/**
* module_init() - driver initialization entry point
* @x: function to be run at kernel boot time or module insertion
*
* module_init() will either be called during do_initcalls() (if
* builtin) or at module insertion time (if a module). There can only
* be one per module.
*/
#define module_init(x) __initcall(x);

/**
* module_exit() - driver exit entry point
* @x: function to be run when driver is removed
*
* module_exit() will wrap the driver clean-up code
* with cleanup_module() when used with rmmod when
* the driver is a module. If the driver is statically
* compiled into the kernel, module_exit() has no effect.
* There can only be one per module.
*/
#define module_exit(x) __exitcall(x);

#else /* MODULE */

/*
* In most cases loadable modules do not need custom
* initcall levels. There are still some valid cases where
* a driver may be needed early if built in, and does not
* matter when built as a loadable module. Like bus
* snooping debug drivers.
*/
#define early_initcall(fn) module_init(fn)
#define core_initcall(fn) module_init(fn)
#define core_initcall_sync(fn) module_init(fn)
#define postcore_initcall(fn) module_init(fn)
#define postcore_initcall_sync(fn) module_init(fn)
#define arch_initcall(fn) module_init(fn)
#define subsys_initcall(fn) module_init(fn)
#define subsys_initcall_sync(fn) module_init(fn)
#define fs_initcall(fn) module_init(fn)
#define fs_initcall_sync(fn) module_init(fn)
#define rootfs_initcall(fn) module_init(fn)
#define device_initcall(fn) module_init(fn)
#define device_initcall_sync(fn) module_init(fn)
#define late_initcall(fn) module_init(fn)
#define late_initcall_sync(fn) module_init(fn)

#define console_initcall(fn) module_init(fn)

/* Each module must use one module_init(). */
#define module_init(initfn) \
static inline initcall_t __maybe_unused __inittest(void) \
{ return initfn; } \
int init_module(void) __copy(initfn) __attribute__((alias(#initfn)));

/* This is only required if you want to be unloadable. */
#define module_exit(exitfn) \
static inline exitcall_t __maybe_unused __exittest(void) \
{ return exitfn; } \
void cleanup_module(void) __copy(exitfn) __attribute__((alias(#exitfn)));

#endif

如果把驱动编译进Linux内核,module_exit是没有意义的,因为静态编译的驱动无法卸载

MODULE宏

场景 含义 MODULE 状态
模块(.ko) 使用 obj-m += xxx.o 编译为可加载模块 ✅ 被定义
内建驱动(built-in) 使用 obj-y += xxx.o 编进内核镜像 ❌ 未定义

在linux源码顶层Makefile里

1
2
3
4
KBUILD_AFLAGS_KERNEL :=
KBUILD_CFLAGS_KERNEL :=
KBUILD_AFLAGS_MODULE := -DMODULE
KBUILD_CFLAGS_MODULE := -DMODULE
变量名 作用 是否带 -DMODULE
KBUILD_CFLAGS_KERNEL 编译内建(built-in)代码时使用的 CFLAGS ❌ 否
KBUILD_CFLAGS_MODULE 编译模块(.ko)代码时使用的 CFLAGS ✅ 是
KBUILD_AFLAGS_KERNEL 汇编文件(built-in)使用的 AFLAGS ❌ 否
KBUILD_AFLAGS_MODULE 汇编文件(模块)使用的 AFLAGS ✅ 是

以我们把驱动编译进内核为例:

include/linux/module.h

1
2
// include/linux/module.h
#define module_init(x) __initcall(x);

include/linux/init.h

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
// include/linux/init.h

#define __initcall(fn) device_initcall(fn)

#define pure_initcall(fn) __define_initcall(fn, 0)
#define core_initcall(fn) __define_initcall(fn, 1)
#define core_initcall_sync(fn) __define_initcall(fn, 1s)
#define postcore_initcall(fn) __define_initcall(fn, 2)
#define postcore_initcall_sync(fn) __define_initcall(fn, 2s)
#define arch_initcall(fn) __define_initcall(fn, 3)
#define arch_initcall_sync(fn) __define_initcall(fn, 3s)
#define subsys_initcall(fn) __define_initcall(fn, 4)
#define subsys_initcall_sync(fn) __define_initcall(fn, 4s)
#define fs_initcall(fn) __define_initcall(fn, 5)
#define fs_initcall_sync(fn) __define_initcall(fn, 5s)
#define rootfs_initcall(fn) __define_initcall(fn, rootfs)
#define device_initcall(fn) __define_initcall(fn, 6)
#define device_initcall_sync(fn) __define_initcall(fn, 6s)
#define late_initcall(fn) __define_initcall(fn, 7)
#define late_initcall_sync(fn) __define_initcall(fn, 7s)

#define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id)

typedef int (*initcall_t)(void);

#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
#define ___define_initcall(fn, id, __sec) \
__ADDRESSABLE(fn) \
asm(".section \"" #__sec ".init\", \"a\" \n" \
"__initcall_" #fn #id ": \n" \
".long " #fn " - . \n" \
".previous \n");
#else
#define ___define_initcall(fn, id, __sec) \
static initcall_t __initcall_##fn##id __used \
__attribute__((__section__(#__sec ".init"))) = fn;
#endif

__attribute__((__section__(".initcall6.init"))) 告诉编译器:把这个指针变量放到 .initcall6.init 段中

这个段是内核专门保留的一个“初始化函数指针表”,对应初始化阶段的不同阶段调用顺序。

以fn为my_driver为例:module_init最终展开为

1
__initcall_my_driver_init6 = my_driver_init;

也就是在这个段里存放了一个指向 my_driver_init() 的指针。一个8字节。

如果想要用其他优先级,可以把驱动中的module_init替换为fs_initcall等。

段名 调用阶段
early_initcall(fn) .initcall0.init 最早
core_initcall(fn) .initcall1.init 核心子系统初始化
postcore_initcall(fn) .initcall2.init 核心完成后初始化
arch_initcall(fn) .initcall3.init 与架构相关部分
subsys_initcall(fn) .initcall4.init 子系统(例如 driver core)
fs_initcall(fn) .initcall5.init 文件系统初始化
device_initcall(fn) .initcall6.init 设备驱动初始化(默认)
late_initcall(fn) .initcall7.init 最晚阶段初始化

PREL32

CONFIG_HAVE_ARCH_PREL32_RELOCATIONS 是一个 Kconfig 选项,出现在部分架构(如 ARM64、RISC-V)中。它的含义是:

  • 该架构支持使用 PREL32(PC-relative 32-bit relocation) 类型的重定位方式,也就是说:存储在 .initcall 段中的地址,是 相对当前指针位置(PC-relative) 的 32 位偏移量,而不是绝对地址。

在没有这个选项的系统上(如传统 x86),.initcallX.init 段直接存放函数指针:

1
2
static initcall_t __initcall_driver6 __used
__attribute__((section(".initcall6.init"))) = driver_init;

也就是:

1
2
3
.initcall6.init:
0xffffffff81001234 // 绝对地址
0xffffffff81004567

但是在支持 PREL32 的系统上:

1
2
3
4
asm(".section \".initcall6.init\", \"a\"\n"
"__initcall_driver6:\n"
".long driver_init - .\n"
".previous\n");

这里 .long driver_init - . 表示:把“函数地址减当前地址”的差值(一个 32 位有符号偏移)存入 .initcall 表。

在运行时,内核会这样取值:

1
real_addr = (u64)&entry + (s32)*entry;

这样,内核就能在 不需要重定位修正 的情况下定位函数。这对于 位置无关的内核(KASLR、RELATIVE linking) 特别重要,因为函数的绝对地址在启动时才确定。

优势:

  1. 节省空间:每个 entry 只占 4 字节,而不是 8 字节(在 64 位下节省一半)。
  2. 支持内核地址随机化(KASLR):不需要在启动时对所有指针段做重定位修正。
  3. 启动更快:省去一大堆 relocation fixup 操作。

INIT_CALLS

include/asm-generic.h/vmlinux.lds.h

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#define INIT_CALLS_LEVEL(level)						\
__initcall##level##_start = .; \
KEEP(*(.initcall##level##.init)) \
KEEP(*(.initcall##level##s.init)) \

#define INIT_CALLS \
__initcall_start = .; \
KEEP(*(.initcallearly.init)) \
INIT_CALLS_LEVEL(0) \
INIT_CALLS_LEVEL(1) \
INIT_CALLS_LEVEL(2) \
INIT_CALLS_LEVEL(3) \
INIT_CALLS_LEVEL(4) \
INIT_CALLS_LEVEL(5) \
INIT_CALLS_LEVEL(rootfs) \
INIT_CALLS_LEVEL(6) \
INIT_CALLS_LEVEL(7) \
__initcall_end = .;

最终展开放入连接脚本中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
__initcall_start = .;
KEEP(*(.initcallearly.init))
__initcall0_start = .;
KEEP(*(.initcall0.init))
KEEP(*(.initcall0s.init))
__initcall1_start = .;
KEEP(*(.initcall1.init))
KEEP(*(.initcall1s.init))
...
__initcall7_start = .;
KEEP(*(.initcall7.init))
KEEP(*(.initcall7s.init))
__initcall_end = .;

init/main.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
extern initcall_entry_t __initcall_start[];
extern initcall_entry_t __initcall0_start[];
extern initcall_entry_t __initcall1_start[];
extern initcall_entry_t __initcall2_start[];
extern initcall_entry_t __initcall3_start[];
extern initcall_entry_t __initcall4_start[];
extern initcall_entry_t __initcall5_start[];
extern initcall_entry_t __initcall6_start[];
extern initcall_entry_t __initcall7_start[];
extern initcall_entry_t __initcall_end[];

static initcall_entry_t *initcall_levels[] __initdata = {
__initcall0_start,
__initcall1_start,
__initcall2_start,
__initcall3_start,
__initcall4_start,
__initcall5_start,
__initcall6_start,
__initcall7_start,
__initcall_end,
};



asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
{
//前面省略
/* Do the rest non-__init'ed, we're now alive */
arch_call_rest_init();

prevent_tail_call_optimization();
}

void __init __weak arch_call_rest_init(void)
{
rest_init();
}

noinline void __ref rest_init(void)
{
struct task_struct *tsk;
int pid;

rcu_scheduler_starting();
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
pid = kernel_thread(kernel_init, NULL, CLONE_FS);
// 后面的省略
}

static int __ref kernel_init(void *unused)
{
int ret;

kernel_init_freeable();
// 后面省略

}

//
static noinline void __init kernel_init_freeable(void)
{
// 前面省略

do_basic_setup();

// 后面省略
}

static void __init do_basic_setup(void)
{
cpuset_init_smp();
driver_init();
init_irq_proc();
do_ctors();
usermodehelper_enable();
do_initcalls();
}


static void __init do_initcalls(void)
{
int level;
size_t len = strlen(saved_command_line) + 1;
char *command_line;

command_line = kzalloc(len, GFP_KERNEL);
if (!command_line)
panic("%s: Failed to allocate %zu bytes\n", __func__, len);
// 这个for循环可以看到是从0开始初始化的,因此数字越小优先级越高
// 此外相同id的带s的优先级小于不带s的
for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) {
/* Parser modifies command_line, restore it each time */
strcpy(command_line, saved_command_line);
do_initcall_level(level, command_line);
}

kfree(command_line);
}


static void __init do_initcall_level(int level, char *command_line)
{
initcall_entry_t *fn;

parse_args(initcall_level_names[level],
command_line, __start___param,
__stop___param - __start___param,
level, level,
NULL, ignore_unknown_bootoption);

trace_initcall_level(initcall_level_names[level]);
for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
do_one_initcall(initcall_from_entry(fn));
}

int __init_or_module do_one_initcall(initcall_t fn)
{
int count = preempt_count();
char msgbuf[64];
int ret;

if (initcall_blacklisted(fn))
return -EPERM;

do_trace_initcall_start(fn);
ret = fn();// 核心代码,执行一次这个初始化函数
do_trace_initcall_finish(fn, ret);

msgbuf[0] = 0;

if (preempt_count() != count) {
sprintf(msgbuf, "preemption imbalance ");
preempt_count_set(count);
}
if (irqs_disabled()) {
strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf));
local_irq_enable();
}
WARN(msgbuf[0], "initcall %pS returned with %s\n", fn, msgbuf);

add_latent_entropy();
return ret;
}

其中do_initcall_level中使用的initcall_from_entry定义在include/linux/init.h

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
typedef int initcall_entry_t;

static inline initcall_t initcall_from_entry(initcall_entry_t *entry)
{
return offset_to_ptr(entry);
}
#else
typedef initcall_t initcall_entry_t;

static inline initcall_t initcall_from_entry(initcall_entry_t *entry)
{
return *entry;
}
#endif

总结

在使用moduleinit(hello_world)的时候,hello_world()这个函数会被放在.initcall6.init段处。当内核启动的时候,会执行do_initcall()函数根据指针数组initcall_levels[6]找到__initcall6_start,在include/asm-generic/vmlinux.lds.h中可以查到\_initcall6_start对应的.initcall6.init段的起始地址,然后依次去除这个段的函数指针并执行函数。

内核模块驱动

系统调用

include/linux/syscalls.h

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#define SYSCALL_DEFINE_MAXARGS	6

#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)

/*
* The asmlinkage stub is aliased to a function named __se_sys_*() which
* sign-extends 32-bit ints to longs whenever needed. The actual work is
* done within __do_sys_*().
*/
#ifndef __SYSCALL_DEFINEx
#define __SYSCALL_DEFINEx(x, name, ...) \
__diag_push(); \
__diag_ignore(GCC, 8, "-Wattribute-alias", \
"Type aliasing is used to sanitize syscall arguments");\
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
__attribute__((alias(__stringify(__se_sys##name)))); \
ALLOW_ERROR_INJECTION(sys##name, ERRNO); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
__diag_pop(); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
#endif /* __SYSCALL_DEFINEx */



// ================SYSCALL_DEFINE=====================
#ifndef SYSCALL_DEFINE0
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
asmlinkage long sys_##sname(void); \
ALLOW_ERROR_INJECTION(sys_##sname, ERRNO); \
asmlinkage long sys_##sname(void)
#endif /* SYSCALL_DEFINE0 */

#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)


数字代表参数的个数

添加一个自定义系统调用

helloworld.c

1
2
3
4
5
6
#include <linux/kernel.h>
#include <linux/syscalls.h>

SYSCALL_DEFINE0(helloworld){
printk("hello world syscall\n");
}

可以放在linux源码任意路径下

添加系统调用号

include/uapi/asm-generic/unistd.h中添加

1
2
3
4
5
6
7
8
9
10
11
12
13
14
// ...
#define __NR_openat2 437
__SYSCALL(__NR_openat2, sys_openat2)
#define __NR_pidfd_getfd 438
__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
#define __NR_faccessat2 439
__SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_process_madvise 440
__SYSCALL(__NR_process_madvise, sys_process_madvise)

#define __NR_helloworld 441
__SYSCALL(__NR_helloworld, sys_helloworld)
#undef __NR_syscalls
#define __NR_syscalls 442

测试

1
2
3
4
5
6
7
8
9
#include <sys/stat.h>
#include <stdlib.h>

#define __NR_helloworld 441

int main(int argc, char **argv){
syscall(__NR_helloworld);
return 0;
}

module_init

对于的initcall:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#else /* MODULE */

/*
* In most cases loadable modules do not need custom
* initcall levels. There are still some valid cases where
* a driver may be needed early if built in, and does not
* matter when built as a loadable module. Like bus
* snooping debug drivers.
*/
#define early_initcall(fn) module_init(fn)
#define core_initcall(fn) module_init(fn)
#define core_initcall_sync(fn) module_init(fn)
#define postcore_initcall(fn) module_init(fn)
#define postcore_initcall_sync(fn) module_init(fn)
#define arch_initcall(fn) module_init(fn)
#define subsys_initcall(fn) module_init(fn)
#define subsys_initcall_sync(fn) module_init(fn)
#define fs_initcall(fn) module_init(fn)
#define fs_initcall_sync(fn) module_init(fn)
#define rootfs_initcall(fn) module_init(fn)
#define device_initcall(fn) module_init(fn)
#define device_initcall_sync(fn) module_init(fn)
#define late_initcall(fn) module_init(fn)
#define late_initcall_sync(fn) module_init(fn)

#define console_initcall(fn) module_init(fn)

/* Each module must use one module_init(). */
#define module_init(initfn) \
static inline initcall_t __maybe_unused __inittest(void) \
{ return initfn; } \
int init_module(void) __copy(initfn) __attribute__((alias(#initfn)));

/* This is only required if you want to be unloadable. */
#define module_exit(exitfn) \
static inline exitcall_t __maybe_unused __exittest(void) \
{ return exitfn; } \
void cleanup_module(void) __copy(exitfn) __attribute__((alias(#exitfn)));

#endif

模块的初始化是在 insmod / modprobe 时由模块加载器统一调用 的,不会按内核的启动阶段区分。因此:

1
2
3
#define early_initcall(fn)       module_init(fn)
#define device_initcall(fn) module_init(fn)
...

这些宏都退化成同一个东西:
模块加载时,内核直接调用 init_module(),进而执行 fn()

1
2
3
4
5
6
#define module_init(initfn)                                     \
static inline initcall_t __maybe_unused __inittest(void) { \
return initfn; \
} \
int init_module(void) __copy(initfn) \
__attribute__((alias(#initfn)));

展开后等价于生成两个符号:

  • 一个辅助的 __inittest() 内联函数(只是返回 initfn,不重要);

  • 一个名为 init_module 的函数别名,直接指向我们定义的初始化函数。

例子:

1
2
static int mydriver_init(void) { ... }
module_init(mydriver_init);

预处理后就等价于:

1
int init_module(void) __attribute__((alias("mydriver_init")));

也就是说:内核加载模块时,真正执行的入口函数是 init_module(),它被别名绑定到我们定义的初始化函数 mydriver_init()。

include/uapi/asm-generic/unistd.h

uapi属于 通用内核接口(UAPI, User API),也就是用户空间可以包含的头文件。定义了 大部分架构通用的系统调用号(syscall numbers),供用户程序调用。

1
2
3
4
5
6
7
8
/* kernel/module.c */
#define __NR_init_module 105
__SYSCALL(__NR_init_module, sys_init_module)
#define __NR_delete_module 106
__SYSCALL(__NR_delete_module, sys_delete_module)

#define __NR_finit_module 273
__SYSCALL(__NR_finit_module, sys_finit_module)

include/linux/syscalls.h

1
2
3
4
5
6
7
/* kernel/module.c */
asmlinkage long sys_init_module(void __user *umod, unsigned long len,
const char __user *uargs);
asmlinkage long sys_delete_module(const char __user *name_user,
unsigned int flags);

asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);

asmlinkage 是一个 调用约定修饰符,用于告诉编译器:

系统调用入口函数的所有参数都从栈上传递(而不是寄存器),因为它是从汇编入口(syscall trap)进入的。

kernel/module.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
SYSCALL_DEFINE3(init_module, void __user *, umod,
unsigned long, len, const char __user *, uargs)
{
int err;
struct load_info info = { };

err = may_init_module();
if (err)
return err;

pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
umod, len, uargs);

err = copy_module_from_user(umod, len, &info);
if (err)
return err;

return load_module(&info, uargs, 0);
}

SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
{
struct load_info info = { };
void *hdr = NULL;
int err;

err = may_init_module();
if (err)
return err;

pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);

if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
|MODULE_INIT_IGNORE_VERMAGIC))
return -EINVAL;

err = kernel_read_file_from_fd(fd, 0, &hdr, INT_MAX, NULL,
READING_MODULE);
if (err < 0)
return err;
info.hdr = hdr;
info.len = err;

return load_module(&info, uargs, flags);
}

SYSCALL_DEFINE3 是一个宏,用于生成系统调用封装函数。它生成的函数名就是 sys_init_module。这里的3表示3个参数,SYSCALL_DEFINE最有有6个参数。

最终会展开为__se_sys_init_module。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
static int load_module(struct load_info *info, const char __user *uargs,
int flags)
{
struct module *mod;
long err = 0;
char *after_dashes;

/*
* Do the signature check (if any) first. All that
* the signature check needs is info->len, it does
* not need any of the section info. That can be
* set up later. This will minimize the chances
* of a corrupt module causing problems before
* we even get to the signature check.
*
* The check will also adjust info->len by stripping
* off the sig length at the end of the module, making
* checks against info->len more correct.
*/
err = module_sig_check(info, flags);
if (err)
goto free_copy;

/*
* Do basic sanity checks against the ELF header and
* sections.
*/
err = elf_validity_check(info);
if (err) {
pr_err("Module has invalid ELF structures\n");
goto free_copy;
}

/*
* Everything checks out, so set up the section info
* in the info structure.
*/
err = setup_load_info(info, flags);
if (err)
goto free_copy;

/*
* Now that we know we have the correct module name, check
* if it's blacklisted.
*/
if (blacklisted(info->name)) {
err = -EPERM;
pr_err("Module %s is blacklisted\n", info->name);
goto free_copy;
}

err = rewrite_section_headers(info, flags);
if (err)
goto free_copy;

/* Check module struct version now, before we try to use module. */
if (!check_modstruct_version(info, info->mod)) {
err = -ENOEXEC;
goto free_copy;
}

/* Figure out module layout, and allocate all the memory. */
mod = layout_and_allocate(info, flags);
if (IS_ERR(mod)) {
err = PTR_ERR(mod);
goto free_copy;
}

audit_log_kern_module(mod->name);

/* Reserve our place in the list. */
err = add_unformed_module(mod);
if (err)
goto free_module;

#ifdef CONFIG_MODULE_SIG
mod->sig_ok = info->sig_ok;
if (!mod->sig_ok) {
pr_notice_once("%s: module verification failed: signature "
"and/or required key missing - tainting "
"kernel\n", mod->name);
add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
}
#endif

/* To avoid stressing percpu allocator, do this once we're unique. */
err = percpu_modalloc(mod, info);
if (err)
goto unlink_mod;

/* Now module is in final location, initialize linked lists, etc. */
err = module_unload_init(mod);
if (err)
goto unlink_mod;

init_param_lock(mod);

/* Now we've got everything in the final locations, we can
* find optional sections. */
err = find_module_sections(mod, info);
if (err)
goto free_unload;

err = check_module_license_and_versions(mod);
if (err)
goto free_unload;

/* Set up MODINFO_ATTR fields */
setup_modinfo(mod, info);

/* Fix up syms, so that st_value is a pointer to location. */
err = simplify_symbols(mod, info);
if (err < 0)
goto free_modinfo;

err = apply_relocations(mod, info);
if (err < 0)
goto free_modinfo;

err = post_relocation(mod, info);
if (err < 0)
goto free_modinfo;

flush_module_icache(mod);

/* Now copy in args */
mod->args = strndup_user(uargs, ~0UL >> 1);
if (IS_ERR(mod->args)) {
err = PTR_ERR(mod->args);
goto free_arch_cleanup;
}

dynamic_debug_setup(mod, info->debug, info->num_debug);

/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
ftrace_module_init(mod);

/* Finally it's fully formed, ready to start executing. */
err = complete_formation(mod, info);
if (err)
goto ddebug_cleanup;

err = prepare_coming_module(mod);
if (err)
goto bug_cleanup;

/* Module is ready to execute: parsing args may do that. */
after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-32768, 32767, mod,
unknown_module_param_cb);
if (IS_ERR(after_dashes)) {
err = PTR_ERR(after_dashes);
goto coming_cleanup;
} else if (after_dashes) {
pr_warn("%s: parameters '%s' after `--' ignored\n",
mod->name, after_dashes);
}

/* Link in to sysfs. */
err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
if (err < 0)
goto coming_cleanup;

if (is_livepatch_module(mod)) {
err = copy_module_elf(mod, info);
if (err < 0)
goto sysfs_cleanup;
}

/* Get rid of temporary copy. */
free_copy(info);

/* Done! */
trace_module_load(mod);
//-----> 最终返回do_init_module
return do_init_module(mod);

sysfs_cleanup:
mod_sysfs_teardown(mod);
coming_cleanup:
mod->state = MODULE_STATE_GOING;
destroy_params(mod->kp, mod->num_kp);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
bug_cleanup:
mod->state = MODULE_STATE_GOING;
/* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);

ddebug_cleanup:
ftrace_release_mod(mod);
dynamic_debug_remove(mod, info->debug);
synchronize_rcu();
kfree(mod->args);
free_arch_cleanup:
module_arch_cleanup(mod);
free_modinfo:
free_modinfo(mod);
free_unload:
module_unload_free(mod);
unlink_mod:
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
mod_tree_remove(mod);
wake_up_all(&module_wq);
/* Wait for RCU-sched synchronizing before releasing mod->list. */
synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
/* Free lock-classes; relies on the preceding sync_rcu() */
lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);

module_deallocate(mod, info);
free_copy:
free_copy(info);
return err;
}

最终会返回 do_init_module(mod);

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/*
* This is where the real work happens.
*
* Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
* helper command 'lx-symbols'.
*/
static noinline int do_init_module(struct module *mod)
{
int ret = 0;
struct mod_initfree *freeinit;

freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
if (!freeinit) {
ret = -ENOMEM;
goto fail;
}
freeinit->module_init = mod->init_layout.base;

do_mod_ctors(mod);
/* Start the module */
//-------->这里mod->init就是模块入口函数,然后调用do_one_initcall
if (mod->init != NULL)
ret = do_one_initcall(mod->init);
if (ret < 0) {
goto fail_free_freeinit;
}
if (ret > 0) {
pr_warn("%s: '%s'->init suspiciously returned %d, it should "
"follow 0/-E convention\n"
"%s: loading module anyway...\n",
__func__, mod->name, ret, __func__);
dump_stack();
}

/* Now it's a first class citizen! */
mod->state = MODULE_STATE_LIVE;
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_LIVE, mod);

/* Delay uevent until module has finished its init routine */
kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);

/*
* We need to finish all async code before the module init sequence
* is done. This has potential to deadlock if synchronous module
* loading is requested from async (which is not allowed!).
*
* See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
* request_module() from async workers") for more details.
*/
if (!mod->async_probe_requested)
async_synchronize_full();

ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
mod->init_layout.size);
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
trim_init_extable(mod);
#ifdef CONFIG_KALLSYMS
/* Switch to core kallsyms now init is done: kallsyms may be walking! */
rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
module_enable_ro(mod, true);
mod_tree_remove_init(mod);
module_arch_freeing_init(mod);
mod->init_layout.base = NULL;
mod->init_layout.size = 0;
mod->init_layout.ro_size = 0;
mod->init_layout.ro_after_init_size = 0;
mod->init_layout.text_size = 0;
/*
* We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we
* call synchronize_rcu(), but we don't want to slow down the success
* path. module_memfree() cannot be called in an interrupt, so do the
* work and call synchronize_rcu() in a work queue.
*
* Note that module_alloc() on most architectures creates W+X page
* mappings which won't be cleaned up until do_free_init() runs. Any
* code such as mark_rodata_ro() which depends on those mappings to
* be cleaned up needs to sync with the queued work - ie
* rcu_barrier()
*/
if (llist_add(&freeinit->node, &init_free_list))
schedule_work(&init_free_wq);

mutex_unlock(&module_mutex);
wake_up_all(&module_wq);

return 0;

fail_free_freeinit:
kfree(freeinit);
fail:
/* Try to protect us from buggy refcounters. */
mod->state = MODULE_STATE_GOING;
synchronize_rcu();
module_put(mod);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
ftrace_release_mod(mod);
free_module(mod);
wake_up_all(&module_wq);
return ret;
}

调用do_one_initcall(mod->init);

struct module

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
struct module {
enum module_state state;

/* Member of list of modules */
struct list_head list;

/* Unique handle for this module */
char name[MODULE_NAME_LEN];

/* Sysfs stuff. */
struct module_kobject mkobj;
struct module_attribute *modinfo_attrs;
const char *version;
const char *srcversion;
struct kobject *holders_dir;

/* Exported symbols */
const struct kernel_symbol *syms;
const s32 *crcs;
unsigned int num_syms;

/* Kernel parameters. */
#ifdef CONFIG_SYSFS
struct mutex param_lock;
#endif
struct kernel_param *kp;
unsigned int num_kp;

/* GPL-only exported symbols. */
unsigned int num_gpl_syms;
const struct kernel_symbol *gpl_syms;
const s32 *gpl_crcs;
bool using_gplonly_symbols;

#ifdef CONFIG_UNUSED_SYMBOLS
/* unused exported symbols. */
const struct kernel_symbol *unused_syms;
const s32 *unused_crcs;
unsigned int num_unused_syms;

/* GPL-only, unused exported symbols. */
unsigned int num_unused_gpl_syms;
const struct kernel_symbol *unused_gpl_syms;
const s32 *unused_gpl_crcs;
#endif

#ifdef CONFIG_MODULE_SIG
/* Signature was verified. */
bool sig_ok;
#endif

bool async_probe_requested;

/* symbols that will be GPL-only in the near future. */
const struct kernel_symbol *gpl_future_syms;
const s32 *gpl_future_crcs;
unsigned int num_gpl_future_syms;

/* Exception table */
unsigned int num_exentries;
struct exception_table_entry *extable;

/* Startup function. */
int (*init)(void);

/* Core layout: rbtree is accessed frequently, so keep together. */
struct module_layout core_layout __module_layout_align;
struct module_layout init_layout;

/* Arch-specific module values */
struct mod_arch_specific arch;

unsigned long taints; /* same bits as kernel:taint_flags */

#ifdef CONFIG_GENERIC_BUG
/* Support for BUG */
unsigned num_bugs;
struct list_head bug_list;
struct bug_entry *bug_table;
#endif

#ifdef CONFIG_KALLSYMS
/* Protected by RCU and/or module_mutex: use rcu_dereference() */
struct mod_kallsyms __rcu *kallsyms;
struct mod_kallsyms core_kallsyms;

/* Section attributes */
struct module_sect_attrs *sect_attrs;

/* Notes attributes */
struct module_notes_attrs *notes_attrs;
#endif

/* The command line arguments (may be mangled). People like
keeping pointers to this stuff */
char *args;

#ifdef CONFIG_SMP
/* Per-cpu data. */
void __percpu *percpu;
unsigned int percpu_size;
#endif
void *noinstr_text_start;
unsigned int noinstr_text_size;

#ifdef CONFIG_TRACEPOINTS
unsigned int num_tracepoints;
tracepoint_ptr_t *tracepoints_ptrs;
#endif
#ifdef CONFIG_TREE_SRCU
unsigned int num_srcu_structs;
struct srcu_struct **srcu_struct_ptrs;
#endif
#ifdef CONFIG_BPF_EVENTS
unsigned int num_bpf_raw_events;
struct bpf_raw_event_map *bpf_raw_events;
#endif
#ifdef CONFIG_JUMP_LABEL
struct jump_entry *jump_entries;
unsigned int num_jump_entries;
#endif
#ifdef CONFIG_TRACING
unsigned int num_trace_bprintk_fmt;
const char **trace_bprintk_fmt_start;
#endif
#ifdef CONFIG_EVENT_TRACING
struct trace_event_call **trace_events;
unsigned int num_trace_events;
struct trace_eval_map **trace_evals;
unsigned int num_trace_evals;
#endif
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
unsigned int num_ftrace_callsites;
unsigned long *ftrace_callsites;
#endif
#ifdef CONFIG_KPROBES
void *kprobes_text_start;
unsigned int kprobes_text_size;
unsigned long *kprobe_blacklist;
unsigned int num_kprobe_blacklist;
#endif
#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
int num_static_call_sites;
struct static_call_site *static_call_sites;
#endif

#ifdef CONFIG_LIVEPATCH
bool klp; /* Is this a livepatch module? */
bool klp_alive;

/* Elf information */
struct klp_modinfo *klp_info;
#endif

#ifdef CONFIG_MODULE_UNLOAD
/* What modules depend on me? */
struct list_head source_list;
/* What modules do I depend on? */
struct list_head target_list;

/* Destruction function. */
void (*exit)(void);

atomic_t refcnt;
#endif

#ifdef CONFIG_MITIGATION_ITS
int its_num_pages;
void **its_page_array;
#endif

#ifdef CONFIG_CONSTRUCTORS
/* Constructor functions. */
ctor_fn_t *ctors;
unsigned int num_ctors;
#endif

#ifdef CONFIG_FUNCTION_ERROR_INJECTION
struct error_injection_entry *ei_funcs;
unsigned int num_ei_funcs;
#endif
} ____cacheline_aligned __randomize_layout;

内核是如何运行ko文件的?

insmod命令

busybox1.37.0/modutils/insmod.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/* vi: set sw=4 ts=4: */
/*
* Mini insmod implementation for busybox
*
* Copyright (C) 2008 Timo Teras <timo.teras@iki.fi>
*
* Licensed under GPLv2 or later, see file LICENSE in this source tree.
*/
//config:config INSMOD
//config: bool "insmod (22 kb)"
//config: default y
//config: help
//config: insmod is used to load specified modules in the running kernel.

//applet:IF_INSMOD(IF_NOT_MODPROBE_SMALL(APPLET_NOEXEC(insmod, insmod, BB_DIR_SBIN, BB_SUID_DROP, insmod)))

//kbuild:ifneq ($(CONFIG_MODPROBE_SMALL),y)
//kbuild:lib-$(CONFIG_INSMOD) += insmod.o modutils.o
//kbuild:endif

#include "libbb.h"
#include "modutils.h"

/* 2.6 style insmod has no options and required filename
* (not module name - .ko can't be omitted) */

//usage:#if !ENABLE_MODPROBE_SMALL
//usage:#define insmod_trivial_usage
//usage: IF_FEATURE_2_4_MODULES("[-fkvqLx] MODULE")
//usage: IF_NOT_FEATURE_2_4_MODULES("FILE")
//usage: IF_FEATURE_CMDLINE_MODULE_OPTIONS(" [SYMBOL=VALUE]...")
//usage:#define insmod_full_usage "\n\n"
//usage: "Load kernel module"
//usage: IF_FEATURE_2_4_MODULES( "\n"
//usage: "\n -f Force module to load into the wrong kernel version"
//usage: "\n -k Make module autoclean-able"
//usage: "\n -v Verbose"
//usage: "\n -q Quiet"
//usage: "\n -L Lock: prevent simultaneous loads"
//usage: IF_FEATURE_INSMOD_LOAD_MAP(
//usage: "\n -m Output load map to stdout"
//usage: )
//usage: "\n -x Don't export externs"
//usage: )
//usage:#endif

int insmod_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
int insmod_main(int argc UNUSED_PARAM, char **argv)
{
char *filename;
int rc;

/* Compat note:
* 2.6 style insmod has no options and required filename
* (not module name - .ko can't be omitted).
* 2.4 style insmod can take module name without .o
* and performs module search in default directories
* or in $MODPATH.
*/

IF_FEATURE_2_4_MODULES(
getopt32(argv, INSMOD_OPTS INSMOD_ARGS);
argv += optind - 1;
);

filename = *++argv;
if (!filename)
bb_show_usage();

rc = bb_init_module(filename, parse_cmdline_module_options(argv, /*quote_spaces:*/ 0));
if (rc)
bb_error_msg("can't insert '%s': %s", filename, moderror(rc));

return rc;
}

调用了bb_init_module

busybox1.37.0/modutils/modutils.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include <sys/syscall.h>

#define init_module(mod, len, opts) syscall(__NR_init_module, mod, len, opts)
#if defined(__NR_finit_module)
#define finit_module(fd, uargs, flags) syscall(__NR_finit_module, fd, uargs, flags)
#ifndef MODULE_INIT_COMPRESSED_FILE
#define MODULE_INIT_COMPRESSED_FILE 4
#endif
#endif

#define delete_module(mod, flags) syscall(__NR_delete_module, mod, flags)

int FAST_FUNC bb_init_module(const char *filename, const char *options)
{
size_t image_size;
char *image;
int rc;
bool mmaped;

if (!options)
options = "";

//TODO: audit bb_init_module_24 to match error code convention
#if ENABLE_FEATURE_2_4_MODULES
if (get_linux_version_code() < KERNEL_VERSION(2,6,0))
return bb_init_module_24(filename, options);
#endif

/*
* First we try finit_module if available. Some kernels are configured
* to only allow loading of modules off of secure storage (like a read-
* only rootfs) which needs the finit_module call. If it fails, we fall
* back to normal module loading to support compressed modules.
*/
# ifdef __NR_finit_module
{
// 方法1:通过句柄打开文件
int fd = open(filename, O_RDONLY | O_CLOEXEC);
if (fd >= 0) {
int flags = is_suffixed_with(filename, ".ko") ? 0 : MODULE_INIT_COMPRESSED_FILE;
for (;;) {
// 然后调用finit_module
rc = finit_module(fd, options, flags);
if (rc == 0 || flags == 0)
break;
/* Loading non-.ko named uncompressed module? Not likely, but let's try it */
flags = 0;
}
close(fd);
if (rc == 0)
return rc;
}
}
# endif

image_size = INT_MAX - 4095;
mmaped = 0;
// 方法2:将ko文件映射到内存
image = try_to_mmap_module(filename, &image_size);
if (image) {
mmaped = 1;
} else {
errno = ENOMEM; /* may be changed by e.g. open errors below */
// 如果映射失败,尝试将ko文件malloc到内存
image = xmalloc_open_zipped_read_close(filename, &image_size);
if (!image)
return -errno;
}

errno = 0;
// 最后调用到init_module
init_module(image, image_size, options);
rc = errno;
if (mmaped)
munmap(image, image_size);
else
free(image);
return rc;
}

最后都会通过系统调用调用到__NRinit_module 或 \_NR_finit_module