在 xv6 系统中,fork() 系统调用会复制父进程的所有用户空间内存给子进程(正如我们在 pgtbl 实验中看到过的那样)。可是,如果父进程很大,复制过程会消耗很长的时间。更糟糕的是,这可能是无用功,例如 fork() 后子进程紧接着 exec(),那么刚复制下来的内存根本不会用到。另一方面,如果父子进程都要用到这块内存,那么复制又是必须的。

写时复制(Copy-on-Write)是一种计算机程序设计领域的优化策略。它的核心思想是,如果有多个呼叫者同时请求相同资源(如内存或磁盘上的数据存储),他们会共享相同的指针指向该资源,直到某个呼叫者试图修改资源内容时,系统才会真正复制一份专用副本给该呼叫者使用。这种技术通常用于文件系统、进程管理和虚拟内存等方面。

COW fork() 只给子进程创建一个页表,其 PTE 指向父进程的物理页,然后给父子进程的 PTE 全部打上不可写的标记。于是,当父子进程之一试图写这些页面时会产生 page fault,内核的 page fault 处理程序检测到这种情况后,给产生异常的那个进程分配一个新的页面,把原页面复制过去,修改 PTE 并打上可写的标记。处理程序返回之后,进程就能继续正常执行了。
COW fork() 让释放物理页面变得很 tricky,因为一个物理页面可能被多个进程的页表同时指向,因此只当最后一个指针撤去后才能释放。

Implement copy-on-write

RISC-V 的 PTE 有 10 个标志位,其中第 8、9 位是为用户保留的,因此我用第 8 位作为 PTE_COW 标志,表示该 PTE 是否需要 copy-on-write:

1
2
// kernel/riscv.h
#define PTE_COW (1L << 8) // copy-on-write

我们查看 fork() 的代码,发现页表复制是由 uvmcopy() 实现的,因此我们需要改的地方其实是 uvmcopy()。(我顺便查了一下还有没有其他函数调用了 uvmcopy(),结果惊奇地发现只有 fork() 调用了它)我们将开辟内存并复制内容的代码注释掉,改为直接向原空间建立映射,并把 PTE_W 置零,把 PTE_COW 置一

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// kernel/proc.c
// Create a new process, copying the parent.
// Sets up child kernel stack to return as if from fork() system call.
int
fork(void)
{
int i, pid;
struct proc *np;
struct proc *p = myproc();

// Allocate process.
if((np = allocproc()) == 0){
return -1;
}

// Copy user memory from parent to child.
if(uvmcopy(p->pagetable, np->pagetable, p->sz) < 0){
freeproc(np);
release(&np->lock);
return -1;
}
np->sz = p->sz;

np->parent = p;

// copy saved user registers.
*(np->trapframe) = *(p->trapframe);

// Cause fork to return 0 in the child.
np->trapframe->a0 = 0;

// increment reference counts on open file descriptors.
for(i = 0; i < NOFILE; i++)
if(p->ofile[i])
np->ofile[i] = filedup(p->ofile[i]);
np->cwd = idup(p->cwd);

safestrcpy(np->name, p->name, sizeof(p->name));

pid = np->pid;

np->state = RUNNABLE;

release(&np->lock);

return pid;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
// kernel/vm.c
int
uvmcopy(pagetable_t old, pagetable_t new, uint64 sz)
{
pte_t *pte;
uint64 pa, i;
uint flags;
//char *mem;

for(i = 0; i < sz; i += PGSIZE){
if((pte = walk(old, i, 0)) == 0)
panic("uvmcopy: pte should exist");
if((*pte & PTE_V) == 0)
panic("uvmcopy: page not present");
pa = PTE2PA(*pte);
flags = PTE_FLAGS(*pte);
// xyf
flags = (flags | PTE_COW) & (~PTE_W);
*pte = PA2PTE(pa) | flags;
if(mappages(new, i, PGSIZE, pa, flags) != 0)
goto err;
update_refcount(pa, 1);
/*
if((mem = kalloc()) == 0)
goto err;
memmove(mem, (char*)pa, PGSIZE);
if(mappages(new, i, PGSIZE, (uint64)mem, flags) != 0){
kfree(mem);
goto err;
}*/
}
return 0;

err:
uvmunmap(new, 0, i / PGSIZE, 1);
return -1;
}

update_refcount(pa, 1) 的作用后文叙述。

现在,对于某个带有 PTE_COW 标记的 PTE 指向的页面,我们写它时会引起 page fault——因为它的 PTE_W 被置零了。和上一个实验(lazy allocation)一样,这个 page fault 在 usertrap() 中处理。当写页面发生异常时,scause 寄存器的值会被置为 15,stval 寄存器会存储导致异常的地址,所以我们根据这两个寄存器处理 copy-on-write:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
// kernel/trap.c
void
usertrap(void)
{
int which_dev = 0;

if((r_sstatus() & SSTATUS_SPP) != 0)
panic("usertrap: not from user mode");

// send interrupts and exceptions to kerneltrap(),
// since we're now in the kernel.
w_stvec((uint64)kernelvec);

struct proc *p = myproc();

// save user program counter.
p->trapframe->epc = r_sepc();

if(r_scause() == 8){
// system call

if(p->killed)
exit(-1);

// sepc points to the ecall instruction,
// but we want to return to the next instruction.
p->trapframe->epc += 4;

// an interrupt will change sstatus &c registers,
// so don't enable until done with those registers.
intr_on();

syscall();
} else if((which_dev = devintr()) != 0){
// ok
} else {
// check cow
uint64 cause = r_scause();
if(cause == 15){
// page fault
uint64 stval = r_stval();
if(handle_cow(p->pagetable, stval, 0) == 0)
goto brk; // copy-on-write success
}

printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
printf(" sepc=%p stval=%p\n", r_sepc(), r_stval());
p->killed = 1;
}

brk:
if(p->killed)
exit(-1);

// give up the CPU if this is a timer interrupt.
if(which_dev == 2)
yield();

usertrapret();
}

其中 handle_cow() 函数处理 copy-on-write:它先解析虚拟地址,如果发现其 PTE 的 PTE_COW 被置位了,则开辟新的空间,并将 PTE 指向这个新的空间,同时把 PTE_W 置一、PTE_COW 置零。这样,当我们返回用户空间时,用户进程就能正常执行了:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// kernel/vm.c
// Check if virtual address refers to a cow page and handle it.
// Store the physical address of the newly allocated page in parameter newpa.
// Return: -1 something wrong
// 0 copy-on-write success
// 1 no need to copy-on-write
int
handle_cow(pagetable_t pagetable, uint64 va, uint64 *newpa)
{
pte_t *pte = walk(pagetable, va, 0);
if(pte == 0) return -1;
if(*pte & PTE_COW){
// copy-on-write
uint flags = (PTE_FLAGS(*pte) & (~PTE_COW)) | PTE_W;
uint64 pa = PTE2PA(*pte);
char *mem = kalloc();
if(mem == 0) return -1;
memmove(mem, (char *)pa, PGSIZE);
*pte = PA2PTE((uint64)mem) | flags;
kfree((void *)pa);
if(newpa) *newpa = (uint64)mem;
return 0;
}
return 1;
}

为了方便,我传入了一个指针以保存新的物理地址,待会儿会用到。
同时在kernel/defs.h声明

copy-on-write 之后,原来的页面怎么办呢?在上面的代码中,我调用了 kfree 试图将其释放。但正如前文所述,只有当所有进程都不用一个页面时才能将其释放,所以我们需要开一个计数数组,对每一个页面统计有多少个进程指向了它。那一共有多少个页面呢?之前做第三个实验(page tables)的时候我们学习了,xv6(内核进程会用到)的内核空间是从 KERNBASE 到 PHYSTOP 的一段,所以一共有 (PHYSTOP-KERNBASE)/PGSIZE 页,并且物理地址 pa 在第 (pa-KERNBASE)/PGSIZE 页中,我们以此作为计数数组下标即可:

1
2
3
// kernel/kalloc.c
#define idx(x) (((uint64)(x)-KERNBASE)/PGSIZE)
int refcount[idx(PHYSTOP)];

每次 kalloc() 时,设置当前计数为 1;kfree 时,计数自减,如果减到了零,才释放内存:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
// kernel/kalloc.c
void
kfree(void *pa)
{
...
if(((uint64)pa % PGSIZE) != 0 || (char*)pa < end || (uint64)pa >= PHYSTOP)
panic("kfree");
// Free only when the last reference goes away.
if((uint64)pa >= KERNBASE){
if(refcount[idx(pa)] <= 0)
panic("kfree: refcount");
if(--refcount[idx(pa)]) return;
}
...
}

void *
kalloc(void)
{
...
if(r)
refcount[idx(r)] = 1;
...
}

值得注意的是,kinit() 初始化是调用 kfree() 实现的,所以为了把计数数组初始化为零,可以在 freerange 中把所有页的计数置为 1,这样调用一次 kfree(),就能把计数置零、加入 kmem.freelist:

1
2
3
4
5
6
7
8
9
10
11
// kernel/kalloc.c
void
freerange(void *pa_start, void *pa_end)
{
char *p;
p = (char*)PGROUNDUP((uint64)pa_start);
for(; p + PGSIZE <= (char*)pa_end; p += PGSIZE){
refcount[idx(p)] = 1;
kfree(p);
}
}

为了让 kalloc.c 之外的函数方便地对 refcount 操作,我写了一个更新函数:

1
2
3
4
5
6
7
8
9
10
11
// kernel/kalloc.c
// Update refcount
void
update_refcount(uint64 pa, int val)
{
if((pa % PGSIZE) != 0 || pa < KERNBASE || pa >= PHYSTOP)
panic("update_refcount");
refcount[idx(pa)] += val;
if(refcount[idx(pa)] < 0)
panic("update_refcount: less than 0");
}

上文 uvmcopy 就调用了它使得计数加一。
在kernel/defs.h添加声明

最后,我们还需要改 copyout,也用 handle_cow 函数即可,这里我设置的第三个参数就发挥了作用,当然不设置这个参数,之后 walkaddr 一下也行。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
// kernel/kalloc.c
int
copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len)
{
...
while(len > 0){
va0 = PGROUNDDOWN(dstva);
pa0 = walkaddr(pagetable, va0);
if(pa0 == 0)
return -1;
if(handle_cow(pagetable, va0, &pa0) == -1)
return -1;
...
}

make grade

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
== Test running cowtest == 
$ make qemu-gdb
(26.1s)
== Test simple ==
simple: OK
== Test three ==
three: OK
== Test file ==
file: OK
== Test usertests ==
$ make qemu-gdb
(364.4s)
(Old xv6.out.usertests failure log removed)
== Test usertests: copyin ==
usertests: copyin: OK
== Test usertests: copyout ==
usertests: copyout: OK
== Test usertests: all tests ==
usertests: all tests: OK
== Test time ==
time: OK
Score: 110/110

git diff

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
diff --git a/grade-lab-cow b/grade-lab-cow
index 8eff3b7..0d19e6d 100755
--- a/grade-lab-cow
+++ b/grade-lab-cow
@@ -29,7 +29,7 @@ def test_file():
def test_usertests():
r.run_qemu(shell_script([
'usertests'
- ]), timeout=300)
+ ]), timeout=1000)
r.match('^ALL TESTS PASSED$')

def usertest_check(testcase, nextcase, output):
diff --git a/kernel/defs.h b/kernel/defs.h
index 4b9bbc0..4e8e7bc 100644
--- a/kernel/defs.h
+++ b/kernel/defs.h
@@ -63,6 +63,7 @@ void ramdiskrw(struct buf*);
void* kalloc(void);
void kfree(void *);
void kinit(void);
+void update_refcount(uint64, int);

// log.c
void initlog(int, struct superblock*);
@@ -171,6 +172,7 @@ uint64 walkaddr(pagetable_t, uint64);
int copyout(pagetable_t, uint64, char *, uint64);
int copyin(pagetable_t, char *, uint64, uint64);
int copyinstr(pagetable_t, char *, uint64, uint64);
+int handle_cow(pagetable_t, uint64, uint64 *);

// plic.c
void plicinit(void);
diff --git a/kernel/kalloc.c b/kernel/kalloc.c
index fa6a0ac..8ecec14 100644
--- a/kernel/kalloc.c
+++ b/kernel/kalloc.c
@@ -23,6 +23,9 @@ struct {
struct run *freelist;
} kmem;

+#define idx(x) (((uint64)(x)-KERNBASE)/PGSIZE)
+int refcount[idx(PHYSTOP)];
+
void
kinit()
{
@@ -35,8 +38,10 @@ freerange(void *pa_start, void *pa_end)
{
char *p;
p = (char*)PGROUNDUP((uint64)pa_start);
- for(; p + PGSIZE <= (char*)pa_end; p += PGSIZE)
+ for(; p + PGSIZE <= (char*)pa_end; p += PGSIZE){
+ refcount[idx(p)] = 1;
kfree(p);
+ }
}

// Free the page of physical memory pointed at by v,
@@ -50,7 +55,14 @@ kfree(void *pa)

if(((uint64)pa % PGSIZE) != 0 || (char*)pa < end || (uint64)pa >= PHYSTOP)
panic("kfree");
-
+
+ // Free only when the last reference goes away.
+ if((uint64)pa >= KERNBASE){
+ if(refcount[idx(pa)] <= 0)
+ panic("kfree: refcount");
+ if(--refcount[idx(pa)]) return;
+ }
+
// Fill with junk to catch dangling refs.
memset(pa, 1, PGSIZE);

@@ -78,5 +90,20 @@ kalloc(void)

if(r)
memset((char*)r, 5, PGSIZE); // fill with junk
+
+ if(r)
+ refcount[idx(r)] = 1;
+
return (void*)r;
}
+
+// Update refcount
+void
+update_refcount(uint64 pa, int val)
+{
+ if((pa % PGSIZE) != 0 || pa < KERNBASE || pa >= PHYSTOP)
+ panic("update_refcount");
+ refcount[idx(pa)] += val;
+ if(refcount[idx(pa)] < 0)
+ panic("update_refcount: less than 0");
+}
diff --git a/kernel/riscv.h b/kernel/riscv.h
index 0aec003..8a0ca5f 100644
--- a/kernel/riscv.h
+++ b/kernel/riscv.h
@@ -331,6 +331,7 @@ sfence_vma()
#define PTE_W (1L << 2)
#define PTE_X (1L << 3)
#define PTE_U (1L << 4) // 1 -> user can access
+#define PTE_COW (1L << 8) // used in copy-on-write

// shift a physical address to the right place for a PTE.
#define PA2PTE(pa) ((((uint64)pa) >> 12) << 10)
diff --git a/kernel/trap.c b/kernel/trap.c
index a63249e..b0bf1bb 100644
--- a/kernel/trap.c
+++ b/kernel/trap.c
@@ -68,11 +68,21 @@ usertrap(void)
} else if((which_dev = devintr()) != 0){
// ok
} else {
+ // check cow
+ uint64 cause = r_scause();
+ if(cause == 15){
+ // page fault
+ uint64 stval = r_stval();
+ if(handle_cow(p->pagetable, stval, 0) == 0)
+ goto brk;
+ }
+
printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
printf(" sepc=%p stval=%p\n", r_sepc(), r_stval());
p->killed = 1;
}

+brk:
if(p->killed)
exit(-1);

diff --git a/kernel/vm.c b/kernel/vm.c
index bccb405..88c444d 100644
--- a/kernel/vm.c
+++ b/kernel/vm.c
@@ -311,7 +311,7 @@ uvmcopy(pagetable_t old, pagetable_t new, uint64 sz)
pte_t *pte;
uint64 pa, i;
uint flags;
- char *mem;
+ //char *mem;

for(i = 0; i < sz; i += PGSIZE){
if((pte = walk(old, i, 0)) == 0)
@@ -320,6 +320,13 @@ uvmcopy(pagetable_t old, pagetable_t new, uint64 sz)
panic("uvmcopy: page not present");
pa = PTE2PA(*pte);
flags = PTE_FLAGS(*pte);
+
+ flags = (flags | PTE_COW) & (~PTE_W);
+ *pte = PA2PTE(pa) | flags;
+ if(mappages(new, i, PGSIZE, pa, flags) != 0)
+ goto err;
+ update_refcount(pa, 1);
+/*
if((mem = kalloc()) == 0)
goto err;
memmove(mem, (char*)pa, PGSIZE);
@@ -327,6 +334,7 @@ uvmcopy(pagetable_t old, pagetable_t new, uint64 sz)
kfree(mem);
goto err;
}
+*/
}
return 0;

@@ -361,6 +369,10 @@ copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len)
pa0 = walkaddr(pagetable, va0);
if(pa0 == 0)
return -1;
+
+ if(handle_cow(pagetable, va0, &pa0) == -1)
+ return -1;
+
n = PGSIZE - (dstva - va0);
if(n > len)
n = len;
@@ -440,3 +452,28 @@ copyinstr(pagetable_t pagetable, char *dst, uint64 srcva, uint64 max)
return -1;
}
}
+
+// Check if virtual address refers to a cow page and handle it.
+// Store the physical address of the newly allocated page in parameter newpa.
+// Return: -1 something wrong
+// 0 copy-on-write success
+// 1 no need to copy-on-write
+int
+handle_cow(pagetable_t pagetable, uint64 va, uint64 *newpa)
+{
+ pte_t *pte = walk(pagetable, va, 0);
+ if(pte == 0) return -1;
+ if(*pte & PTE_COW){
+ // copy-on-write
+ uint flags = (PTE_FLAGS(*pte) & (~PTE_COW)) | PTE_W;
+ uint64 pa = PTE2PA(*pte);
+ char *mem = kalloc();
+ if(mem == 0) return -1;
+ memmove(mem, (char *)pa, PGSIZE);
+ *pte = PA2PTE((uint64)mem) | flags;
+ kfree((void *)pa);
+ if(newpa) *newpa = (uint64)mem;
+ return 0;
+ }
+ return 1;
+}
diff --git a/time.txt b/time.txt
new file mode 100644
index 0000000..f599e28
--- /dev/null
+++ b/time.txt
@@ -0,0 +1 @@
+10