/*
 * kernel/power/tuxonice_incremental.c
 *
 * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
 *
 * This file is released under the GPLv2.
 *
 * This file contains routines related to storing incremental images - that
 * is, retaining an image after an initial cycle and then storing incremental
 * changes on subsequent hibernations.
 *
 * Based in part on...
 *
 * Debug helper to dump the current kernel pagetables of the system
 * so that we can see what the various memory ranges are set to.
 *
 * (C) Copyright 2008 Intel Corporation
 *
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#include <linux/mm.h>
#include <linux/tuxonice.h>
#include <linux/sched.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
#include "tuxonice_pageflags.h"
#include "tuxonice_builtin.h"
#include "power.h"

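/* Set via the toi_incremental_initcall= boot parameter (parsed below); non-zero
 * enables dirty page tracking from an early initcall. */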
int toi_do_incremental_initcall;

extern void kdb_init(int level);
extern noinline void kgdb_breakpoint(void);

#undef pr_debug
#if 0
#define pr_debug(a, b...) do { printk(a, ##b); } while(0)
#else
#define pr_debug(a, b...) do { } while(0)
#endif

/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)

/*
 * In the pagetable dump code this is based on, this gets called on a break
 * in a continuous series of PTE entries, to print what was collected so
 * far. Here we instead mark the page backing the entry's address - the
 * page table page itself - as untracked, using lastpage to avoid repeating
 * the work for every entry in the same page.
 */
static void note_page(void *addr)
{
    static struct page *lastpage;
    struct page *page;

    page = virt_to_page(addr);

    if (page != lastpage) {
        unsigned int level;
        pte_t *pte = lookup_address((unsigned long) addr, &level);
        struct page *pt_page2 = pte_page(*pte);
        //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2));
        SetPageTOI_Untracked(pt_page2);
        lastpage = page;
    }
}

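/*
 * Mark the page(s) holding this PTE table as untracked. All PTRS_PER_PTE
 * entries share a page, so note_page()'s lastpage check keeps this cheap.
 */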
static void walk_pte_level(pmd_t addr)
{
	int i;
	pte_t *start;

	start = (pte_t *) pmd_page_vaddr(addr);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		note_page(start);
		start++;
	}
}

#if PTRS_PER_PMD > 1

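/*
 * Walk the PMD entries under one PUD entry, descending into PTE tables for
 * present, non-large entries, and otherwise just noting the page holding
 * the PMD entry itself.
 */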
static void walk_pmd_level(pud_t addr)
{
	int i;
	pmd_t *start;

	start = (pmd_t *) pud_page_vaddr(addr);
	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(*start)) {
			if (pmd_large(*start) || !pmd_present(*start))
				note_page(start);
			else
				walk_pte_level(*start);
		} else
			note_page(start);
		start++;
	}
}

#else
#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a)))
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
#endif

#if PTRS_PER_PUD > 1

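/* Like walk_pmd_level(), one level up: walk the PUD entries under one PGD entry. */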
static void walk_pud_level(pgd_t addr)
{
	int i;
	pud_t *start;

	start = (pud_t *) pgd_page_vaddr(addr);

	for (i = 0; i < PTRS_PER_PUD; i++) {
		if (!pud_none(*start)) {
			if (pud_large(*start) || !pud_present(*start))
				note_page(start);
			else
				walk_pmd_level(*start);
		} else
			note_page(start);

		start++;
	}
}

#else
#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a)))
#define pgd_large(a) pud_large(__pud(pgd_val(a)))
#define pgd_none(a)  pud_none(__pud(pgd_val(a)))
#endif

/*
 * Walk the kernel pagetables from the top level down, marking every page
 * table page as untracked. ptdump_walk_pgd_level, which this is based on,
 * is not static in the original at the time of writing, so our copy needs
 * a different name here.
 */
static void toi_ptdump_walk_pgd_level(pgd_t *pgd)
{
#ifdef CONFIG_X86_64
	pgd_t *start = (pgd_t *) &init_level4_pgt;
#else
	pgd_t *start = swapper_pg_dir;
#endif
	int i;
	if (pgd) {
		start = pgd;
	}

	for (i = 0; i < PTRS_PER_PGD; i++) {
		if (!pgd_none(*start)) {
			if (pgd_large(*start) || !pgd_present(*start))
				note_page(start);
			else
				walk_pud_level(*start);
		} else
			note_page(start);

		start++;
	}

	/* Flush out the last page */
	note_page(start);
}

#ifdef CONFIG_PARAVIRT
extern struct pv_info pv_info;

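/* Mark every page spanned by the .parainstructions section as untracked. */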
static void toi_set_paravirt_ops_untracked(void) {
    int i;

    unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)),
                  pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end));
    //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end);
    for (i = pvpfn; i <= pvpfn_end; i++) {
        SetPageTOI_Untracked(pfn_to_page(i));
    }
}
#else
#define toi_set_paravirt_ops_untracked() do { } while (0)
#endif

extern void toi_mark_per_cpus_pages_untracked(void);

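/* Mark every page of the given kernel stack as untracked. */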
void toi_untrack_stack(unsigned long *stack)
{
    int i;
    struct page *stack_page = virt_to_page(stack);

    for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) {
        pr_debug("Untrack stack page %p.\n", page_address(stack_page + i));
        SetPageTOI_Untracked(stack_page + i);
    }
}
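
/* Mark the page holding a task_struct, and the task's kernel stack, as untracked. */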
void toi_untrack_process(struct task_struct *p)
{
    SetPageTOI_Untracked(virt_to_page(p));
    pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p)));

    toi_untrack_stack(p->stack);
}

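/*
 * Build the map of pages that must never be write-protected: page table
 * pages, paravirt instructions, task structs and stacks, per-CPU data and
 * the other special cases below. Runs once; later calls return immediately.
 */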
void toi_generate_untracked_map(void)
{
    struct task_struct *p, *t;
    struct page *page;
    pte_t *pte;
    int i;
    unsigned int level;
    static int been_here = 0;

    if (been_here)
        return;

    been_here = 1;

    /* Pagetable pages */
    toi_ptdump_walk_pgd_level(NULL);

    /* Printk buffer - not normally needed but can be helpful for debugging. */
    //toi_set_logbuf_untracked();

    /* Paravirt ops */
    toi_set_paravirt_ops_untracked();

    /* Task structs and stacks */
    for_each_process_thread(p, t) {
        toi_untrack_process(p);
        //toi_untrack_stack((unsigned long *) t->thread.sp);
    }

    for (i = 0; i < NR_CPUS; i++) {
        struct task_struct *idle = idle_task(i);

        if (idle) {
            pr_debug("Untrack idle process for CPU %d.\n", i);
            toi_untrack_process(idle);
        }

        /* IRQ stack */
        pr_debug("Untrack IRQ stack for CPU %d.\n", i);
        toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i));
    }

    /* Per CPU data */
    //pr_debug("Untracking per CPU variable pages.\n");
    toi_mark_per_cpus_pages_untracked();

    /* Init stack - for bringing up secondary CPUs */
    page = virt_to_page(init_stack);
    for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) {
        SetPageTOI_Untracked(page + i);
    }

    pte = lookup_address((unsigned long) &mmu_cr4_features, &level);
    SetPageTOI_Untracked(pte_page(*pte));
    SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features));
}

/**
 * toi_reset_dirtiness_one - mark a page clean and write-protect it.
 * @pfn: Page frame number of the page to protect.
 * @verbose: Whether to log the change.
 */

void toi_reset_dirtiness_one(unsigned long pfn, int verbose)
{
    struct page *page = pfn_to_page(pfn);

    /**
     * Don't worry about whether the Dirty flag is
     * already set. If this is our first call, it
     * won't be.
     */

    preempt_disable();

    ClearPageTOI_Dirty(page);
    SetPageTOI_RO(page);
    if (verbose)
        printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page));

    set_memory_ro((unsigned long) page_address(page), 1);

    preempt_enable();
}

/**
 * TuxOnIce's incremental image support works by marking all memory apart from
 * the page tables read-only, then, in the page faults that result, enabling
 * writing where appropriate and flagging the page as dirty. Free pages are
 * also marked as dirty and left unprotected so that, if allocated, they will
 * be included in the image without further processing.
 *
 * toi_reset_dirtiness is called when an image exists and incremental images
 * are enabled, and each time we resume thereafter. It is not invoked on a
 * fresh boot.
 *
 * This routine should be called from a single-cpu-running context to avoid
 * races in setting the page dirty/read-only flags.
 *
 * TODO: Make "it is not invoked on a fresh boot" true when I've finished
 * developing it!
 *
 * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c.
 **/
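
/*
 * For reference, the write-fault side of this scheme lives in the arch fault
 * handling and page attribute code rather than in this file. A minimal sketch
 * of what it conceptually does with a tracked, protected page - assuming the
 * corresponding SetPageTOI_Dirty/ClearPageTOI_RO helpers exist alongside the
 * ones used below:
 *
 *     if (PageTOI_RO(page) && !PageTOI_Untracked(page)) {
 *         SetPageTOI_Dirty(page);
 *         ClearPageTOI_RO(page);
 *         set_memory_rw((unsigned long) page_address(page), 1);
 *     }
 */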

int toi_reset_dirtiness(int verbose)
{
        struct zone *zone;
        unsigned long loop;
        int allocated_map = 0;

        toi_generate_untracked_map();

        if (!free_map) {
            if (!toi_alloc_bitmap(&free_map))
                return -ENOMEM;
            allocated_map = 1;
        }

        toi_generate_free_page_map();

        pr_debug(KERN_EMERG "Reset dirtiness.\n");
        for_each_populated_zone(zone) {
            // 64 bit only. No need to worry about highmem.
            for (loop = 0; loop < zone->spanned_pages; loop++) {
                unsigned long pfn = zone->zone_start_pfn + loop;
                struct page *page;
                int chunk_size;

                if (!pfn_valid(pfn)) {
                    continue;
                }

                chunk_size = toi_size_of_free_region(zone, pfn);
                if (chunk_size) {
                    loop += chunk_size - 1;
                    continue;
                }

                page = pfn_to_page(pfn);

                if (PageNosave(page) || !saveable_page(zone, pfn)) {
                    continue;
                }

                if (PageTOI_Untracked(page)) {
                    continue;
                }

                /**
                 * Do we need to (re)protect the page?
                 * If it is already protected (PageTOI_RO), there is
                 * nothing to do - skip the following.
                 * If it is marked as dirty (PageTOI_Dirty), it was
                 * either free and has been allocated or has been
                 * written to and marked dirty. Reset the dirty flag
                 * and (re)apply the protection.
                 */
                if (!PageTOI_RO(page)) {
                    toi_reset_dirtiness_one(pfn, verbose);
                }
            }
        }

        pr_debug(KERN_EMERG "Done resetting dirtiness.\n");

        if (allocated_map) {
            toi_free_bitmap(&free_map);
        }
        return 0;
}

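/*
 * If the toi_incremental_initcall boot parameter was given, start dirty page
 * tracking from this early initcall.
 */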
static int toi_reset_dirtiness_initcall(void)
{
    if (toi_do_incremental_initcall) {
        pr_info("TuxOnIce: Enabling dirty page tracking.\n");
        toi_reset_dirtiness(0);
    }
    return 0;
}
extern void toi_generate_untracked_map(void);

// Leave early_initcall for pages to register untracked sections.
early_initcall(toi_reset_dirtiness_initcall);

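/*
 * Parse the toi_incremental_initcall=<value> kernel command line parameter;
 * e.g. booting with toi_incremental_initcall=1 enables dirty page tracking
 * from boot.
 */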
static int __init toi_incremental_initcall_setup(char *str)
{
	int value;

	if (sscanf(str, "=%d", &value) && value)
		toi_do_incremental_initcall = value;

	return 1;
}
__setup("toi_incremental_initcall", toi_incremental_initcall_setup);