initial check-in of kp49 work (up to commit date)
[kernel-bfs] / kernel-bfs-2.6.28 / debian / patches / adding-ramzswap-driver.diff
1 ---
2  arch/arm/configs/rx51_defconfig         |    2 
3  drivers/block/Kconfig                   |    2 
4  drivers/block/Makefile                  |    1 
5  drivers/block/ramzswap/Kconfig          |   22 
6  drivers/block/ramzswap/Makefile         |    3 
7  drivers/block/ramzswap/compat.h         |   13 
8  drivers/block/ramzswap/ramzswap.txt     |   51 +
9  drivers/block/ramzswap/ramzswap_drv.c   | 1557 ++++++++++++++++++++++++++++++++
10  drivers/block/ramzswap/ramzswap_drv.h   |  210 ++++
11  drivers/block/ramzswap/ramzswap_ioctl.h |   50 +
12  drivers/block/ramzswap/xvmalloc.c       |  507 ++++++++++
13  drivers/block/ramzswap/xvmalloc.h       |   30 
14  drivers/block/ramzswap/xvmalloc_int.h   |   86 +
15  13 files changed, 2534 insertions(+)
16
17 Index: kernel-power-2.6.28/arch/arm/configs/rx51_defconfig
18 ===================================================================
19 --- kernel-power-2.6.28.orig/arch/arm/configs/rx51_defconfig
20 +++ kernel-power-2.6.28/arch/arm/configs/rx51_defconfig
21 @@ -830,6 +830,8 @@
22  # CONFIG_BLK_DEV_XIP is not set
23  # CONFIG_CDROM_PKTCDVD is not set
24  # CONFIG_ATA_OVER_ETH is not set
25 +CONFIG_RAMZSWAP=m
26 +# CONFIG_RAMZSWAP_STATS is not set
27  CONFIG_MISC_DEVICES=y
28  # CONFIG_EEPROM_93CX6 is not set
29  CONFIG_NOKIA_AV_DETECT=m
30 Index: kernel-power-2.6.28/drivers/block/Kconfig
31 ===================================================================
32 --- kernel-power-2.6.28.orig/drivers/block/Kconfig
33 +++ kernel-power-2.6.28/drivers/block/Kconfig
34 @@ -446,4 +446,6 @@
35  
36           If unsure, say N.
37  
38 +source "drivers/block/ramzswap/Kconfig"
39 +
40  endif # BLK_DEV
41 Index: kernel-power-2.6.28/drivers/block/Makefile
42 ===================================================================
43 --- kernel-power-2.6.28.orig/drivers/block/Makefile
44 +++ kernel-power-2.6.28/drivers/block/Makefile
45 @@ -30,5 +30,6 @@
46  obj-$(CONFIG_BLK_DEV_SX8)      += sx8.o
47  obj-$(CONFIG_BLK_DEV_UB)       += ub.o
48  obj-$(CONFIG_BLK_DEV_HD)       += hd.o
49 +obj-$(CONFIG_RAMZSWAP)         += ramzswap/
50  
51  obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += xen-blkfront.o
52 Index: kernel-power-2.6.28/drivers/block/ramzswap/Kconfig
53 ===================================================================
54 --- /dev/null
55 +++ kernel-power-2.6.28/drivers/block/ramzswap/Kconfig
56 @@ -0,0 +1,22 @@
57 +
58 +config RAMZSWAP
59 +       tristate "Compressed in-memory swap device (ramzswap)"
60 +       depends on SWAP
61 +       select LZO_COMPRESS
62 +       select LZO_DECOMPRESS
63 +       default n
64 +       help
65 +         Creates virtual block devices which can (only) be used as swap
66 +         disks. Pages swapped to these disks are compressed and stored in
67 +         memory itself.
68 +
69 +         See ramzswap.txt for more information.
70 +         Project home: http://compcache.googlecode.com/
71 +
72 +config RAMZSWAP_STATS
73 +       bool "Enable ramzswap stats"
74 +       depends on RAMZSWAP
75 +       default y
76 +       help
77 +         Enable statistics collection for ramzswap. This adds only a minimal
78 +         overhead. If unsure, say Y.
79 Index: kernel-power-2.6.28/drivers/block/ramzswap/Makefile
80 ===================================================================
81 --- /dev/null
82 +++ kernel-power-2.6.28/drivers/block/ramzswap/Makefile
83 @@ -0,0 +1,3 @@
84 +ramzswap-objs  :=      ramzswap_drv.o xvmalloc.o
85 +
86 +obj-$(CONFIG_RAMZSWAP) +=      ramzswap.o
87 Index: kernel-power-2.6.28/drivers/block/ramzswap/compat.h
88 ===================================================================
89 --- /dev/null
90 +++ kernel-power-2.6.28/drivers/block/ramzswap/compat.h
91 @@ -0,0 +1,13 @@
92 +#ifndef _RAMZSWAP_COMPAT_H_
93 +#define _RAMZSWAP_COMPAT_H_
94 +
95 +/* Comment this out if your kernel does not have the swap free notify patch */
96 +#define CONFIG_SWAP_FREE_NOTIFY
97 +
98 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31))
99 +#define blk_queue_physical_block_size(q, size) \
100 +       blk_queue_hardsect_size(q, size)
101 +#define blk_queue_logical_block_size(q, size)
102 +#endif
103 +
104 +#endif
105 Index: kernel-power-2.6.28/drivers/block/ramzswap/ramzswap.txt
106 ===================================================================
107 --- /dev/null
108 +++ kernel-power-2.6.28/drivers/block/ramzswap/ramzswap.txt
109 @@ -0,0 +1,51 @@
110 +ramzswap: Compressed RAM based swap device
111 +-------------------------------------------
112 +
113 +Project home: http://compcache.googlecode.com/
114 +
115 +* Introduction
116 +
117 +The ramzswap module creates RAM based block devices which can (only) be used as
118 +swap disks. Pages swapped to these devices are compressed and stored in memory
119 +itself. See project home for use cases, performance numbers and a lot more.
120 +
121 +Individual ramzswap devices are configured and initialized using rzscontrol
122 +userspace utility as shown in examples below. See rzscontrol man page for more
123 +details.
124 +
125 +* Usage
126 +
127 +The following shows a typical sequence of steps for using ramzswap.
128 +
129 +1) Load Modules:
130 +       modprobe ramzswap num_devices=4
131 +       This creates 4 (uninitialized) devices: /dev/ramzswap{0,1,2,3}
132 +       (num_devices parameter is optional. Default: 1)
133 +
134 +2) Initialize:
135 +       Use rzscontrol utility to configure and initialize individual
136 +       ramzswap devices. Example:
137 +       rzscontrol /dev/ramzswap2 --init # uses default value of disksize_kb
138 +
139 +       *See rzscontrol man page for more details and examples*
140 +
141 +3) Activate:
142 +       swapon /dev/ramzswap2 # or any other initialized ramzswap device
143 +
144 +4) Stats:
145 +       rzscontrol /dev/ramzswap2 --stats
146 +
147 +5) Deactivate:
148 +       swapoff /dev/ramzswap2
149 +
150 +6) Reset:
151 +       rzscontrol /dev/ramzswap2 --reset
152 +       (This frees all the memory allocated for this device).
153 +
154 +
155 +Please report any problems at:
156 + - Mailing list: linux-mm-cc at laptop dot org
157 + - Issue tracker: http://code.google.com/p/compcache/issues/list
158 +
159 +Nitin Gupta
160 +ngupta@vflare.org
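Note: the rzscontrol steps above ultimately reduce to a few ioctls on the ramzswap
block device, using the RZSIO_* request codes added by ramzswap_ioctl.h in this
patch. The fragment below is only a rough, untested sketch of that sequence; the
device node, size and include path are illustrative, and rzscontrol remains the
supported way to do this.

	/* Sketch: configure and activate /dev/ramzswap0 by hand */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/swap.h>
	#include <unistd.h>

	#include "ramzswap_ioctl.h"	/* RZSIO_* request codes from this patch */

	int main(void)
	{
		size_t disksize_kb = 32 * 1024;	/* example: 32 MB */
		int fd = open("/dev/ramzswap0", O_RDWR);

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* Size must be set before init; the driver returns EBUSY afterwards. */
		if (ioctl(fd, RZSIO_SET_DISKSIZE_KB, &disksize_kb) < 0 ||
		    ioctl(fd, RZSIO_INIT, 0) < 0) {
			perror("ioctl");
			close(fd);
			return 1;
		}
		close(fd);

		/* Equivalent of "swapon /dev/ramzswap0" */
		if (swapon("/dev/ramzswap0", 0) < 0) {
			perror("swapon");
			return 1;
		}
		return 0;
	}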
161 Index: kernel-power-2.6.28/drivers/block/ramzswap/ramzswap_drv.c
162 ===================================================================
163 --- /dev/null
164 +++ kernel-power-2.6.28/drivers/block/ramzswap/ramzswap_drv.c
165 @@ -0,0 +1,1557 @@
166 +/*
167 + * Compressed RAM based swap device
168 + *
169 + * Copyright (C) 2008, 2009, 2010  Nitin Gupta
170 + *
171 + * This code is released using a dual license strategy: BSD/GPL
172 + * You can choose the licence that better fits your requirements.
173 + *
174 + * Released under the terms of 3-clause BSD License
175 + * Released under the terms of GNU General Public License Version 2.0
176 + *
177 + * Project home: http://compcache.googlecode.com
178 + */
179 +
180 +#define KMSG_COMPONENT "ramzswap"
181 +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
182 +
183 +#include <linux/module.h>
184 +#include <linux/kernel.h>
185 +#include <linux/bitops.h>
186 +#include <linux/blkdev.h>
187 +#include <linux/buffer_head.h>
188 +#include <linux/device.h>
189 +#include <linux/genhd.h>
190 +#include <linux/highmem.h>
191 +#include <linux/lzo.h>
192 +#include <linux/string.h>
193 +#include <linux/swap.h>
194 +#include <linux/swapops.h>
195 +#include <linux/vmalloc.h>
196 +#include <linux/version.h>
197 +
198 +#include "compat.h"
199 +#include "ramzswap_drv.h"
200 +
201 +/* Module params (documentation at end) */
202 +static unsigned int num_devices;
203 +static unsigned long disksize_kb;
204 +static unsigned long memlimit_kb;
205 +static char backing_swap[MAX_SWAP_NAME_LEN];
206 +
207 +/* Globals */
208 +static int ramzswap_major;
209 +static struct ramzswap *devices;
210 +
211 +/*
212 + * Pages that compress to larger than this size are
213 + * forwarded to backing swap, if present, or stored
214 + * uncompressed in memory otherwise.
215 + */
216 +static unsigned int max_zpage_size;
217 +
218 +static int rzs_test_flag(struct ramzswap *rzs, u32 index,
219 +                       enum rzs_pageflags flag)
220 +{
221 +       return rzs->table[index].flags & BIT(flag);
222 +}
223 +
224 +static void rzs_set_flag(struct ramzswap *rzs, u32 index,
225 +                       enum rzs_pageflags flag)
226 +{
227 +       rzs->table[index].flags |= BIT(flag);
228 +}
229 +
230 +static void rzs_clear_flag(struct ramzswap *rzs, u32 index,
231 +                       enum rzs_pageflags flag)
232 +{
233 +       rzs->table[index].flags &= ~BIT(flag);
234 +}
235 +
236 +static int page_zero_filled(void *ptr)
237 +{
238 +       unsigned int pos;
239 +       unsigned long *page;
240 +
241 +       page = (unsigned long *)ptr;
242 +
243 +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
244 +               if (page[pos])
245 +                       return 0;
246 +       }
247 +
248 +       return 1;
249 +}
250 +
251 +/*
252 + * memlimit cannot be greater than backing disk size.
253 + */
254 +static void ramzswap_set_memlimit(struct ramzswap *rzs, size_t totalram_bytes)
255 +{
256 +       int memlimit_valid = 1;
257 +
258 +       if (!rzs->memlimit) {
259 +               pr_info("Memory limit not set.\n");
260 +               memlimit_valid = 0;
261 +       }
262 +
263 +       if (rzs->memlimit > rzs->disksize) {
264 +               pr_info("Memory limit cannot be greater than "
265 +                       "disksize: limit=%zu, disksize=%zu\n",
266 +                       rzs->memlimit, rzs->disksize);
267 +               memlimit_valid = 0;
268 +       }
269 +
270 +       if (!memlimit_valid) {
271 +               size_t mempart, disksize;
272 +               pr_info("Using default: smaller of (%u%% of RAM) and "
273 +                       "(backing disk size).\n",
274 +                       default_memlimit_perc_ram);
275 +               mempart = default_memlimit_perc_ram * (totalram_bytes / 100);
276 +               disksize = rzs->disksize;
277 +               rzs->memlimit = mempart > disksize ? disksize : mempart;
278 +       }
279 +
280 +       if (rzs->memlimit > totalram_bytes / 2) {
281 +               pr_info(
282 +               "It's not advisable to set the limit to more than half the "
283 +               "size of memory since we expect a 2:1 compression ratio. "
284 +               "Limit represents amount of *compressed* data we can keep "
285 +               "in memory!\n"
286 +               "\tMemory Size: %zu kB\n"
287 +               "\tLimit you selected: %zu kB\n"
288 +               "Continuing anyway ...\n",
289 +               totalram_bytes >> 10, rzs->memlimit >> 10
290 +               );
291 +       }
292 +
293 +       rzs->memlimit &= PAGE_MASK;
294 +       BUG_ON(!rzs->memlimit);
295 +}
296 +
297 +static void ramzswap_set_disksize(struct ramzswap *rzs, size_t totalram_bytes)
298 +{
299 +       if (!rzs->disksize) {
300 +               pr_info(
301 +               "disk size not provided. You can use disksize_kb module "
302 +               "param to specify size.\nUsing default: (%u%% of RAM).\n",
303 +               default_disksize_perc_ram
304 +               );
305 +               rzs->disksize = default_disksize_perc_ram *
306 +                                       (totalram_bytes / 100);
307 +       }
308 +
309 +       if (rzs->disksize > 2 * (totalram_bytes)) {
310 +               pr_info(
311 +               "There is little point creating a ramzswap of greater than "
312 +               "twice the size of memory since we expect a 2:1 compression "
313 +               "ratio. Note that ramzswap uses about 0.1%% of the size of "
314 +               "the swap device when not in use so a huge ramzswap is "
315 +               "wasteful.\n"
316 +               "\tMemory Size: %zu kB\n"
317 +               "\tSize you selected: %zu kB\n"
318 +               "Continuing anyway ...\n",
319 +               totalram_bytes >> 10, rzs->disksize >> 10
320 +               );
321 +       }
322 +
323 +       rzs->disksize &= PAGE_MASK;
324 +}
325 +
326 +/*
327 + * Swap header (1st page of swap device) contains information
328 + * to identify it as a swap partition. Prepare such a header
329 + * for ramzswap device (ramzswap0) so that swapon can identify
330 + * it as a swap partition. In case a backing swap device is provided,
331 + * copy its swap header.
332 + */
333 +static int setup_swap_header(struct ramzswap *rzs, union swap_header *s)
334 +{
335 +       int ret = 0;
336 +       struct page *page;
337 +       struct address_space *mapping;
338 +       union swap_header *backing_swap_header;
339 +
340 +       /*
341 +        * There is no backing swap device. Create a swap header
342 +        * that is acceptable to swapon.
343 +        */
344 +       if (!rzs->backing_swap) {
345 +               s->info.version = 1;
346 +               s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
347 +               s->info.nr_badpages = 0;
348 +               memcpy(s->magic.magic, "SWAPSPACE2", 10);
349 +               return 0;
350 +       }
351 +
352 +       /*
353 +        * We have a backing swap device. Copy its swap header
354 +        * to ramzswap device header. If this header contains
355 +        * invalid information (backing device not a swap
356 +        * partition, etc.), swapon will fail for ramzswap
357 +        * which is correct behavior - we don't want to swap
358 +        * over filesystem partition!
359 +        */
360 +
361 +       /* Read the backing swap header (code from sys_swapon) */
362 +       mapping = rzs->swap_file->f_mapping;
363 +       if (!mapping->a_ops->readpage) {
364 +               ret = -EINVAL;
365 +               goto out;
366 +       }
367 +
368 +       page = read_mapping_page(mapping, 0, rzs->swap_file);
369 +       if (IS_ERR(page)) {
370 +               ret = PTR_ERR(page);
371 +               goto out;
372 +       }
373 +
374 +       backing_swap_header = kmap(page);
375 +       memcpy(s, backing_swap_header, sizeof(*s));
376 +       if (s->info.nr_badpages) {
377 +               pr_info("Cannot use backing swap with bad pages (%u)\n",
378 +                       s->info.nr_badpages);
379 +               ret = -EINVAL;
380 +       }
381 +       /*
382 +        * ramzswap disksize equals number of usable pages in backing
383 +        * swap. Set last_page in swap header to match this disksize
384 +        * ('last_page' means 0-based index of last usable swap page).
385 +        */
386 +       s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
387 +       kunmap(page);
388 +
389 +out:
390 +       return ret;
391 +}
392 +
393 +/*static void flush_dcache_page(struct page *page)
394 +{
395 +#if defined(CONFIG_ARM)
396 +       int flag = 0;*/
397 +       /*
398 +        * Ugly hack to get flush_dcache_page() work on ARM.
399 +        * page_mapping(page) == NULL after clearing this swap cache flag.
400 +        * Without clearing this flag, flush_dcache_page() will simply set
401 +        * "PG_dcache_dirty" bit and return.
402 +        */
403 +       /*if (PageSwapCache(page)) {
404 +               flag = 1;
405 +               ClearPageSwapCache(page);
406 +       }
407 +#endif
408 +       flush_dcache_page(page);
409 +#if defined(CONFIG_ARM)
410 +       if (flag)
411 +               SetPageSwapCache(page);
412 +#endif
413 +}*/
414 +
415 +static void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
416 +                       struct ramzswap_ioctl_stats *s)
417 +{
418 +       strncpy(s->backing_swap_name, rzs->backing_swap_name,
419 +               MAX_SWAP_NAME_LEN - 1);
420 +       s->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
421 +
422 +       s->disksize = rzs->disksize;
423 +       s->memlimit = rzs->memlimit;
424 +
425 +#if defined(CONFIG_RAMZSWAP_STATS)
426 +       {
427 +       struct ramzswap_stats *rs = &rzs->stats;
428 +       size_t succ_writes, mem_used;
429 +       unsigned int good_compress_perc = 0, no_compress_perc = 0;
430 +
431 +       mem_used = xv_get_total_size_bytes(rzs->mem_pool)
432 +                       + (rs->pages_expand << PAGE_SHIFT);
433 +       succ_writes = stat64_read(rzs, &rs->num_writes) -
434 +                       stat64_read(rzs, &rs->failed_writes);
435 +
436 +       if (succ_writes && rs->pages_stored) {
437 +               good_compress_perc = rs->good_compress * 100
438 +                                       / rs->pages_stored;
439 +               no_compress_perc = rs->pages_expand * 100
440 +                                       / rs->pages_stored;
441 +       }
442 +
443 +       s->num_reads = stat64_read(rzs, &rs->num_reads);
444 +       s->num_writes = stat64_read(rzs, &rs->num_writes);
445 +       s->failed_reads = stat64_read(rzs, &rs->failed_reads);
446 +       s->failed_writes = stat64_read(rzs, &rs->failed_writes);
447 +       s->invalid_io = stat64_read(rzs, &rs->invalid_io);
448 +       s->notify_free = stat64_read(rzs, &rs->notify_free);
449 +       s->pages_zero = rs->pages_zero;
450 +
451 +       s->good_compress_pct = good_compress_perc;
452 +       s->pages_expand_pct = no_compress_perc;
453 +
454 +       s->pages_stored = rs->pages_stored;
455 +       s->pages_used = mem_used >> PAGE_SHIFT;
456 +       s->orig_data_size = rs->pages_stored << PAGE_SHIFT;
457 +       s->compr_data_size = rs->compr_size;
458 +       s->mem_used_total = mem_used;
459 +
460 +       s->bdev_num_reads = stat64_read(rzs, &rs->bdev_num_reads);
461 +       s->bdev_num_writes = stat64_read(rzs, &rs->bdev_num_writes);
462 +       }
463 +#endif /* CONFIG_RAMZSWAP_STATS */
464 +}
465 +
466 +static int add_backing_swap_extent(struct ramzswap *rzs,
467 +                               pgoff_t phy_pagenum,
468 +                               pgoff_t num_pages)
469 +{
470 +       unsigned int idx;
471 +       struct list_head *head;
472 +       struct page *curr_page, *new_page;
473 +       unsigned int extents_per_page = PAGE_SIZE /
474 +                               sizeof(struct ramzswap_backing_extent);
475 +
476 +       idx = rzs->num_extents % extents_per_page;
477 +       if (!idx) {
478 +               new_page = alloc_page(__GFP_ZERO);
479 +               if (!new_page)
480 +                       return -ENOMEM;
481 +
482 +               if (rzs->num_extents) {
483 +                       curr_page = virt_to_page(rzs->curr_extent);
484 +                       head = &curr_page->lru;
485 +               } else {
486 +                       head = &rzs->backing_swap_extent_list;
487 +               }
488 +
489 +               list_add(&new_page->lru, head);
490 +               rzs->curr_extent = page_address(new_page);
491 +       }
492 +
493 +       rzs->curr_extent->phy_pagenum = phy_pagenum;
494 +       rzs->curr_extent->num_pages = num_pages;
495 +
496 +       pr_debug("add_extent: idx=%u, phy_pgnum=%lu, num_pgs=%lu, "
497 +               "pg_last=%lu, curr_ext=%p\n", idx, phy_pagenum, num_pages,
498 +               phy_pagenum + num_pages - 1, rzs->curr_extent);
499 +
500 +       if (idx != extents_per_page - 1)
501 +               rzs->curr_extent++;
502 +
503 +       return 0;
504 +}
505 +
506 +static int setup_backing_swap_extents(struct ramzswap *rzs,
507 +                               struct inode *inode, unsigned long *num_pages)
508 +{
509 +       int ret = 0;
510 +       unsigned blkbits;
511 +       unsigned blocks_per_page;
512 +       pgoff_t contig_pages = 0, total_pages = 0;
513 +       pgoff_t pagenum = 0, prev_pagenum = 0;
514 +       sector_t probe_block = 0;
515 +       sector_t last_block;
516 +
517 +       blkbits = inode->i_blkbits;
518 +       blocks_per_page = PAGE_SIZE >> blkbits;
519 +
520 +       last_block = i_size_read(inode) >> blkbits;
521 +       while (probe_block + blocks_per_page <= last_block) {
522 +               unsigned block_in_page;
523 +               sector_t first_block;
524 +
525 +               first_block = bmap(inode, probe_block);
526 +               if (first_block == 0)
527 +                       goto bad_bmap;
528 +
529 +               /* It must be PAGE_SIZE aligned on-disk */
530 +               if (first_block & (blocks_per_page - 1)) {
531 +                       probe_block++;
532 +                       goto probe_next;
533 +               }
534 +
535 +               /* All blocks within this page must be contiguous on disk */
536 +               for (block_in_page = 1; block_in_page < blocks_per_page;
537 +                                       block_in_page++) {
538 +                       sector_t block;
539 +
540 +                       block = bmap(inode, probe_block + block_in_page);
541 +                       if (block == 0)
542 +                               goto bad_bmap;
543 +                       if (block != first_block + block_in_page) {
544 +                               /* Discontiguity */
545 +                               probe_block++;
546 +                               goto probe_next;
547 +                       }
548 +               }
549 +
550 +               /*
551 +                * We found a PAGE_SIZE length, PAGE_SIZE aligned
552 +                * run of blocks.
553 +                */
554 +               pagenum = first_block >> (PAGE_SHIFT - blkbits);
555 +
556 +               if (total_pages && (pagenum != prev_pagenum + 1)) {
557 +                       ret = add_backing_swap_extent(rzs, prev_pagenum -
558 +                                       (contig_pages - 1), contig_pages);
559 +                       if (ret < 0)
560 +                               goto out;
561 +                       rzs->num_extents++;
562 +                       contig_pages = 0;
563 +               }
564 +               total_pages++;
565 +               contig_pages++;
566 +               prev_pagenum = pagenum;
567 +               probe_block += blocks_per_page;
568 +
569 +probe_next:
570 +               continue;
571 +       }
572 +
573 +       if (contig_pages) {
574 +               pr_debug("adding last extent: pagenum=%lu, "
575 +                       "contig_pages=%lu\n", pagenum, contig_pages);
576 +               ret = add_backing_swap_extent(rzs,
577 +                       prev_pagenum - (contig_pages - 1), contig_pages);
578 +               if (ret < 0)
579 +                       goto out;
580 +               rzs->num_extents++;
581 +       }
582 +       if (!rzs->num_extents) {
583 +               pr_err("No swap extents found!\n");
584 +               ret = -EINVAL;
585 +       }
586 +
587 +       if (!ret) {
588 +               *num_pages = total_pages;
589 +               pr_info("Found %lu extents containing %luk\n",
590 +                       rzs->num_extents, *num_pages << (PAGE_SHIFT - 10));
591 +       }
592 +       goto out;
593 +
594 +bad_bmap:
595 +       pr_err("Backing swapfile has holes\n");
596 +       ret = -EINVAL;
597 +
598 +out:
599 +       while (ret && !list_empty(&rzs->backing_swap_extent_list)) {
600 +               struct page *page;
601 +               struct list_head *entry = rzs->backing_swap_extent_list.next;
602 +               page = list_entry(entry, struct page, lru);
603 +               list_del(entry);
604 +               __free_page(page);
605 +       }
606 +       return ret;
607 +}
608 +
609 +static void map_backing_swap_extents(struct ramzswap *rzs)
610 +{
611 +       struct ramzswap_backing_extent *se;
612 +       struct page *table_page, *se_page;
613 +       unsigned long num_pages, num_table_pages, entry;
614 +       unsigned long se_idx, span;
615 +       unsigned entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
616 +       unsigned extents_per_page = PAGE_SIZE / sizeof(*se);
617 +
618 +       /* True for block device */
619 +       if (!rzs->num_extents)
620 +               return;
621 +
622 +       se_page = list_entry(rzs->backing_swap_extent_list.next,
623 +                                       struct page, lru);
624 +       se = page_address(se_page);
625 +       span = se->num_pages;
626 +       num_pages = rzs->disksize >> PAGE_SHIFT;
627 +       num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
628 +                                                       PAGE_SIZE);
629 +
630 +       entry = 0;
631 +       se_idx = 0;
632 +       while (num_table_pages--) {
633 +               table_page = vmalloc_to_page(&rzs->table[entry]);
634 +               while (span <= entry) {
635 +                       se_idx++;
636 +                       if (se_idx == rzs->num_extents)
637 +                               BUG();
638 +
639 +                       if (!(se_idx % extents_per_page)) {
640 +                               se_page = list_entry(se_page->lru.next,
641 +                                               struct page, lru);
642 +                               se = page_address(se_page);
643 +                       } else
644 +                               se++;
645 +
646 +                       span += se->num_pages;
647 +               }
648 +               table_page->mapping = (struct address_space *)se;
649 +               table_page->private = se->num_pages - (span - entry);
650 +               pr_debug("map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
651 +                       entry, span, table_page->mapping, table_page->private);
652 +               entry += entries_per_page;
653 +       }
654 +}
655 +
656 +/*
657 + * Check if value of backing_swap module param is sane.
658 + * Claim this device and set ramzswap size equal to
659 + * size of this block device.
660 + */
661 +static int setup_backing_swap(struct ramzswap *rzs)
662 +{
663 +       int ret = 0;
664 +       size_t disksize;
665 +       unsigned long num_pages = 0;
666 +       struct inode *inode;
667 +       struct file *swap_file;
668 +       struct address_space *mapping;
669 +       struct block_device *bdev = NULL;
670 +
671 +       if (!rzs->backing_swap_name[0]) {
672 +               pr_debug("backing_swap param not given\n");
673 +               goto out;
674 +       }
675 +
676 +       pr_debug("Using backing swap device: %s\n", rzs->backing_swap_name);
677 +
678 +       swap_file = filp_open(rzs->backing_swap_name,
679 +                               O_RDWR | O_LARGEFILE, 0);
680 +       if (IS_ERR(swap_file)) {
681 +               pr_err("Error opening backing device: %s\n",
682 +                       rzs->backing_swap_name);
683 +               ret = -EINVAL;
684 +               goto out;
685 +       }
686 +
687 +       mapping = swap_file->f_mapping;
688 +       inode = mapping->host;
689 +
690 +       if (S_ISBLK(inode->i_mode)) {
691 +               bdev = I_BDEV(inode);
692 +               ret = bd_claim(bdev, setup_backing_swap);
693 +               if (ret < 0) {
694 +                       bdev = NULL;
695 +                       goto bad_param;
696 +               }
697 +               disksize = i_size_read(inode);
698 +               if (!disksize) {
699 +                       pr_err("Error reading backing swap size.\n");
700 +                       goto bad_param;
701 +               }
702 +       } else if (S_ISREG(inode->i_mode)) {
703 +               bdev = inode->i_sb->s_bdev;
704 +               if (IS_SWAPFILE(inode)) {
705 +                       ret = -EBUSY;
706 +                       goto bad_param;
707 +               }
708 +               ret = setup_backing_swap_extents(rzs, inode, &num_pages);
709 +               if (ret < 0)
710 +                       goto bad_param;
711 +               disksize = num_pages << PAGE_SHIFT;
712 +       } else {
713 +               goto bad_param;
714 +       }
715 +
716 +       rzs->swap_file = swap_file;
717 +       rzs->backing_swap = bdev;
718 +       rzs->disksize = disksize;
719 +
720 +       return 0;
721 +
722 +bad_param:
723 +       if (bdev)
724 +               bd_release(bdev);
725 +       filp_close(swap_file, NULL);
726 +
727 +out:
728 +       rzs->backing_swap = NULL;
729 +       return ret;
730 +}
731 +
732 +/*
733 + * Map logical page number 'pagenum' to physical page number
734 + * on backing swap device. For block device, this is a nop.
735 + */
736 +static u32 map_backing_swap_page(struct ramzswap *rzs, u32 pagenum)
737 +{
738 +       u32 skip_pages, entries_per_page;
739 +       size_t delta, se_offset, skipped;
740 +       struct page *table_page, *se_page;
741 +       struct ramzswap_backing_extent *se;
742 +
743 +       if (!rzs->num_extents)
744 +               return pagenum;
745 +
746 +       entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
747 +
748 +       table_page = vmalloc_to_page(&rzs->table[pagenum]);
749 +       se = (struct ramzswap_backing_extent *)table_page->mapping;
750 +       se_page = virt_to_page(se);
751 +
752 +       skip_pages = pagenum - (pagenum / entries_per_page * entries_per_page);
753 +       se_offset = table_page->private + skip_pages;
754 +
755 +       if (se_offset < se->num_pages)
756 +               return se->phy_pagenum + se_offset;
757 +
758 +       skipped = se->num_pages - table_page->private;
759 +       do {
760 +               struct ramzswap_backing_extent *se_base;
761 +               u32 se_entries_per_page = PAGE_SIZE / sizeof(*se);
762 +
763 +               /* Get next swap extent */
764 +               se_base = (struct ramzswap_backing_extent *)
765 +                                               page_address(se_page);
766 +               if (se - se_base == se_entries_per_page - 1) {
767 +                       se_page = list_entry(se_page->lru.next,
768 +                                               struct page, lru);
769 +                       se = page_address(se_page);
770 +               } else {
771 +                       se++;
772 +               }
773 +
774 +               skipped += se->num_pages;
775 +       } while (skipped < skip_pages);
776 +
777 +       delta = skipped - skip_pages;
778 +       se_offset = se->num_pages - delta;
779 +
780 +       return se->phy_pagenum + se_offset;
781 +}
782 +
783 +static void ramzswap_free_page(struct ramzswap *rzs, size_t index)
784 +{
785 +#if defined(CONFIG_RAMZSWAP_STATS)
786 +       u32 clen;
787 +       void *obj;
788 +#endif
789 +       struct page *page = rzs->table[index].page;
790 +       u32 offset = rzs->table[index].offset;
791 +
792 +       if (unlikely(!page)) {
793 +               /*
794 +                * No memory is allocated for zero filled pages.
795 +                * Simply clear zero page flag.
796 +                */
797 +               if (rzs_test_flag(rzs, index, RZS_ZERO)) {
798 +                       rzs_clear_flag(rzs, index, RZS_ZERO);
799 +                       stat_dec(&rzs->stats.pages_zero);
800 +               }
801 +               return;
802 +       }
803 +
804 +       if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED))) {
805 +#if defined(CONFIG_RAMZSWAP_STATS)
806 +               clen = PAGE_SIZE;
807 +#endif
808 +               stat_dec(&rzs->stats.pages_expand);
809 +               __free_page(page);
810 +               rzs_clear_flag(rzs, index, RZS_UNCOMPRESSED);
811 +               goto out;
812 +       }
813 +
814 +#if defined(CONFIG_RAMZSWAP_STATS)
815 +       obj = kmap_atomic(page, KM_USER0) + offset;
816 +       clen = xv_get_object_size(obj) - sizeof(struct zobj_header);
817 +       kunmap_atomic(obj, KM_USER0);
818 +       if (clen <= PAGE_SIZE / 2)
819 +               stat_dec(&rzs->stats.good_compress);
820 +#endif
821 +       xv_free(rzs->mem_pool, page, offset);
822 +
823 +out:
824 +#if defined(CONFIG_RAMZSWAP_STATS)
825 +       rzs->stats.compr_size -= clen;
826 +       stat_dec(&rzs->stats.pages_stored);
827 +#endif
828 +
829 +       rzs->table[index].page = NULL;
830 +       rzs->table[index].offset = 0;
831 +}
832 +
833 +static int handle_zero_page(struct bio *bio)
834 +{
835 +       void *user_mem;
836 +       struct page *page = bio->bi_io_vec[0].bv_page;
837 +
838 +       user_mem = kmap_atomic(page, KM_USER0);
839 +       memset(user_mem, 0, PAGE_SIZE);
840 +       kunmap_atomic(user_mem, KM_USER0);
841 +
842 +       flush_dcache_page(page);
843 +
844 +       set_bit(BIO_UPTODATE, &bio->bi_flags);
845 +       bio_endio(bio, 0);
846 +       return 0;
847 +}
848 +
849 +static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
850 +{
851 +       u32 index;
852 +       struct page *page;
853 +       unsigned char *user_mem, *cmem;
854 +
855 +       page = bio->bi_io_vec[0].bv_page;
856 +       index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
857 +
858 +       user_mem = kmap_atomic(page, KM_USER0);
859 +       cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
860 +                       rzs->table[index].offset;
861 +
862 +       memcpy(user_mem, cmem, PAGE_SIZE);
863 +       kunmap_atomic(user_mem, KM_USER0);
864 +       kunmap_atomic(cmem, KM_USER1);
865 +
866 +       flush_dcache_page(page);
867 +
868 +       set_bit(BIO_UPTODATE, &bio->bi_flags);
869 +       bio_endio(bio, 0);
870 +       return 0;
871 +}
872 +
873 +
874 +/*
875 + * Called when request page is not present in ramzswap.
876 + * It's either in the backing swap device (if present) or
877 + * this is an attempt to read before any previous write
878 + * to this location - this happens due to readahead when
879 + * swap device is read from user-space (e.g. during swapon)
880 + */
881 +static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
882 +{
883 +       /*
884 +        * Always forward such requests to backing swap
885 +        * device (if present)
886 +        */
887 +       if (rzs->backing_swap) {
888 +               u32 pagenum;
889 +               stat64_dec(rzs, &rzs->stats.num_reads);
890 +               stat64_inc(rzs, &rzs->stats.bdev_num_reads);
891 +               bio->bi_bdev = rzs->backing_swap;
892 +
893 +               /*
894 +                * In case backing swap is a file, find the right offset within
895 +                * the file corresponding to logical position 'index'. For block
896 +                * device, this is a nop.
897 +                */
898 +               pagenum = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
899 +               bio->bi_sector = map_backing_swap_page(rzs, pagenum)
900 +                                       << SECTORS_PER_PAGE_SHIFT;
901 +               return 1;
902 +       }
903 +
904 +       /*
905 +        * It's an unlikely event when backing dev is
906 +        * not present
907 +        */
908 +       pr_debug("Read before write on swap device: "
909 +               "sector=%lu, size=%u, offset=%u\n",
910 +               (ulong)(bio->bi_sector), bio->bi_size,
911 +               bio->bi_io_vec[0].bv_offset);
912 +
913 +       /* Do nothing. Just return success */
914 +       set_bit(BIO_UPTODATE, &bio->bi_flags);
915 +       bio_endio(bio, 0);
916 +       return 0;
917 +}
918 +
919 +static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
920 +{
921 +       int ret;
922 +       u32 index;
923 +       size_t clen;
924 +       struct page *page;
925 +       struct zobj_header *zheader;
926 +       unsigned char *user_mem, *cmem;
927 +
928 +       stat64_inc(rzs, &rzs->stats.num_reads);
929 +
930 +       page = bio->bi_io_vec[0].bv_page;
931 +       index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
932 +
933 +       if (rzs_test_flag(rzs, index, RZS_ZERO))
934 +               return handle_zero_page(bio);
935 +
936 +       /* Requested page is not present in compressed area */
937 +       if (!rzs->table[index].page)
938 +               return handle_ramzswap_fault(rzs, bio);
939 +
940 +       /* Page is stored uncompressed since it's incompressible */
941 +       if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
942 +               return handle_uncompressed_page(rzs, bio);
943 +
944 +       user_mem = kmap_atomic(page, KM_USER0);
945 +       clen = PAGE_SIZE;
946 +
947 +       cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
948 +                       rzs->table[index].offset;
949 +
950 +       ret = lzo1x_decompress_safe(
951 +               cmem + sizeof(*zheader),
952 +               xv_get_object_size(cmem) - sizeof(*zheader),
953 +               user_mem, &clen);
954 +
955 +       kunmap_atomic(user_mem, KM_USER0);
956 +       kunmap_atomic(cmem, KM_USER1);
957 +
958 +       /* should NEVER happen */
959 +       if (unlikely(ret != LZO_E_OK)) {
960 +               pr_err("Decompression failed! err=%d, page=%u\n",
961 +                       ret, index);
962 +               stat64_inc(rzs, &rzs->stats.failed_reads);
963 +               goto out;
964 +       }
965 +
966 +       flush_dcache_page(page);
967 +
968 +       set_bit(BIO_UPTODATE, &bio->bi_flags);
969 +       bio_endio(bio, 0);
970 +       return 0;
971 +
972 +out:
973 +       bio_io_error(bio);
974 +       return 0;
975 +}
976 +
977 +static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
978 +{
979 +       int ret, fwd_write_request = 0;
980 +       u32 offset, index;
981 +       size_t clen;
982 +       struct zobj_header *zheader;
983 +       struct page *page, *page_store;
984 +       unsigned char *user_mem, *cmem, *src;
985 +
986 +       stat64_inc(rzs, &rzs->stats.num_writes);
987 +
988 +       page = bio->bi_io_vec[0].bv_page;
989 +       index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
990 +
991 +       src = rzs->compress_buffer;
992 +
993 +       if (rzs->table[index].page || rzs_test_flag(rzs, index, RZS_ZERO))
994 +               ramzswap_free_page(rzs, index);
995 +
996 +       mutex_lock(&rzs->lock);
997 +
998 +       user_mem = kmap_atomic(page, KM_USER0);
999 +       if (page_zero_filled(user_mem)) {
1000 +               kunmap_atomic(user_mem, KM_USER0);
1001 +               rzs_set_flag(rzs, index, RZS_ZERO);
1002 +               mutex_unlock(&rzs->lock);
1003 +               stat_inc(&rzs->stats.pages_zero);
1004 +
1005 +               set_bit(BIO_UPTODATE, &bio->bi_flags);
1006 +               bio_endio(bio, 0);
1007 +               return 0;
1008 +       }
1009 +
1010 +       if (rzs->backing_swap &&
1011 +               (rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
1012 +               kunmap_atomic(user_mem, KM_USER0);
1013 +               mutex_unlock(&rzs->lock);
1014 +               fwd_write_request = 1;
1015 +               goto out;
1016 +       }
1017 +
1018 +       ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
1019 +                               rzs->compress_workmem);
1020 +
1021 +       kunmap_atomic(user_mem, KM_USER0);
1022 +
1023 +       if (unlikely(ret != LZO_E_OK)) {
1024 +               mutex_unlock(&rzs->lock);
1025 +               pr_err("Compression failed! err=%d\n", ret);
1026 +               stat64_inc(rzs, &rzs->stats.failed_writes);
1027 +               goto out;
1028 +       }
1029 +
1030 +       /*
1031 +        * Page is incompressible. Forward it to backing swap
1032 +        * if present. Otherwise, store it as-is (uncompressed)
1033 +        * since we do not want to return too many swap write
1034 +        * errors, which has the side effect of hanging the system.
1035 +        */
1036 +       if (unlikely(clen > max_zpage_size)) {
1037 +               if (rzs->backing_swap) {
1038 +                       mutex_unlock(&rzs->lock);
1039 +                       fwd_write_request = 1;
1040 +                       goto out;
1041 +               }
1042 +
1043 +               clen = PAGE_SIZE;
1044 +               page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
1045 +               if (unlikely(!page_store)) {
1046 +                       mutex_unlock(&rzs->lock);
1047 +                       pr_info("Error allocating memory for incompressible "
1048 +                               "page: %u\n", index);
1049 +                       stat64_inc(rzs, &rzs->stats.failed_writes);
1050 +                       goto out;
1051 +               }
1052 +
1053 +               offset = 0;
1054 +               rzs_set_flag(rzs, index, RZS_UNCOMPRESSED);
1055 +               stat_inc(&rzs->stats.pages_expand);
1056 +               rzs->table[index].page = page_store;
1057 +               src = kmap_atomic(page, KM_USER0);
1058 +               goto memstore;
1059 +       }
1060 +
1061 +       if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
1062 +                       &rzs->table[index].page, &offset,
1063 +                       GFP_NOIO | __GFP_HIGHMEM)) {
1064 +               mutex_unlock(&rzs->lock);
1065 +               pr_info("Error allocating memory for compressed "
1066 +                       "page: %u, size=%zu\n", index, clen);
1067 +               stat64_inc(rzs, &rzs->stats.failed_writes);
1068 +               if (rzs->backing_swap)
1069 +                       fwd_write_request = 1;
1070 +               goto out;
1071 +       }
1072 +
1073 +memstore:
1074 +       rzs->table[index].offset = offset;
1075 +
1076 +       cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
1077 +                       rzs->table[index].offset;
1078 +
1079 +#if 0
1080 +       /* Back-reference needed for memory defragmentation */
1081 +       if (!rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)) {
1082 +               zheader = (struct zobj_header *)cmem;
1083 +               zheader->table_idx = index;
1084 +               cmem += sizeof(*zheader);
1085 +       }
1086 +#endif
1087 +
1088 +       memcpy(cmem, src, clen);
1089 +
1090 +       kunmap_atomic(cmem, KM_USER1);
1091 +       if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
1092 +               kunmap_atomic(src, KM_USER0);
1093 +
1094 +       /* Update stats */
1095 +       rzs->stats.compr_size += clen;
1096 +       stat_inc(&rzs->stats.pages_stored);
1097 +       if (clen <= PAGE_SIZE / 2)
1098 +               stat_inc(&rzs->stats.good_compress);
1099 +
1100 +       mutex_unlock(&rzs->lock);
1101 +
1102 +       set_bit(BIO_UPTODATE, &bio->bi_flags);
1103 +       bio_endio(bio, 0);
1104 +       return 0;
1105 +
1106 +out:
1107 +       if (fwd_write_request) {
1108 +               stat64_inc(rzs, &rzs->stats.bdev_num_writes);
1109 +               bio->bi_bdev = rzs->backing_swap;
1110 +#if 0
1111 +               /*
1112 +                * TODO: We currently have linear mapping of ramzswap and
1113 +                * backing swap sectors. This is not desired since we want
1114 +                * to optimize writes to backing swap to minimize disk seeks
1115 +                * or have effective wear leveling (for SSDs). Also, a
1116 +                * non-linear mapping is required to implement compressed
1117 +                * on-disk swapping.
1118 +                */
1119 +                bio->bi_sector = get_backing_swap_page()
1120 +                                       << SECTORS_PER_PAGE_SHIFT;
1121 +#endif
1122 +               /*
1123 +                * In case backing swap is a file, find the right offset within
1124 +                * the file corresponding to logical position 'index'. For block
1125 +                * device, this is a nop.
1126 +                */
1127 +               bio->bi_sector = map_backing_swap_page(rzs, index)
1128 +                                       << SECTORS_PER_PAGE_SHIFT;
1129 +               return 1;
1130 +       }
1131 +
1132 +       bio_io_error(bio);
1133 +       return 0;
1134 +}
1135 +
1136 +
1137 +/*
1138 + * Check if request is within bounds and page aligned.
1139 + */
1140 +static inline int valid_swap_request(struct ramzswap *rzs, struct bio *bio)
1141 +{
1142 +       if (unlikely(
1143 +               (bio->bi_sector >= (rzs->disksize >> SECTOR_SHIFT)) ||
1144 +               (bio->bi_sector & (SECTORS_PER_PAGE - 1)) ||
1145 +               (bio->bi_vcnt != 1) ||
1146 +               (bio->bi_size != PAGE_SIZE) ||
1147 +               (bio->bi_io_vec[0].bv_offset != 0))) {
1148 +
1149 +               return 0;
1150 +       }
1151 +
1152 +       /* swap request is valid */
1153 +       return 1;
1154 +}
1155 +
1156 +/*
1157 + * Handler function for all ramzswap I/O requests.
1158 + */
1159 +static int ramzswap_make_request(struct request_queue *queue, struct bio *bio)
1160 +{
1161 +       int ret = 0;
1162 +       struct ramzswap *rzs = queue->queuedata;
1163 +
1164 +       if (unlikely(!rzs->init_done)) {
1165 +               bio_io_error(bio);
1166 +               return 0;
1167 +       }
1168 +
1169 +       if (!valid_swap_request(rzs, bio)) {
1170 +               stat64_inc(rzs, &rzs->stats.invalid_io);
1171 +               bio_io_error(bio);
1172 +               return 0;
1173 +       }
1174 +
1175 +       switch (bio_data_dir(bio)) {
1176 +       case READ:
1177 +               ret = ramzswap_read(rzs, bio);
1178 +               break;
1179 +
1180 +       case WRITE:
1181 +               ret = ramzswap_write(rzs, bio);
1182 +               break;
1183 +       }
1184 +
1185 +       return ret;
1186 +}
1187 +
1188 +static void reset_device(struct ramzswap *rzs, struct block_device *bdev)
1189 +{
1190 +       int is_backing_blkdev = 0;
1191 +       size_t index, num_pages;
1192 +       unsigned entries_per_page;
1193 +       unsigned long num_table_pages, entry = 0;
1194 +
1195 +       if (bdev)
1196 +               fsync_bdev(bdev);
1197 +
1198 +       rzs->init_done = 0;
1199 +
1200 +       if (rzs->backing_swap && !rzs->num_extents)
1201 +               is_backing_blkdev = 1;
1202 +
1203 +       num_pages = rzs->disksize >> PAGE_SHIFT;
1204 +
1205 +       /* Free various per-device buffers */
1206 +       kfree(rzs->compress_workmem);
1207 +       free_pages((unsigned long)rzs->compress_buffer, 1);
1208 +
1209 +       rzs->compress_workmem = NULL;
1210 +       rzs->compress_buffer = NULL;
1211 +
1212 +       /* Free all pages that are still in this ramzswap device */
1213 +       for (index = 0; index < num_pages; index++) {
1214 +               struct page *page;
1215 +               u16 offset;
1216 +
1217 +               page = rzs->table[index].page;
1218 +               offset = rzs->table[index].offset;
1219 +
1220 +               if (!page)
1221 +                       continue;
1222 +
1223 +               if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
1224 +                       __free_page(page);
1225 +               else
1226 +                       xv_free(rzs->mem_pool, page, offset);
1227 +       }
1228 +
1229 +       entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
1230 +       num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
1231 +                                       PAGE_SIZE);
1232 +       /*
1233 +        * Set page->mapping to NULL for every table page.
1234 +        * Otherwise, we will hit bad_page() during free.
1235 +        */
1236 +       while (rzs->num_extents && num_table_pages--) {
1237 +               struct page *page;
1238 +               page = vmalloc_to_page(&rzs->table[entry]);
1239 +               page->mapping = NULL;
1240 +               entry += entries_per_page;
1241 +       }
1242 +       vfree(rzs->table);
1243 +       rzs->table = NULL;
1244 +
1245 +       xv_destroy_pool(rzs->mem_pool);
1246 +       rzs->mem_pool = NULL;
1247 +
1248 +       /* Free all swap extent pages */
1249 +       while (!list_empty(&rzs->backing_swap_extent_list)) {
1250 +               struct page *page;
1251 +               struct list_head *entry;
1252 +               entry = rzs->backing_swap_extent_list.next;
1253 +               page = list_entry(entry, struct page, lru);
1254 +               list_del(entry);
1255 +               __free_page(page);
1256 +       }
1257 +       INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
1258 +       rzs->num_extents = 0;
1259 +
1260 +       /* Close backing swap device, if present */
1261 +       if (rzs->backing_swap) {
1262 +               if (is_backing_blkdev)
1263 +                       bd_release(rzs->backing_swap);
1264 +               filp_close(rzs->swap_file, NULL);
1265 +               rzs->backing_swap = NULL;
1266 +               memset(rzs->backing_swap_name, 0, MAX_SWAP_NAME_LEN);
1267 +       }
1268 +
1269 +       /* Reset stats */
1270 +       memset(&rzs->stats, 0, sizeof(rzs->stats));
1271 +
1272 +       rzs->disksize = 0;
1273 +       rzs->memlimit = 0;
1274 +}
1275 +
1276 +static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
1277 +{
1278 +       int ret, dev_id;
1279 +       size_t num_pages;
1280 +       struct page *page;
1281 +       union swap_header *swap_header;
1282 +
1283 +       if (rzs->init_done) {
1284 +               pr_info("Device already initialized!\n");
1285 +               return -EBUSY;
1286 +       }
1287 +
1288 +       dev_id = rzs - devices;
1289 +
1290 +       ret = setup_backing_swap(rzs);
1291 +       if (ret)
1292 +               goto fail;
1293 +
1294 +       if (rzs->backing_swap)
1295 +               ramzswap_set_memlimit(rzs, totalram_pages << PAGE_SHIFT);
1296 +       else
1297 +               ramzswap_set_disksize(rzs, totalram_pages << PAGE_SHIFT);
1298 +
1299 +       rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
1300 +       if (!rzs->compress_workmem) {
1301 +               pr_err("Error allocating compressor working memory!\n");
1302 +               ret = -ENOMEM;
1303 +               goto fail;
1304 +       }
1305 +
1306 +       rzs->compress_buffer = (void *)__get_free_pages(__GFP_ZERO, 1);
1307 +       if (!rzs->compress_buffer) {
1308 +               pr_err("Error allocating compressor buffer space\n");
1309 +               ret = -ENOMEM;
1310 +               goto fail;
1311 +       }
1312 +
1313 +       num_pages = rzs->disksize >> PAGE_SHIFT;
1314 +       rzs->table = vmalloc(num_pages * sizeof(*rzs->table));
1315 +       if (!rzs->table) {
1316 +               pr_err("Error allocating ramzswap address table\n");
1317 +               /* To prevent accessing table entries during cleanup */
1318 +               rzs->disksize = 0;
1319 +               ret = -ENOMEM;
1320 +               goto fail;
1321 +       }
1322 +       memset(rzs->table, 0, num_pages * sizeof(*rzs->table));
1323 +
1324 +       map_backing_swap_extents(rzs);
1325 +
1326 +       page = alloc_page(__GFP_ZERO);
1327 +       if (!page) {
1328 +               pr_err("Error allocating swap header page\n");
1329 +               ret = -ENOMEM;
1330 +               goto fail;
1331 +       }
1332 +       rzs->table[0].page = page;
1333 +       rzs_set_flag(rzs, 0, RZS_UNCOMPRESSED);
1334 +
1335 +       swap_header = kmap(page);
1336 +       ret = setup_swap_header(rzs, swap_header);
1337 +       kunmap(page);
1338 +       if (ret) {
1339 +               pr_err("Error setting swap header\n");
1340 +               goto fail;
1341 +       }
1342 +
1343 +       set_capacity(rzs->disk, rzs->disksize >> SECTOR_SHIFT);
1344 +
1345 +       /*
1346 +        * We have identity mapping of sectors for ramzswap
1347 +        * and the backing swap device. So, this queue flag
1348 +        * should be according to backing dev.
1349 +        */
1350 +       if (!rzs->backing_swap ||
1351 +                       blk_queue_nonrot(rzs->backing_swap->bd_disk->queue))
1352 +               queue_flag_set_unlocked(QUEUE_FLAG_NONROT, rzs->disk->queue);
1353 +
1354 +       rzs->mem_pool = xv_create_pool();
1355 +       if (!rzs->mem_pool) {
1356 +               pr_err("Error creating memory pool\n");
1357 +               ret = -ENOMEM;
1358 +               goto fail;
1359 +       }
1360 +
1361 +       /*
1362 +        * Pages that compress to size greater than this are forwarded
1363 +        * to physical swap disk (if backing dev is provided)
1364 +        * TODO: make this configurable
1365 +        */
1366 +       if (rzs->backing_swap)
1367 +               max_zpage_size = max_zpage_size_bdev;
1368 +       else
1369 +               max_zpage_size = max_zpage_size_nobdev;
1370 +       pr_debug("Max compressed page size: %u bytes\n", max_zpage_size);
1371 +
1372 +       rzs->init_done = 1;
1373 +
1374 +       if (rzs->backing_swap) {
1375 +               pr_info("/dev/ramzswap%d initialized: "
1376 +                       "backing_swap=%s, memlimit_kb=%zu\n",
1377 +                       dev_id, rzs->backing_swap_name, rzs->memlimit >> 10);
1378 +       } else {
1379 +               pr_info("/dev/ramzswap%d initialized: "
1380 +                       "disksize_kb=%zu\n", dev_id, rzs->disksize >> 10);
1381 +       }
1382 +       return 0;
1383 +
1384 +fail:
1385 +       reset_device(rzs, NULL);
1386 +
1387 +       pr_err("Initialization failed: err=%d\n", ret);
1388 +       return ret;
1389 +}
1390 +
1391 +static int ramzswap_ioctl_reset_device(struct ramzswap *rzs,
1392 +                               struct block_device *bdev)
1393 +{
1394 +       if (rzs->init_done)
1395 +               reset_device(rzs, bdev);
1396 +
1397 +       return 0;
1398 +}
1399 +
1400 +static int ramzswap_ioctl(struct block_device *bdev, fmode_t mode,
1401 +                       unsigned int cmd, unsigned long arg)
1402 +{
1403 +       int ret = 0;
1404 +       size_t disksize_kb, memlimit_kb;
1405 +
1406 +       struct ramzswap *rzs = bdev->bd_disk->private_data;
1407 +
1408 +       switch (cmd) {
1409 +       case RZSIO_SET_DISKSIZE_KB:
1410 +               if (rzs->init_done) {
1411 +                       ret = -EBUSY;
1412 +                       goto out;
1413 +               }
1414 +               if (copy_from_user(&disksize_kb, (void *)arg,
1415 +                                               _IOC_SIZE(cmd))) {
1416 +                       ret = -EFAULT;
1417 +                       goto out;
1418 +               }
1419 +               rzs->disksize = disksize_kb << 10;
1420 +               pr_debug("Disk size set to %zu kB\n", disksize_kb);
1421 +               break;
1422 +
1423 +       case RZSIO_SET_MEMLIMIT_KB:
1424 +               if (rzs->init_done) {
1425 +                       /* TODO: allow changing memlimit */
1426 +                       ret = -EBUSY;
1427 +                       goto out;
1428 +               }
1429 +               if (copy_from_user(&memlimit_kb, (void *)arg,
1430 +                                               _IOC_SIZE(cmd))) {
1431 +                       ret = -EFAULT;
1432 +                       goto out;
1433 +               }
1434 +               rzs->memlimit = memlimit_kb << 10;
1435 +               pr_debug("Memory limit set to %zu kB\n", memlimit_kb);
1436 +               break;
1437 +
1438 +       case RZSIO_SET_BACKING_SWAP:
1439 +               if (rzs->init_done) {
1440 +                       ret = -EBUSY;
1441 +                       goto out;
1442 +               }
1443 +
1444 +               if (copy_from_user(&rzs->backing_swap_name, (void *)arg,
1445 +                                               _IOC_SIZE(cmd))) {
1446 +                       ret = -EFAULT;
1447 +                       goto out;
1448 +               }
1449 +               rzs->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
1450 +               pr_debug("Backing swap set to %s\n", rzs->backing_swap_name);
1451 +               break;
1452 +
1453 +       case RZSIO_GET_STATS:
1454 +       {
1455 +               struct ramzswap_ioctl_stats *stats;
1456 +               if (!rzs->init_done) {
1457 +                       ret = -ENOTTY;
1458 +                       goto out;
1459 +               }
1460 +               stats = kzalloc(sizeof(*stats), GFP_KERNEL);
1461 +               if (!stats) {
1462 +                       ret = -ENOMEM;
1463 +                       goto out;
1464 +               }
1465 +               ramzswap_ioctl_get_stats(rzs, stats);
1466 +               if (copy_to_user((void *)arg, stats, sizeof(*stats))) {
1467 +                       kfree(stats);
1468 +                       ret = -EFAULT;
1469 +                       goto out;
1470 +               }
1471 +               kfree(stats);
1472 +               break;
1473 +       }
1474 +       case RZSIO_INIT:
1475 +               ret = ramzswap_ioctl_init_device(rzs);
1476 +               break;
1477 +
1478 +       case RZSIO_RESET:
1479 +               /* Do not reset an active device! */
1480 +               if (bdev->bd_holders) {
1481 +                       ret = -EBUSY;
1482 +                       goto out;
1483 +               }
1484 +               ret = ramzswap_ioctl_reset_device(rzs, bdev);
1485 +               break;
1486 +
1487 +       default:
1488 +               pr_info("Invalid ioctl %u\n", cmd);
1489 +               ret = -ENOTTY;
1490 +       }
1491 +
1492 +out:
1493 +       return ret;
1494 +}
1495 +
1496 +#if defined(CONFIG_SWAP_FREE_NOTIFY)
1497 +
1498 +void ramzswap_slot_free_notify(struct block_device *bdev, sector_t bi_sector)
1499 +{
1500 +       struct ramzswap *rzs = bdev->bd_disk->private_data;
1501 +       ramzswap_free_page(rzs, bi_sector >> SECTORS_PER_PAGE_SHIFT);
1502 +       stat64_inc(rzs, &rzs->stats.notify_free);
1503 +}
1504 +
1505 +#endif
1506 +
1507 +static struct block_device_operations ramzswap_devops = {
1508 +       .ioctl = ramzswap_ioctl,
1509 +#if defined(CONFIG_SWAP_FREE_NOTIFY)
1510 +       .swap_slot_free_notify = ramzswap_slot_free_notify,
1511 +#endif
1512 +       .owner = THIS_MODULE
1513 +};
1514 +
1515 +static int create_device(struct ramzswap *rzs, int device_id)
1516 +{
1517 +       int ret = 0;
1518 +
1519 +       mutex_init(&rzs->lock);
1520 +       spin_lock_init(&rzs->stat64_lock);
1521 +       INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
1522 +
1523 +       rzs->queue = blk_alloc_queue(GFP_KERNEL);
1524 +       if (!rzs->queue) {
1525 +               pr_err("Error allocating disk queue for device %d\n",
1526 +                       device_id);
1527 +               ret = -ENOMEM;
1528 +               goto out;
1529 +       }
1530 +
1531 +       blk_queue_make_request(rzs->queue, ramzswap_make_request);
1532 +       rzs->queue->queuedata = rzs;
1533 +
1534 +       /* gendisk structure */
1535 +       rzs->disk = alloc_disk(1);
1536 +       if (!rzs->disk) {
1537 +               blk_cleanup_queue(rzs->queue);
1538 +               pr_warning("Error allocating disk structure for device %d\n",
1539 +                       device_id);
1540 +               ret = -ENOMEM;
1541 +               goto out;
1542 +       }
1543 +
1544 +       rzs->disk->major = ramzswap_major;
1545 +       rzs->disk->first_minor = device_id;
1546 +       rzs->disk->fops = &ramzswap_devops;
1547 +       rzs->disk->queue = rzs->queue;
1548 +       rzs->disk->private_data = rzs;
1549 +       snprintf(rzs->disk->disk_name, 16, "ramzswap%d", device_id);
1550 +       /*
1551 +        * Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl
1552 +        * or set equal to backing swap device (if provided)
1553 +        */
1554 +       set_capacity(rzs->disk, 0);
1555 +
1556 +       blk_queue_physical_block_size(rzs->disk->queue, PAGE_SIZE);
1557 +       blk_queue_logical_block_size(rzs->disk->queue, PAGE_SIZE);
1558 +
1559 +       add_disk(rzs->disk);
1560 +       rzs->disk->flags &= ~GENHD_FL_REMAP_SWAPPED_PAGES;
1561 +
1562 +       rzs->init_done = 0;
1563 +
1564 +out:
1565 +       return ret;
1566 +}
1567 +
1568 +static void destroy_device(struct ramzswap *rzs)
1569 +{
1570 +       if (rzs->disk) {
1571 +               del_gendisk(rzs->disk);
1572 +               put_disk(rzs->disk);
1573 +       }
1574 +
1575 +       if (rzs->queue)
1576 +               blk_cleanup_queue(rzs->queue);
1577 +}
1578 +
1579 +static int __init ramzswap_init(void)
1580 +{
1581 +       int ret, dev_id;
1582 +       struct ramzswap *rzs;
1583 +
1584 +       if (num_devices > max_num_devices) {
1585 +               pr_warning("Invalid value for num_devices: %u\n",
1586 +                               num_devices);
1587 +               ret = -EINVAL;
1588 +               goto out;
1589 +       }
1590 +
1591 +       ramzswap_major = register_blkdev(0, "ramzswap");
1592 +       if (ramzswap_major <= 0) {
1593 +               pr_warning("Unable to get major number\n");
1594 +               ret = -EBUSY;
1595 +               goto out;
1596 +       }
1597 +
1598 +       if (!num_devices) {
1599 +               pr_info("num_devices not specified. Using default: 1\n");
1600 +               num_devices = 1;
1601 +       }
1602 +
1603 +       /* Allocate the device array and initialize each one */
1604 +       pr_debug("Creating %u devices ...\n", num_devices);
1605 +       devices = kzalloc(num_devices * sizeof(struct ramzswap), GFP_KERNEL);
1606 +       if (!devices) {
1607 +               ret = -ENOMEM;
1608 +               goto unregister;
1609 +       }
1610 +
1611 +       for (dev_id = 0; dev_id < num_devices; dev_id++) {
1612 +               if (create_device(&devices[dev_id], dev_id)) {
1613 +                       ret = -ENOMEM;
1614 +                       goto free_devices;
1615 +               }
1616 +       }
1617 +
1618 +       /*
1619 +        * Initialize the first device (/dev/ramzswap0)
1620 +        * if parameters are provided
1621 +        */
1622 +       rzs = &devices[0];
1623 +
1624 +       /*
1625 +        * User specifies either <disksize_kb> or <backing_swap, memlimit_kb>
1626 +        */
1627 +       if (disksize_kb) {
1628 +               rzs->disksize = disksize_kb << 10;
1629 +               ret = ramzswap_ioctl_init_device(rzs);
1630 +               if (ret)
1631 +                       goto free_devices;
1632 +               goto out;
1633 +       }
1634 +
1635 +       if (backing_swap[0]) {
1636 +               rzs->memlimit = memlimit_kb << 10;
1637 +               strncpy(rzs->backing_swap_name, backing_swap,
1638 +                       MAX_SWAP_NAME_LEN);
1639 +               rzs->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
1640 +               ret = ramzswap_ioctl_init_device(rzs);
1641 +               if (ret)
1642 +                       goto free_devices;
1643 +               goto out;
1644 +       }
1645 +
1646 +       /* User specified memlimit_kb but not backing_swap */
1647 +       if (memlimit_kb) {
1648 +               pr_info("memlimit_kb parameter is valid only when "
1649 +                       "backing_swap is also specified. Aborting.\n");
1650 +               ret = -EINVAL;
1651 +               goto free_devices;
1652 +       }
1653 +
1654 +       return 0;
1655 +
1656 +free_devices:
1657 +       while (dev_id)
1658 +               destroy_device(&devices[--dev_id]);
1659 +unregister:
1660 +       unregister_blkdev(ramzswap_major, "ramzswap");
1661 +out:
1662 +       return ret;
1663 +}
1664 +
1665 +static void __exit ramzswap_exit(void)
1666 +{
1667 +       int i;
1668 +       struct ramzswap *rzs;
1669 +
1670 +       for (i = 0; i < num_devices; i++) {
1671 +               rzs = &devices[i];
1672 +
1673 +               destroy_device(rzs);
1674 +               if (rzs->init_done)
1675 +                       reset_device(rzs, NULL);
1676 +       }
1677 +
1678 +       unregister_blkdev(ramzswap_major, "ramzswap");
1679 +
1680 +       kfree(devices);
1681 +       pr_debug("Cleanup done!\n");
1682 +}
1683 +
1684 +/*
1685 + * Module parameters
1686 + */
1687 +
1688 +/* Optional: default = 1 */
1689 +module_param(num_devices, uint, 0);
1690 +MODULE_PARM_DESC(num_devices, "Number of ramzswap devices");
1691 +
1692 +/*
1693 + * User specifies either <disksize_kb> or <backing_swap, memlimit_kb>
1694 + * parameters. You must specify these parameters if the first device
1695 + * has to be initialized on module load without using the rzscontrol
1696 + * utility. This is useful for embedded systems where shipping an
1697 + * additional binary (rzscontrol) might not be desirable.
1698 + *
1699 + * These parameters are used to initialize just the first (/dev/ramzswap0)
1700 + * device. To initialize additional devices, use the rzscontrol utility.
1701 + * If these parameters are not provided, then the first device is also
1702 + * left in an uninitialized state.
1703 + */
1704 +
1705 +/* Optional: default = 25% of RAM */
1706 +module_param(disksize_kb, ulong, 0);
1707 +MODULE_PARM_DESC(disksize_kb, "Disksize in KB");
1708 +
1709 +/* Optional: default = 15% of RAM */
1710 +module_param(memlimit_kb, ulong, 0);
1711 +MODULE_PARM_DESC(memlimit_kb, "Memlimit in KB");
1712 +
1713 +/* Optional: default = <NULL> */
1714 +module_param_string(backing_swap, backing_swap, sizeof(backing_swap), 0);
1715 +MODULE_PARM_DESC(backing_swap, "Backing swap name");
1716 +
1717 +module_init(ramzswap_init);
1718 +module_exit(ramzswap_exit);
1719 +
1720 +MODULE_LICENSE("Dual BSD/GPL");
1721 +MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
1722 +MODULE_DESCRIPTION("Compressed RAM Based Swap Device");
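
As an illustration of the ioctl protocol handled above (a minimal userspace sketch, not part of the patch: it assumes the /dev/ramzswap0 node exists and that ramzswap_ioctl.h has been made buildable from userspace, e.g. with its u32/u64 fields mapped to <stdint.h> types), setting a disk size and initializing the first device follows the same sequence the rzscontrol utility would use:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "ramzswap_ioctl.h"     /* assumed adapted for userspace builds */

    int main(void)
    {
            size_t disksize_kb = 32 * 1024;         /* 32 MB compressed swap disk */
            int fd = open("/dev/ramzswap0", O_RDWR);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* The size must be set before RZSIO_INIT; the driver rejects
             * changes on an already-initialized device. */
            if (ioctl(fd, RZSIO_SET_DISKSIZE_KB, &disksize_kb) < 0 ||
                ioctl(fd, RZSIO_INIT) < 0) {
                    perror("ioctl");
                    close(fd);
                    return 1;
            }
            close(fd);
            return 0;
    }

Once RZSIO_INIT succeeds, the device can be enabled as swap with the usual mkswap and swapon sequence.
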
1723 Index: kernel-power-2.6.28/drivers/block/ramzswap/ramzswap_drv.h
1724 ===================================================================
1725 --- /dev/null
1726 +++ kernel-power-2.6.28/drivers/block/ramzswap/ramzswap_drv.h
1727 @@ -0,0 +1,210 @@
1728 +/*
1729 + * Compressed RAM based swap device
1730 + *
1731 + * Copyright (C) 2008, 2009, 2010  Nitin Gupta
1732 + *
1733 + * This code is released using a dual license strategy: BSD/GPL
1734 + * You can choose the licence that better fits your requirements.
1735 + *
1736 + * Released under the terms of 3-clause BSD License
1737 + * Released under the terms of GNU General Public License Version 2.0
1738 + *
1739 + * Project home: http://compcache.googlecode.com
1740 + */
1741 +
1742 +#ifndef _RAMZSWAP_DRV_H_
1743 +#define _RAMZSWAP_DRV_H_
1744 +
1745 +#include <linux/spinlock.h>
1746 +#include <linux/mutex.h>
1747 +
1748 +#include "ramzswap_ioctl.h"
1749 +#include "xvmalloc.h"
1750 +
1751 +/*
1752 + * Some arbitrary value. This is just to catch
1753 + * an invalid value for the num_devices module parameter.
1754 + */
1755 +static const unsigned max_num_devices = 32;
1756 +
1757 +/*
1758 + * Stored at beginning of each compressed object.
1759 + *
1760 + * It stores a back-reference to the table entry which points to this
1761 + * object. This is required to support memory defragmentation or
1762 + * migrating compressed pages to backing swap disk.
1763 + */
1764 +struct zobj_header {
1765 +#if 0
1766 +       u32 table_idx;
1767 +#endif
1768 +};
1769 +
1770 +/*-- Configurable parameters */
1771 +
1772 +/* Default ramzswap disk size: 25% of total RAM */
1773 +static const unsigned default_disksize_perc_ram = 25;
1774 +static const unsigned default_memlimit_perc_ram = 15;
1775 +
1776 +/*
1777 + * Max compressed page size when backing device is provided.
1778 + * Pages that compress to size greater than this are sent to
1779 + * physical swap disk.
1780 + */
1781 +static const unsigned max_zpage_size_bdev = PAGE_SIZE / 2;
1782 +
1783 +/*
1784 + * Max compressed page size when there is no backing dev.
1785 + * Pages that compress to size greater than this are stored
1786 + * uncompressed in memory.
1787 + */
1788 +static const unsigned max_zpage_size_nobdev = PAGE_SIZE / 4 * 3;
1789 +
1790 +/*
1791 + * NOTE: max_zpage_size_{bdev,nobdev} sizes must be
1792 + * less than or equal to:
1793 + *   XV_MAX_ALLOC_SIZE - sizeof(struct zobj_header)
1794 + * since otherwise xv_malloc would always return failure.
1795 + */
1796 +
1797 +/*-- End of configurable params */
1798 +
1799 +#define SECTOR_SHIFT           9
1800 +#define SECTOR_SIZE            (1 << SECTOR_SHIFT)
1801 +#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
1802 +#define SECTORS_PER_PAGE       (1 << SECTORS_PER_PAGE_SHIFT)
1803 +
1804 +/* Flags for ramzswap pages (table[page_no].flags) */
1805 +enum rzs_pageflags {
1806 +       /* Page is stored uncompressed */
1807 +       RZS_UNCOMPRESSED,
1808 +
1809 +       /* Page consists entirely of zeros */
1810 +       RZS_ZERO,
1811 +
1812 +       __NR_RZS_PAGEFLAGS,
1813 +};
1814 +
1815 +/*-- Data structures */
1816 +
1817 +/*
1818 + * Allocated for each swap slot, indexed by page no.
1819 + * These table entries must fit exactly in a page.
1820 + */
1821 +struct table {
1822 +       struct page *page;
1823 +       u16 offset;
1824 +       u8 count;       /* object ref count (not yet used) */
1825 +       u8 flags;
1826 +} __attribute__((aligned(4)));
1827 +
1828 +/*
1829 + * Swap extent information in case backing swap is a regular
1830 + * file. These extent entries must fit exactly in a page.
1831 + */
1832 +struct ramzswap_backing_extent {
1833 +       pgoff_t phy_pagenum;
1834 +       pgoff_t num_pages;
1835 +} __attribute__((aligned(4)));
1836 +
1837 +struct ramzswap_stats {
1838 +       /* basic stats */
1839 +       size_t compr_size;      /* compressed size of pages stored -
1840 +                                * needed to enforce memlimit */
1841 +       /* more stats */
1842 +#if defined(CONFIG_RAMZSWAP_STATS)
1843 +       u64 num_reads;          /* failed + successful */
1844 +       u64 num_writes;         /* --do-- */
1845 +       u64 failed_reads;       /* should NEVER! happen */
1846 +       u64 failed_writes;      /* can happen when memory is too low */
1847 +       u64 invalid_io;         /* non-swap I/O requests */
1848 +       u64 notify_free;        /* no. of swap slot free notifications */
1849 +       u32 pages_zero;         /* no. of zero filled pages */
1850 +       u32 pages_stored;       /* no. of pages currently stored */
1851 +       u32 good_compress;      /* no. of pages with compression ratio<=50% */
1852 +       u32 pages_expand;       /* no. of incompressible pages */
1853 +       u64 bdev_num_reads;     /* no. of reads on backing dev */
1854 +       u64 bdev_num_writes;    /* no. of writes on backing dev */
1855 +#endif
1856 +};
1857 +
1858 +struct ramzswap {
1859 +       struct xv_pool *mem_pool;
1860 +       void *compress_workmem;
1861 +       void *compress_buffer;
1862 +       struct table *table;
1863 +       spinlock_t stat64_lock; /* protect 64-bit stats */
1864 +       struct mutex lock;
1865 +       struct request_queue *queue;
1866 +       struct gendisk *disk;
1867 +       int init_done;
1868 +       /*
1869 +        * This is the limit on compressed data size (stats.compr_size).
1870 +        * It applies only when a backing swap device is present.
1871 +        */
1872 +       size_t memlimit;        /* bytes */
1873 +       /*
1874 +        * This is the limit on the amount of *uncompressed* data we can
1875 +        * hold. When a backing swap device is provided, it is set equal
1876 +        * to the device size.
1877 +        */
1878 +       size_t disksize;        /* bytes */
1879 +
1880 +       struct ramzswap_stats stats;
1881 +
1882 +       /* backing swap device info */
1883 +       struct ramzswap_backing_extent *curr_extent;
1884 +       struct list_head backing_swap_extent_list;
1885 +       unsigned long num_extents;
1886 +       char backing_swap_name[MAX_SWAP_NAME_LEN];
1887 +       struct block_device *backing_swap;
1888 +       struct file *swap_file;
1889 +};
1890 +
1891 +/*-- */
1892 +
1893 +/* Debugging and Stats */
1894 +#if defined(CONFIG_RAMZSWAP_STATS)
1895 +static void stat_inc(u32 *v)
1896 +{
1897 +       *v = *v + 1;
1898 +}
1899 +
1900 +static void stat_dec(u32 *v)
1901 +{
1902 +       *v = *v - 1;
1903 +}
1904 +
1905 +static void stat64_inc(struct ramzswap *rzs, u64 *v)
1906 +{
1907 +       spin_lock(&rzs->stat64_lock);
1908 +       *v = *v + 1;
1909 +       spin_unlock(&rzs->stat64_lock);
1910 +}
1911 +
1912 +static void stat64_dec(struct ramzswap *rzs, u64 *v)
1913 +{
1914 +       spin_lock(&rzs->stat64_lock);
1915 +       *v = *v - 1;
1916 +       spin_unlock(&rzs->stat64_lock);
1917 +}
1918 +
1919 +static u64 stat64_read(struct ramzswap *rzs, u64 *v)
1920 +{
1921 +       u64 val;
1922 +
1923 +       spin_lock(&rzs->stat64_lock);
1924 +       val = *v;
1925 +       spin_unlock(&rzs->stat64_lock);
1926 +
1927 +       return val;
1928 +}
1929 +#else
1930 +#define stat_inc(v)
1931 +#define stat_dec(v)
1932 +#define stat64_inc(r, v)
1933 +#define stat64_dec(r, v)
1934 +#define stat64_read(r, v)
1935 +#endif /* CONFIG_RAMZSWAP_STATS */
1936 +
1937 +#endif
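
A quick worked example of the sector constants above: with 4 KB pages, PAGE_SHIFT is 12, so SECTORS_PER_PAGE_SHIFT = 12 - 9 = 3 and SECTORS_PER_PAGE = 8. The driver indexes table[] by page number, so an I/O starting at sector 2048 refers to entry 2048 >> 3 = 256; this is exactly the conversion ramzswap_slot_free_notify() applies to bi_sector before calling ramzswap_free_page().
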
1938 Index: kernel-power-2.6.28/drivers/block/ramzswap/ramzswap_ioctl.h
1939 ===================================================================
1940 --- /dev/null
1941 +++ kernel-power-2.6.28/drivers/block/ramzswap/ramzswap_ioctl.h
1942 @@ -0,0 +1,50 @@
1943 +/*
1944 + * Compressed RAM based swap device
1945 + *
1946 + * Copyright (C) 2008, 2009, 2010  Nitin Gupta
1947 + *
1948 + * This code is released using a dual license strategy: BSD/GPL
1949 + * You can choose the licence that better fits your requirements.
1950 + *
1951 + * Released under the terms of 3-clause BSD License
1952 + * Released under the terms of GNU General Public License Version 2.0
1953 + *
1954 + * Project home: http://compcache.googlecode.com
1955 + */
1956 +
1957 +#ifndef _RAMZSWAP_IOCTL_H_
1958 +#define _RAMZSWAP_IOCTL_H_
1959 +
1960 +#define MAX_SWAP_NAME_LEN 128
1961 +
1962 +struct ramzswap_ioctl_stats {
1963 +       char backing_swap_name[MAX_SWAP_NAME_LEN];
1964 +       u64 memlimit;           /* only applicable if backing swap present */
1965 +       u64 disksize;           /* user specified or equal to backing swap
1966 +                                * size (if present) */
1967 +       u64 num_reads;          /* failed + successful */
1968 +       u64 num_writes;         /* --do-- */
1969 +       u64 failed_reads;       /* should NEVER! happen */
1970 +       u64 failed_writes;      /* can happen when memory is too low */
1971 +       u64 invalid_io;         /* non-swap I/O requests */
1972 +       u64 notify_free;        /* no. of swap slot free notifications */
1973 +       u32 pages_zero;         /* no. of zero filled pages */
1974 +       u32 good_compress_pct;  /* % of pages with compression ratio<=50% */
1975 +       u32 pages_expand_pct;   /* % of incompressible pages */
1976 +       u32 pages_stored;
1977 +       u32 pages_used;
1978 +       u64 orig_data_size;
1979 +       u64 compr_data_size;
1980 +       u64 mem_used_total;
1981 +       u64 bdev_num_reads;     /* no. of reads on backing dev */
1982 +       u64 bdev_num_writes;    /* no. of writes on backing dev */
1983 +} __attribute__ ((packed, aligned(4)));
1984 +
1985 +#define RZSIO_SET_DISKSIZE_KB  _IOW('z', 0, size_t)
1986 +#define RZSIO_SET_MEMLIMIT_KB  _IOW('z', 1, size_t)
1987 +#define RZSIO_SET_BACKING_SWAP _IOW('z', 2, unsigned char[MAX_SWAP_NAME_LEN])
1988 +#define RZSIO_GET_STATS                _IOR('z', 3, struct ramzswap_ioctl_stats)
1989 +#define RZSIO_INIT             _IO('z', 4)
1990 +#define RZSIO_RESET            _IO('z', 5)
1991 +
1992 +#endif
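
To show how the stats side of this interface is consumed (again an illustrative userspace sketch, not part of the patch, with the same header assumption as the earlier example), RZSIO_GET_STATS copies one struct ramzswap_ioctl_stats back to the caller and fails with ENOTTY until the device has been initialized:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "ramzswap_ioctl.h"     /* assumed adapted for userspace builds */

    static int print_rzs_stats(const char *dev)
    {
            struct ramzswap_ioctl_stats s;
            int fd = open(dev, O_RDONLY);

            if (fd < 0)
                    return -1;
            if (ioctl(fd, RZSIO_GET_STATS, &s) < 0) {   /* ENOTTY before RZSIO_INIT */
                    close(fd);
                    return -1;
            }
            printf("orig_data_size:  %llu\n", (unsigned long long)s.orig_data_size);
            printf("compr_data_size: %llu\n", (unsigned long long)s.compr_data_size);
            printf("mem_used_total:  %llu\n", (unsigned long long)s.mem_used_total);
            close(fd);
            return 0;
    }
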
1993 Index: kernel-power-2.6.28/drivers/block/ramzswap/xvmalloc.c
1994 ===================================================================
1995 --- /dev/null
1996 +++ kernel-power-2.6.28/drivers/block/ramzswap/xvmalloc.c
1997 @@ -0,0 +1,507 @@
1998 +/*
1999 + * xvmalloc memory allocator
2000 + *
2001 + * Copyright (C) 2008, 2009, 2010  Nitin Gupta
2002 + *
2003 + * This code is released using a dual license strategy: BSD/GPL
2004 + * You can choose the licence that better fits your requirements.
2005 + *
2006 + * Released under the terms of 3-clause BSD License
2007 + * Released under the terms of GNU General Public License Version 2.0
2008 + */
2009 +
2010 +#include <linux/bitops.h>
2011 +#include <linux/errno.h>
2012 +#include <linux/highmem.h>
2013 +#include <linux/init.h>
2014 +#include <linux/string.h>
2015 +#include <linux/slab.h>
2016 +
2017 +#include "xvmalloc.h"
2018 +#include "xvmalloc_int.h"
2019 +
2020 +static void stat_inc(u64 *value)
2021 +{
2022 +       *value = *value + 1;
2023 +}
2024 +
2025 +static void stat_dec(u64 *value)
2026 +{
2027 +       *value = *value - 1;
2028 +}
2029 +
2030 +static int test_flag(struct block_header *block, enum blockflags flag)
2031 +{
2032 +       return block->prev & BIT(flag);
2033 +}
2034 +
2035 +static void set_flag(struct block_header *block, enum blockflags flag)
2036 +{
2037 +       block->prev |= BIT(flag);
2038 +}
2039 +
2040 +static void clear_flag(struct block_header *block, enum blockflags flag)
2041 +{
2042 +       block->prev &= ~BIT(flag);
2043 +}
2044 +
2045 +/*
2046 + * Given <page, offset> pair, provide a dereferenceable pointer.
2047 + * This is called from xv_malloc/xv_free path, so it
2048 + * needs to be fast.
2049 + */
2050 +static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
2051 +{
2052 +       unsigned char *base;
2053 +
2054 +       base = kmap_atomic(page, type);
2055 +       return base + offset;
2056 +}
2057 +
2058 +static void put_ptr_atomic(void *ptr, enum km_type type)
2059 +{
2060 +       kunmap_atomic(ptr, type);
2061 +}
2062 +
2063 +static u32 get_blockprev(struct block_header *block)
2064 +{
2065 +       return block->prev & PREV_MASK;
2066 +}
2067 +
2068 +static void set_blockprev(struct block_header *block, u16 new_offset)
2069 +{
2070 +       block->prev = new_offset | (block->prev & FLAGS_MASK);
2071 +}
2072 +
2073 +static struct block_header *BLOCK_NEXT(struct block_header *block)
2074 +{
2075 +       return (struct block_header *)
2076 +               ((char *)block + block->size + XV_ALIGN);
2077 +}
2078 +
2079 +/*
2080 + * Get index of free list containing blocks of maximum size
2081 + * which is less than or equal to given size.
2082 + */
2083 +static u32 get_index_for_insert(u32 size)
2084 +{
2085 +       if (unlikely(size > XV_MAX_ALLOC_SIZE))
2086 +               size = XV_MAX_ALLOC_SIZE;
2087 +       size &= ~FL_DELTA_MASK;
2088 +       return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
2089 +}
2090 +
2091 +/*
2092 + * Get index of free list having blocks of size greater than
2093 + * or equal to requested size.
2094 + */
2095 +static u32 get_index(u32 size)
2096 +{
2097 +       if (unlikely(size < XV_MIN_ALLOC_SIZE))
2098 +               size = XV_MIN_ALLOC_SIZE;
2099 +       size = ALIGN(size, FL_DELTA);
2100 +       return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
2101 +}
2102 +
2103 +/**
2104 + * find_block - find block of at least given size
2105 + * @pool: memory pool to search from
2106 + * @size: size of block required
2107 + * @page: page containing required block
2108 + * @offset: offset within the page where block is located.
2109 + *
2110 + * Searches two level bitmap to locate block of at least
2111 + * the given size. If such a block is found, it provides
2112 + * <page, offset> to identify this block and returns index
2113 + * in freelist where we found this block.
2114 + * Otherwise, returns 0 and <page, offset> params are not touched.
2115 + */
2116 +static u32 find_block(struct xv_pool *pool, u32 size,
2117 +                       struct page **page, u32 *offset)
2118 +{
2119 +       ulong flbitmap, slbitmap;
2120 +       u32 flindex, slindex, slbitstart;
2121 +
2122 +       /* There are no free blocks in this pool */
2123 +       if (!pool->flbitmap)
2124 +               return 0;
2125 +
2126 +       /* Get freelist index corresponding to this size */
2127 +       slindex = get_index(size);
2128 +       slbitmap = pool->slbitmap[slindex / BITS_PER_LONG];
2129 +       slbitstart = slindex % BITS_PER_LONG;
2130 +
2131 +       /*
2132 +        * If freelist is not empty at this index, we found the
2133 +        * block - head of this list. This is approximate best-fit match.
2134 +        */
2135 +       if (test_bit(slbitstart, &slbitmap)) {
2136 +               *page = pool->freelist[slindex].page;
2137 +               *offset = pool->freelist[slindex].offset;
2138 +               return slindex;
2139 +       }
2140 +
2141 +       /*
2142 +        * No best-fit found. Search a bit further in bitmap for a free block.
2143 +        * Second level bitmap consists of word-sized (BITS_PER_LONG) chunks.
2144 +        * Search further in the chunk where we expected a best-fit, starting
2145 +        * from the index location found above.
2146 +        */
2147 +       slbitstart++;
2148 +       slbitmap >>= slbitstart;
2149 +
2150 +       /* Skip this search if we were already at end of this bitmap chunk */
2151 +       if ((slbitstart != BITS_PER_LONG) && slbitmap) {
2152 +               slindex += __ffs(slbitmap) + 1;
2153 +               *page = pool->freelist[slindex].page;
2154 +               *offset = pool->freelist[slindex].offset;
2155 +               return slindex;
2156 +       }
2157 +
2158 +       /* Now do a full two-level bitmap search to find next nearest fit */
2159 +       flindex = slindex / BITS_PER_LONG;
2160 +
2161 +       flbitmap = (pool->flbitmap) >> (flindex + 1);
2162 +       if (!flbitmap)
2163 +               return 0;
2164 +
2165 +       flindex += __ffs(flbitmap) + 1;
2166 +       slbitmap = pool->slbitmap[flindex];
2167 +       slindex = (flindex * BITS_PER_LONG) + __ffs(slbitmap);
2168 +       *page = pool->freelist[slindex].page;
2169 +       *offset = pool->freelist[slindex].offset;
2170 +
2171 +       return slindex;
2172 +}
2173 +
2174 +/*
2175 + * Insert block at <page, offset> in freelist of given pool.
2176 + * freelist used depends on block size.
2177 + */
2178 +static void insert_block(struct xv_pool *pool, struct page *page, u32 offset,
2179 +                       struct block_header *block)
2180 +{
2181 +       u32 flindex, slindex;
2182 +       struct block_header *nextblock;
2183 +
2184 +       slindex = get_index_for_insert(block->size);
2185 +       flindex = slindex / BITS_PER_LONG;
2186 +
2187 +       block->link.prev_page = 0;
2188 +       block->link.prev_offset = 0;
2189 +       block->link.next_page = pool->freelist[slindex].page;
2190 +       block->link.next_offset = pool->freelist[slindex].offset;
2191 +       pool->freelist[slindex].page = page;
2192 +       pool->freelist[slindex].offset = offset;
2193 +
2194 +       if (block->link.next_page) {
2195 +               nextblock = get_ptr_atomic(block->link.next_page,
2196 +                                       block->link.next_offset, KM_USER1);
2197 +               nextblock->link.prev_page = page;
2198 +               nextblock->link.prev_offset = offset;
2199 +               put_ptr_atomic(nextblock, KM_USER1);
2200 +       }
2201 +
2202 +       __set_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
2203 +       __set_bit(flindex, &pool->flbitmap);
2204 +}
2205 +
2206 +/*
2207 + * Remove block from head of freelist. Index 'slindex' identifies the freelist.
2208 + */
2209 +static void remove_block_head(struct xv_pool *pool,
2210 +                       struct block_header *block, u32 slindex)
2211 +{
2212 +       struct block_header *tmpblock;
2213 +       u32 flindex = slindex / BITS_PER_LONG;
2214 +
2215 +       pool->freelist[slindex].page = block->link.next_page;
2216 +       pool->freelist[slindex].offset = block->link.next_offset;
2217 +       block->link.prev_page = 0;
2218 +       block->link.prev_offset = 0;
2219 +
2220 +       if (!pool->freelist[slindex].page) {
2221 +               __clear_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
2222 +               if (!pool->slbitmap[flindex])
2223 +                       __clear_bit(flindex, &pool->flbitmap);
2224 +       } else {
2225 +               /*
2226 +                * DEBUG ONLY: We need not reinitialize freelist head previous
2227 +                * pointer to 0 - we never depend on its value. But just for
2228 +                * sanity, let's do it.
2229 +                */
2230 +               tmpblock = get_ptr_atomic(pool->freelist[slindex].page,
2231 +                               pool->freelist[slindex].offset, KM_USER1);
2232 +               tmpblock->link.prev_page = 0;
2233 +               tmpblock->link.prev_offset = 0;
2234 +               put_ptr_atomic(tmpblock, KM_USER1);
2235 +       }
2236 +}
2237 +
2238 +/*
2239 + * Remove block from freelist. Index 'slindex' identifies the freelist.
2240 + */
2241 +static void remove_block(struct xv_pool *pool, struct page *page, u32 offset,
2242 +                       struct block_header *block, u32 slindex)
2243 +{
2244 +       u32 flindex;
2245 +       struct block_header *tmpblock;
2246 +
2247 +       if (pool->freelist[slindex].page == page
2248 +          && pool->freelist[slindex].offset == offset) {
2249 +               remove_block_head(pool, block, slindex);
2250 +               return;
2251 +       }
2252 +
2253 +       flindex = slindex / BITS_PER_LONG;
2254 +
2255 +       if (block->link.prev_page) {
2256 +               tmpblock = get_ptr_atomic(block->link.prev_page,
2257 +                               block->link.prev_offset, KM_USER1);
2258 +               tmpblock->link.next_page = block->link.next_page;
2259 +               tmpblock->link.next_offset = block->link.next_offset;
2260 +               put_ptr_atomic(tmpblock, KM_USER1);
2261 +       }
2262 +
2263 +       if (block->link.next_page) {
2264 +               tmpblock = get_ptr_atomic(block->link.next_page,
2265 +                               block->link.next_offset, KM_USER1);
2266 +               tmpblock->link.prev_page = block->link.prev_page;
2267 +               tmpblock->link.prev_offset = block->link.prev_offset;
2268 +               put_ptr_atomic(tmpblock, KM_USER1);
2269 +       }
2270 +}
2271 +
2272 +/*
2273 + * Allocate a page and add it to freelist of given pool.
2274 + */
2275 +static int grow_pool(struct xv_pool *pool, gfp_t flags)
2276 +{
2277 +       struct page *page;
2278 +       struct block_header *block;
2279 +
2280 +       page = alloc_page(flags);
2281 +       if (unlikely(!page))
2282 +               return -ENOMEM;
2283 +
2284 +       stat_inc(&pool->total_pages);
2285 +
2286 +       spin_lock(&pool->lock);
2287 +       block = get_ptr_atomic(page, 0, KM_USER0);
2288 +
2289 +       block->size = PAGE_SIZE - XV_ALIGN;
2290 +       set_flag(block, BLOCK_FREE);
2291 +       clear_flag(block, PREV_FREE);
2292 +       set_blockprev(block, 0);
2293 +
2294 +       insert_block(pool, page, 0, block);
2295 +
2296 +       put_ptr_atomic(block, KM_USER0);
2297 +       spin_unlock(&pool->lock);
2298 +
2299 +       return 0;
2300 +}
2301 +
2302 +/*
2303 + * Create a memory pool. Allocates freelist, bitmaps and other
2304 + * per-pool metadata.
2305 + */
2306 +struct xv_pool *xv_create_pool(void)
2307 +{
2308 +       u32 ovhd_size;
2309 +       struct xv_pool *pool;
2310 +
2311 +       ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
2312 +       pool = kzalloc(ovhd_size, GFP_KERNEL);
2313 +       if (!pool)
2314 +               return NULL;
2315 +
2316 +       spin_lock_init(&pool->lock);
2317 +
2318 +       return pool;
2319 +}
2320 +
2321 +void xv_destroy_pool(struct xv_pool *pool)
2322 +{
2323 +       kfree(pool);
2324 +}
2325 +
2326 +/**
2327 + * xv_malloc - Allocate block of given size from pool.
2328 + * @pool: pool to allocate from
2329 + * @size: size of block to allocate
2330 + * @page: page that holds the allocated object
2331 + * @offset: location of object within page
2332 + *
2333 + * On success, <page, offset> identifies block allocated
2334 + * and 0 is returned. On failure, <page, offset> is set to
2335 + * 0 and -ENOMEM is returned.
2336 + *
2337 + * Allocation requests with size > XV_MAX_ALLOC_SIZE will fail.
2338 + */
2339 +int xv_malloc(struct xv_pool *pool, u32 size, struct page **page,
2340 +               u32 *offset, gfp_t flags)
2341 +{
2342 +       int error;
2343 +       u32 index, tmpsize, origsize, tmpoffset;
2344 +       struct block_header *block, *tmpblock;
2345 +
2346 +       *page = NULL;
2347 +       *offset = 0;
2348 +       origsize = size;
2349 +
2350 +       if (unlikely(!size || size > XV_MAX_ALLOC_SIZE))
2351 +               return -ENOMEM;
2352 +
2353 +       size = ALIGN(size, XV_ALIGN);
2354 +
2355 +       spin_lock(&pool->lock);
2356 +
2357 +       index = find_block(pool, size, page, offset);
2358 +
2359 +       if (!*page) {
2360 +               spin_unlock(&pool->lock);
2361 +               if (flags & GFP_NOWAIT)
2362 +                       return -ENOMEM;
2363 +               error = grow_pool(pool, flags);
2364 +               if (unlikely(error))
2365 +                       return error;
2366 +
2367 +               spin_lock(&pool->lock);
2368 +               index = find_block(pool, size, page, offset);
2369 +       }
2370 +
2371 +       if (!*page) {
2372 +               spin_unlock(&pool->lock);
2373 +               return -ENOMEM;
2374 +       }
2375 +
2376 +       block = get_ptr_atomic(*page, *offset, KM_USER0);
2377 +
2378 +       remove_block_head(pool, block, index);
2379 +
2380 +       /* Split the block if required */
2381 +       tmpoffset = *offset + size + XV_ALIGN;
2382 +       tmpsize = block->size - size;
2383 +       tmpblock = (struct block_header *)((char *)block + size + XV_ALIGN);
2384 +       if (tmpsize) {
2385 +               tmpblock->size = tmpsize - XV_ALIGN;
2386 +               set_flag(tmpblock, BLOCK_FREE);
2387 +               clear_flag(tmpblock, PREV_FREE);
2388 +
2389 +               set_blockprev(tmpblock, *offset);
2390 +               if (tmpblock->size >= XV_MIN_ALLOC_SIZE)
2391 +                       insert_block(pool, *page, tmpoffset, tmpblock);
2392 +
2393 +               if (tmpoffset + XV_ALIGN + tmpblock->size != PAGE_SIZE) {
2394 +                       tmpblock = BLOCK_NEXT(tmpblock);
2395 +                       set_blockprev(tmpblock, tmpoffset);
2396 +               }
2397 +       } else {
2398 +               /* This block is exact fit */
2399 +               if (tmpoffset != PAGE_SIZE)
2400 +                       clear_flag(tmpblock, PREV_FREE);
2401 +       }
2402 +
2403 +       block->size = origsize;
2404 +       clear_flag(block, BLOCK_FREE);
2405 +
2406 +       put_ptr_atomic(block, KM_USER0);
2407 +       spin_unlock(&pool->lock);
2408 +
2409 +       *offset += XV_ALIGN;
2410 +
2411 +       return 0;
2412 +}
2413 +
2414 +/*
2415 + * Free block identified with <page, offset>
2416 + */
2417 +void xv_free(struct xv_pool *pool, struct page *page, u32 offset)
2418 +{
2419 +       void *page_start;
2420 +       struct block_header *block, *tmpblock;
2421 +
2422 +       offset -= XV_ALIGN;
2423 +
2424 +       spin_lock(&pool->lock);
2425 +
2426 +       page_start = get_ptr_atomic(page, 0, KM_USER0);
2427 +       block = (struct block_header *)((char *)page_start + offset);
2428 +
2429 +       /* Catch double free bugs */
2430 +       BUG_ON(test_flag(block, BLOCK_FREE));
2431 +
2432 +       block->size = ALIGN(block->size, XV_ALIGN);
2433 +
2434 +       tmpblock = BLOCK_NEXT(block);
2435 +       if (offset + block->size + XV_ALIGN == PAGE_SIZE)
2436 +               tmpblock = NULL;
2437 +
2438 +       /* Merge next block if it is free */
2439 +       if (tmpblock && test_flag(tmpblock, BLOCK_FREE)) {
2440 +               /*
2441 +                * Blocks smaller than XV_MIN_ALLOC_SIZE
2442 +                * are not inserted in any free list.
2443 +                */
2444 +               if (tmpblock->size >= XV_MIN_ALLOC_SIZE) {
2445 +                       remove_block(pool, page,
2446 +                                   offset + block->size + XV_ALIGN, tmpblock,
2447 +                                   get_index_for_insert(tmpblock->size));
2448 +               }
2449 +               block->size += tmpblock->size + XV_ALIGN;
2450 +       }
2451 +
2452 +       /* Merge previous block if it is free */
2453 +       if (test_flag(block, PREV_FREE)) {
2454 +               tmpblock = (struct block_header *)((char *)(page_start) +
2455 +                                               get_blockprev(block));
2456 +               offset = offset - tmpblock->size - XV_ALIGN;
2457 +
2458 +               if (tmpblock->size >= XV_MIN_ALLOC_SIZE)
2459 +                       remove_block(pool, page, offset, tmpblock,
2460 +                                   get_index_for_insert(tmpblock->size));
2461 +
2462 +               tmpblock->size += block->size + XV_ALIGN;
2463 +               block = tmpblock;
2464 +       }
2465 +
2466 +       /* No used objects in this page. Free it. */
2467 +       if (block->size == PAGE_SIZE - XV_ALIGN) {
2468 +               put_ptr_atomic(page_start, KM_USER0);
2469 +               spin_unlock(&pool->lock);
2470 +
2471 +               __free_page(page);
2472 +               stat_dec(&pool->total_pages);
2473 +               return;
2474 +       }
2475 +
2476 +       set_flag(block, BLOCK_FREE);
2477 +       if (block->size >= XV_MIN_ALLOC_SIZE)
2478 +               insert_block(pool, page, offset, block);
2479 +
2480 +       if (offset + block->size + XV_ALIGN != PAGE_SIZE) {
2481 +               tmpblock = BLOCK_NEXT(block);
2482 +               set_flag(tmpblock, PREV_FREE);
2483 +               set_blockprev(tmpblock, offset);
2484 +       }
2485 +
2486 +       put_ptr_atomic(page_start, KM_USER0);
2487 +       spin_unlock(&pool->lock);
2488 +}
2489 +
2490 +u32 xv_get_object_size(void *obj)
2491 +{
2492 +       struct block_header *blk;
2493 +
2494 +       blk = (struct block_header *)((char *)(obj) - XV_ALIGN);
2495 +       return blk->size;
2496 +}
2497 +
2498 +/*
2499 + * Returns total memory used by allocator (userdata + metadata)
2500 + */
2501 +u64 xv_get_total_size_bytes(struct xv_pool *pool)
2502 +{
2503 +       return pool->total_pages << PAGE_SHIFT;
2504 +}
2505 Index: kernel-power-2.6.28/drivers/block/ramzswap/xvmalloc.h
2506 ===================================================================
2507 --- /dev/null
2508 +++ kernel-power-2.6.28/drivers/block/ramzswap/xvmalloc.h
2509 @@ -0,0 +1,30 @@
2510 +/*
2511 + * xvmalloc memory allocator
2512 + *
2513 + * Copyright (C) 2008, 2009, 2010  Nitin Gupta
2514 + *
2515 + * This code is released using a dual license strategy: BSD/GPL
2516 + * You can choose the licence that better fits your requirements.
2517 + *
2518 + * Released under the terms of 3-clause BSD License
2519 + * Released under the terms of GNU General Public License Version 2.0
2520 + */
2521 +
2522 +#ifndef _XV_MALLOC_H_
2523 +#define _XV_MALLOC_H_
2524 +
2525 +#include <linux/types.h>
2526 +
2527 +struct xv_pool;
2528 +
2529 +struct xv_pool *xv_create_pool(void);
2530 +void xv_destroy_pool(struct xv_pool *pool);
2531 +
2532 +int xv_malloc(struct xv_pool *pool, u32 size, struct page **page,
2533 +                       u32 *offset, gfp_t flags);
2534 +void xv_free(struct xv_pool *pool, struct page *page, u32 offset);
2535 +
2536 +u32 xv_get_object_size(void *obj);
2537 +u64 xv_get_total_size_bytes(struct xv_pool *pool);
2538 +
2539 +#endif
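
The API above never returns raw pointers: xv_malloc() reports a <page, offset> pair and the caller maps the page itself. A minimal kernel-side sketch of that pattern (an illustration of how ramzswap_drv.c uses the allocator, not part of the patch; the function name and the GFP_NOIO choice are just for the example):

    #include <linux/errno.h>
    #include <linux/gfp.h>
    #include <linux/highmem.h>
    #include <linux/string.h>

    #include "xvmalloc.h"

    /* Store a clen-byte compressed object and report where it lives. */
    static int store_object(struct xv_pool *pool, const void *src, u32 clen,
                            struct page **page, u32 *offset)
    {
            void *base;

            if (xv_malloc(pool, clen, page, offset, GFP_NOIO))
                    return -ENOMEM;

            base = kmap_atomic(*page, KM_USER0);    /* <page, offset> -> pointer */
            memcpy(base + *offset, src, clen);
            kunmap_atomic(base, KM_USER0);
            return 0;
    }

The object is later released with xv_free(pool, page, offset); when the last object in a page is freed, xv_free() returns the whole page to the system.
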
2540 Index: kernel-power-2.6.28/drivers/block/ramzswap/xvmalloc_int.h
2541 ===================================================================
2542 --- /dev/null
2543 +++ kernel-power-2.6.28/drivers/block/ramzswap/xvmalloc_int.h
2544 @@ -0,0 +1,86 @@
2545 +/*
2546 + * xvmalloc memory allocator
2547 + *
2548 + * Copyright (C) 2008, 2009, 2010  Nitin Gupta
2549 + *
2550 + * This code is released using a dual license strategy: BSD/GPL
2551 + * You can choose the licence that better fits your requirements.
2552 + *
2553 + * Released under the terms of 3-clause BSD License
2554 + * Released under the terms of GNU General Public License Version 2.0
2555 + */
2556 +
2557 +#ifndef _XV_MALLOC_INT_H_
2558 +#define _XV_MALLOC_INT_H_
2559 +
2560 +#include <linux/kernel.h>
2561 +#include <linux/types.h>
2562 +
2563 +/* User configurable params */
2564 +
2565 +/* Must be power of two */
2566 +#define XV_ALIGN_SHIFT 2
2567 +#define XV_ALIGN       (1 << XV_ALIGN_SHIFT)
2568 +#define XV_ALIGN_MASK  (XV_ALIGN - 1)
2569 +
2570 +/* This must be greater than sizeof(struct link_free) */
2571 +#define XV_MIN_ALLOC_SIZE      32
2572 +#define XV_MAX_ALLOC_SIZE      (PAGE_SIZE - XV_ALIGN)
2573 +
2574 +/* Free lists are separated by FL_DELTA bytes */
2575 +#define FL_DELTA_SHIFT 3
2576 +#define FL_DELTA       (1 << FL_DELTA_SHIFT)
2577 +#define FL_DELTA_MASK  (FL_DELTA - 1)
2578 +#define NUM_FREE_LISTS ((XV_MAX_ALLOC_SIZE - XV_MIN_ALLOC_SIZE) \
2579 +                               / FL_DELTA + 1)
2580 +
2581 +#define MAX_FLI                DIV_ROUND_UP(NUM_FREE_LISTS, BITS_PER_LONG)
2582 +
2583 +/* End of user params */
2584 +
2585 +enum blockflags {
2586 +       BLOCK_FREE,
2587 +       PREV_FREE,
2588 +       __NR_BLOCKFLAGS,
2589 +};
2590 +
2591 +#define FLAGS_MASK     XV_ALIGN_MASK
2592 +#define PREV_MASK      (~FLAGS_MASK)
2593 +
2594 +struct freelist_entry {
2595 +       struct page *page;
2596 +       u16 offset;
2597 +       u16 pad;
2598 +};
2599 +
2600 +struct link_free {
2601 +       struct page *prev_page;
2602 +       struct page *next_page;
2603 +       u16 prev_offset;
2604 +       u16 next_offset;
2605 +};
2606 +
2607 +struct block_header {
2608 +       union {
2609 +               /* This common header must be XV_ALIGN bytes */
2610 +               u8 common[XV_ALIGN];
2611 +               struct {
2612 +                       u16 size;
2613 +                       u16 prev;
2614 +               };
2615 +       };
2616 +       struct link_free link;
2617 +};
2618 +
2619 +struct xv_pool {
2620 +       ulong flbitmap;
2621 +       ulong slbitmap[MAX_FLI];
2622 +       spinlock_t lock;
2623 +
2624 +       struct freelist_entry freelist[NUM_FREE_LISTS];
2625 +
2626 +       /* stats */
2627 +       u64 total_pages;
2628 +};
2629 +
2630 +#endif
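
Plugging in the usual 4 KB page size makes the sizing above concrete: XV_MAX_ALLOC_SIZE = 4096 - 4 = 4092 bytes, NUM_FREE_LISTS = (4092 - 32) / 8 + 1 = 508, and slbitmap[] therefore needs MAX_FLI = 16 longs on a 32-bit build. For a 100-byte request, get_index() in xvmalloc.c rounds up to ALIGN(100, 8) = 104 and starts searching at list (104 - 32) >> 3 = 9, while get_index_for_insert() rounds the same size down (100 & ~FL_DELTA_MASK = 96) and files a free 100-byte block on list (96 - 32) >> 3 = 8. Rounding lookups up and insertions down guarantees that a search never starts at a list that could hold blocks smaller than the request.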