1 /***************************************
2 $Header: /home/amb/routino/src/RCS/sorting.c,v 1.8 2010/04/09 15:15:02 amb Exp $
6 Part of the Routino routing software.
7 ******************/ /******************
8 This file Copyright 2009 Andrew M. Bishop
10 This program is free software: you can redistribute it and/or modify
11 it under the terms of the GNU Affero General Public License as published by
12 the Free Software Foundation, either version 3 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU Affero General Public License for more details.
20 You should have received a copy of the GNU Affero General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 ***************************************/
30 #include "functions.h"
35 /*+ The command line '--tmpdir' option or its default value. +*/
36 extern char *option_tmpdirname;
38 /*+ The amount of RAM to use for filesorting. +*/
39 extern size_t option_filesort_ramsize;
42 /*++++++++++++++++++++++++++++++++++++++
43 A function to sort the contents of a file of fixed length objects using a
44 limited amount of RAM.
46 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
47 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
48 The individual sort steps and the merge step both use a "Heap sort"
49 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
50 if the data is already partially sorted.
52 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
54 int fd_out The file descriptor of the output file (opened for writing and empty).
56 size_t itemsize The size of each item in the file that needs sorting.
58 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
59 data to be sorted is an array of things not pointers).
61 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
62 returns 1 then it is written to the output file.
63 ++++++++++++++++++++++++++++++++++++++*/
65 void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
67 int *fds=NULL,*heap=NULL;
69 index_t count=0,total=0;
70 size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*));
71 void *data=NULL,**datap=NULL;
75 /* Allocate the RAM buffer and other bits */
77 data=malloc(nitems*itemsize);
78 datap=malloc(nitems*sizeof(void*));
80 filename=(char*)malloc(strlen(option_tmpdirname)+24);
82 /* Loop around, fill the buffer, sort the data and write a temporary file */
88 /* Read in the data and create pointers */
92 datap[i]=data+i*itemsize;
94 if(ReadFile(fd_in,datap[i],itemsize))
108 /* Sort the data pointers using a heap sort */
110 heapsort(datap,n,compare);
112 /* Shortcut if all read in and sorted at once */
114 if(nfiles==0 && !more)
118 if(!buildindex || buildindex(datap[i],count))
120 WriteFile(fd_out,datap[i],itemsize);
128 /* Create a temporary file and write the result */
130 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
132 fd=OpenFile(filename);
135 WriteFile(fd,datap[i],itemsize);
143 /* Shortcut if only one file (unlucky for us there must have been exactly
144 nitems, lucky for us we still have the data in RAM) */
148 for(i=0;i<nitems;i++)
150 if(!buildindex || buildindex(datap[i],count))
152 WriteFile(fd_out,datap[i],itemsize);
157 DeleteFile(filename);
162 /* Check that number of files is less than file size */
164 assert(nfiles<nitems);
166 /* Open all of the temporary files */
168 fds=(int*)malloc(nfiles*sizeof(int));
170 for(i=0;i<nfiles;i++)
172 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
174 fds[i]=ReOpenFile(filename);
176 DeleteFile(filename);
179 /* Perform an n-way merge using a binary heap */
181 heap=(int*)malloc(nfiles*sizeof(int));
183 /* Fill the heap to start with */
185 for(i=0;i<nfiles;i++)
189 datap[i]=data+i*itemsize;
191 ReadFile(fds[i],datap[i],itemsize);
197 /* Bubble up the new value */
200 compare(datap[heap[index]],datap[heap[(index-1)/2]])<0)
205 newindex=(index-1)/2;
208 heap[index]=heap[newindex];
215 /* Repeatedly pull out the root of the heap and refill from the same file */
223 if(!buildindex || buildindex(datap[heap[0]],count))
225 WriteFile(fd_out,datap[heap[0]],itemsize);
229 if(ReadFile(fds[heap[0]],datap[heap[0]],itemsize))
235 /* Bubble down the new value */
237 while((2*index+2)<ndata &&
238 (compare(datap[heap[index]],datap[heap[2*index+1]])>0 ||
239 compare(datap[heap[index]],datap[heap[2*index+2]])>0))
244 if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0)
250 heap[newindex]=heap[index];
256 if((2*index+2)==ndata &&
257 compare(datap[heap[index]],datap[heap[2*index+1]])>0)
265 heap[newindex]=heap[index];
277 for(i=0;i<nfiles;i++)
291 /*++++++++++++++++++++++++++++++++++++++
292 A function to sort the contents of a file of variable length objects (each
293 preceded by its length in 2 bytes) using a limited amount of RAM.
295 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
296 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
297 The individual sort steps and the merge step both use a "Heap sort"
298 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
299 if the data is already partially sorted.
301 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
303 int fd_out The file descriptor of the output file (opened for writing and empty).
305 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
306 data to be sorted is an array of things not pointers).
308 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
309 returns 1 then it is written to the output file.
310 ++++++++++++++++++++++++++++++++++++++*/
312 void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
314 int *fds=NULL,*heap=NULL;
315 int nfiles=0,ndata=0;
316 index_t count=0,total=0;
317 FILESORT_VARINT nextitemsize,largestitemsize=0;
318 void *data=NULL,**datap=NULL;
322 /* Allocate the RAM buffer and other bits */
324 data=malloc(option_filesort_ramsize);
326 filename=(char*)malloc(strlen(option_tmpdirname)+24);
328 /* Loop around, fill the buffer, sort the data and write a temporary file */
330 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */
336 size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE;
338 datap=data+option_filesort_ramsize;
340 /* Read in the data and create pointers */
342 while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data))
344 FILESORT_VARINT itemsize=nextitemsize;
346 if(itemsize>largestitemsize)
347 largestitemsize=itemsize;
349 *(FILESORT_VARINT*)(data+ramused)=itemsize;
351 ramused+=FILESORT_VARSIZE;
353 ReadFile(fd_in,data+ramused,itemsize);
355 *--datap=data+ramused; /* points to real data */
359 ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN);
360 ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE;
365 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE))
375 /* Sort the data pointers using a heap sort */
377 heapsort(datap,n,compare);
379 /* Shortcut if all read in and sorted at once */
381 if(nfiles==0 && !more)
385 if(!buildindex || buildindex(datap[i],count))
387 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
389 WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
397 /* Create a temporary file and write the result */
399 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
401 fd=OpenFile(filename);
405 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
407 WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
416 /* Check that number of files is less than file size */
418 largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN);
420 assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize));
422 /* Open all of the temporary files */
424 fds=(int*)malloc(nfiles*sizeof(int));
426 for(i=0;i<nfiles;i++)
428 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
430 fds[i]=ReOpenFile(filename);
432 DeleteFile(filename);
435 /* Perform an n-way merge using a binary heap */
437 heap=(int*)malloc(nfiles*sizeof(int));
439 datap=data+option_filesort_ramsize-nfiles*sizeof(void*);
441 /* Fill the heap to start with */
443 for(i=0;i<nfiles;i++)
446 FILESORT_VARINT itemsize;
448 datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize;
450 ReadFile(fds[i],&itemsize,FILESORT_VARSIZE);
452 *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize;
454 ReadFile(fds[i],datap[i],itemsize);
460 /* Bubble up the new value */
463 compare(datap[heap[index]],datap[heap[(index-1)/2]])<0)
468 newindex=(index-1)/2;
471 heap[index]=heap[newindex];
478 /* Repeatedly pull out the root of the heap and refill from the same file */
485 FILESORT_VARINT itemsize;
487 if(!buildindex || buildindex(datap[heap[0]],count))
489 itemsize=*(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE);
491 WriteFile(fd_out,datap[heap[0]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
495 if(ReadFile(fds[heap[0]],&itemsize,FILESORT_VARSIZE))
502 *(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE)=itemsize;
504 ReadFile(fds[heap[0]],datap[heap[0]],itemsize);
507 /* Bubble down the new value */
509 while((2*index+2)<ndata &&
510 (compare(datap[heap[index]],datap[heap[2*index+1]])>0 ||
511 compare(datap[heap[index]],datap[heap[2*index+2]])>0))
516 if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0)
522 heap[newindex]=heap[index];
528 if((2*index+2)==ndata &&
529 compare(datap[heap[index]],datap[heap[2*index+1]])>0)
537 heap[newindex]=heap[index];
549 for(i=0;i<nfiles;i++)
562 /*++++++++++++++++++++++++++++++++++++++
563 A function to sort an array of pointers efficiently.
565 The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort,
566 in particular an this good because it can operate in-place and doesn't
567 allocate more memory like using qsort() does.
569 void **datap A pointer to the array of pointers to sort.
571 size_t nitems The number of items of data to sort.
573 int(*compare)(const void *, const void *) The comparison function (identical to qsort if the
574 data to be sorted was an array of things not pointers).
575 ++++++++++++++++++++++++++++++++++++++*/
577 void heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*))
581 /* Fill the heap by pretending to insert the data that is already there */
583 for(i=1;i<nitems;i++)
587 /* Bubble up the new value (upside-down, put largest at top) */
590 compare(datap[index],datap[(index-1)/2])>0) /* reversed compared to filesort() above */
595 newindex=(index-1)/2;
598 datap[index]=datap[newindex];
599 datap[newindex]=temp;
605 /* Repeatedly pull out the root of the heap and swap with the bottom item */
607 for(i=nitems-1;i>0;i--)
613 datap[index]=datap[i];
616 /* Bubble down the new value (upside-down, put largest at top) */
618 while((2*index+2)<i &&
619 (compare(datap[index],datap[2*index+1])<0 || /* reversed compared to filesort() above */
620 compare(datap[index],datap[2*index+2])<0)) /* reversed compared to filesort() above */
625 if(compare(datap[2*index+1],datap[2*index+2])>0) /* reversed compared to filesort() above */
630 temp=datap[newindex];
631 datap[newindex]=datap[index];
638 compare(datap[index],datap[2*index+1])<0) /* reversed compared to filesort() above */
645 temp=datap[newindex];
646 datap[newindex]=datap[index];