Line data Source code
1 : /*
2 : * Copyright (c) 2011, Secure Endpoints Inc.
3 : * All rights reserved.
4 : *
5 : * Redistribution and use in source and binary forms, with or without
6 : * modification, are permitted provided that the following conditions
7 : * are met:
8 : *
9 : * - Redistributions of source code must retain the above copyright
10 : * notice, this list of conditions and the following disclaimer.
11 : *
12 : * - Redistributions in binary form must reproduce the above copyright
13 : * notice, this list of conditions and the following disclaimer in
14 : * the documentation and/or other materials provided with the
15 : * distribution.
16 : *
17 : * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 : * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 : * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 : * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 : * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
22 : * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 : * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 : * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26 : * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 : * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
28 : * OF THE POSSIBILITY OF SUCH DAMAGE.
29 : *
30 : */
31 :
32 : #include "baselocl.h"
33 :
34 : #include <sys/types.h>
35 : #include <sys/stat.h>
36 : #ifdef HAVE_IO_H
37 : #include <io.h>
38 : #endif
39 : #ifdef HAVE_UNISTD_H
40 : #include <unistd.h>
41 : #endif
42 : #include <fcntl.h>
43 : #include <ctype.h>
44 : #include <stdio.h>
45 : #include <stdlib.h>
46 : #include <string.h>
47 : #ifdef HAVE_STRINGS_H
48 : #include <strings.h>
49 : #endif
50 : #include <errno.h>
51 : #include <assert.h>
52 :
53 : /*
54 : * This file contains functions for binary searching flat text in memory
55 : * and in text files where each line is a [variable length] record.
56 : * Each record has a key and an optional value separated from the key by
57 : * unquoted whitespace. Whitespace in the key, and leading whitespace
58 : * for the value, can be quoted with backslashes (but CR and LF must be
59 : * quoted in such a way that they don't appear in the quoted result).
60 : *
61 : * Binary searching a tree are normally a dead simple algorithm. It
62 : * turns out that binary searching flat text with *variable* length
63 : * records is... tricky. There's no indexes to record beginning bytes,
64 : * thus any index selected during the search is likely to fall in the
65 : * middle of a record. When deciding to search a left sub-tree one
66 : * might fail to find the last record in that sub-tree on account of the
67 : * right boundary falling in the middle of it -- the chosen solution to
68 : * this makes left sub-tree searches slightly less efficient than right
69 : * sub-tree searches.
70 : *
71 : * If binary searching flat text in memory is tricky, using block-wise
72 : * I/O instead is trickier! But it's necessary in order to support
73 : * large files (which we either can't or wouldn't want to read or map
74 : * into memory). Each block we read has to be large enough that the
75 : * largest record can fit in it. And each block might start and/or end
76 : * in the middle of a record. Here it is the right sub-tree searches
77 : * that are less efficient than left sub-tree searches.
78 : *
79 : * bsearch_common() contains the common text block binary search code.
80 : *
81 : * _bsearch_text() is the interface for searching in-core text.
82 : * _bsearch_file() is the interface for block-wise searching files.
83 : */
84 :
85 : struct bsearch_file_handle {
86 : int fd; /* file descriptor */
87 : char *cache; /* cache bytes */
88 : char *page; /* one double-size page worth of bytes */
89 : size_t file_sz; /* file size */
90 : size_t cache_sz; /* cache size */
91 : size_t page_sz; /* page size */
92 : };
93 :
94 : /* Find a new-line */
95 : static const char *
96 0 : find_line(const char *buf, size_t i, size_t right)
97 : {
98 0 : if (i == 0)
99 0 : return &buf[i];
100 0 : for (; i < right; i++) {
101 0 : if (buf[i] == '\n') {
102 0 : if ((i + 1) < right)
103 0 : return &buf[i + 1];
104 0 : return NULL;
105 : }
106 : }
107 0 : return NULL;
108 : }
109 :
110 : /*
111 : * Common routine for binary searching text in core.
112 : *
113 : * Perform a binary search of a char array containing a block from a
114 : * text file where each line is a record (LF and CRLF supported). Each
115 : * record consists of a key followed by an optional value separated from
116 : * the key by whitespace. Whitespace can be quoted with backslashes.
117 : * It's the caller's responsibility to encode/decode keys/values if
118 : * quoting is desired; newlines should be encoded such that a newline
119 : * does not appear in the result.
120 : *
121 : * All output arguments are optional.
122 : *
123 : * Returns 0 if key is found, -1 if not found, or an error code such as
124 : * ENOMEM in case of error.
125 : *
126 : * Inputs:
127 : *
128 : * @buf String to search
129 : * @sz Size of string to search
130 : * @key Key string to search for
131 : * @buf_is_start True if the buffer starts with a record, false if it
132 : * starts in the middle of a record or if the caller
133 : * doesn't know.
134 : *
135 : * Outputs:
136 : *
137 : * @value Location to store a copy of the value (caller must free)
138 : * @location Record location if found else the location where the
139 : * record should be inserted (index into @buf)
140 : * @cmp Set to less than or greater than 0 to indicate that a
141 : * key not found would have fit in an earlier or later
142 : * part of a file. Callers should use this to decide
143 : * whether to read a block to the left or to the right and
144 : * search that.
145 : * @loops Location to store a count of bisections required for
146 : * search (useful for confirming logarithmic performance)
147 : */
148 : static int
149 0 : bsearch_common(const char *buf, size_t sz, const char *key,
150 : int buf_is_start, char **value, size_t *location,
151 : int *cmp, size_t *loops)
152 : {
153 0 : const char *linep;
154 0 : size_t key_start, key_len; /* key string in buf */
155 0 : size_t val_start, val_len; /* value string in buf */
156 0 : int key_cmp = -1;
157 0 : size_t k;
158 0 : size_t l; /* left side of buffer for binary search */
159 0 : size_t r; /* right side of buffer for binary search */
160 0 : size_t rmax; /* right side of buffer for binary search */
161 0 : size_t i; /* index into buffer, typically in the middle of l and r */
162 0 : size_t loop_count = 0;
163 0 : int ret = -1;
164 :
165 0 : if (value)
166 0 : *value = NULL;
167 0 : if (cmp)
168 0 : *cmp = 0;
169 0 : if (loops)
170 0 : *loops = 0;
171 :
172 : /* Binary search; file should be sorted */
173 0 : for (l = 0, r = rmax = sz, i = sz >> 1; i >= l && i < rmax; loop_count++) {
174 0 : heim_assert(i < sz, "invalid aname2lname db index");
175 :
176 : /* buf[i] is likely in the middle of a line; find the next line */
177 0 : linep = find_line(buf, i, rmax);
178 0 : k = linep ? linep - buf : i;
179 0 : if (linep == NULL || k >= rmax) {
180 : /*
181 : * No new line found to the right; search to the left then
182 : * but don't change rmax (this isn't optimal, but it's
183 : * simple).
184 : */
185 0 : if (i == l)
186 0 : break;
187 0 : r = i;
188 0 : i = l + ((r - l) >> 1);
189 0 : continue;
190 : }
191 0 : i = k;
192 0 : heim_assert(i >= l && i < rmax, "invalid aname2lname db index");
193 :
194 : /* Got a line; check it */
195 :
196 : /* Search for and split on unquoted whitespace */
197 0 : val_start = 0;
198 0 : for (key_start = i, key_len = 0, val_len = 0, k = i; k < rmax; k++) {
199 0 : if (buf[k] == '\\') {
200 0 : k++;
201 0 : continue;
202 : }
203 0 : if (buf[k] == '\r' || buf[k] == '\n') {
204 : /* We now know where the key ends, and there's no value */
205 0 : key_len = k - i;
206 0 : break;
207 : }
208 0 : if (!isspace((unsigned char)buf[k]))
209 0 : continue;
210 :
211 0 : while (k < rmax && isspace((unsigned char)buf[k])) {
212 0 : key_len = k - i;
213 0 : k++;
214 : }
215 0 : if (k < rmax)
216 0 : val_start = k;
217 : /* Find end of value */
218 0 : for (; k < rmax && buf[k] != '\0'; k++) {
219 0 : if (buf[k] == '\r' || buf[k] == '\n') {
220 0 : val_len = k - val_start;
221 0 : break;
222 : }
223 : }
224 0 : break;
225 : }
226 :
227 : /*
228 : * The following logic is for dealing with partial buffers,
229 : * which we use for block-wise binary searches of large files
230 : */
231 0 : if (key_start == 0 && !buf_is_start) {
232 : /*
233 : * We're at the beginning of a block that might have started
234 : * in the middle of a record whose "key" might well compare
235 : * as greater than the key we're looking for, so we don't
236 : * bother comparing -- we know key_cmp must be -1 here.
237 : */
238 0 : key_cmp = -1;
239 0 : break;
240 : }
241 0 : if ((val_len && buf[val_start + val_len] != '\n') ||
242 0 : (!val_len && buf[key_start + key_len] != '\n')) {
243 : /*
244 : * We're at the end of a block that ends in the middle of a
245 : * record whose "key" might well compare as less than the
246 : * key we're looking for, so we don't bother comparing -- we
247 : * know key_cmp must be >= 0 but we can't tell. Our caller
248 : * will end up reading a double-size block to handle this.
249 : */
250 0 : key_cmp = 1;
251 0 : break;
252 : }
253 :
254 0 : key_cmp = strncmp(key, &buf[key_start], key_len);
255 0 : if (key_cmp == 0 && strlen(key) != key_len)
256 0 : key_cmp = 1;
257 0 : if (key_cmp < 0) {
258 : /* search left */
259 0 : r = rmax = (linep - buf);
260 0 : i = l + ((r - l) >> 1);
261 0 : if (location)
262 0 : *location = key_start;
263 0 : } else if (key_cmp > 0) {
264 : /* search right */
265 0 : if (l == i)
266 0 : break; /* not found */
267 0 : l = i;
268 0 : i = l + ((r - l) >> 1);
269 0 : if (location)
270 0 : *location = val_start + val_len;
271 : } else {
272 : /* match! */
273 0 : if (location)
274 0 : *location = key_start;
275 0 : ret = 0;
276 0 : if (val_len && value) {
277 : /* Avoid strndup() so we don't need libroken here yet */
278 0 : if ((*value = malloc(val_len + 1))) {
279 0 : (void) memcpy(*value, &buf[val_start], val_len);
280 0 : (*value)[val_len] = '\0';
281 : } else {
282 0 : ret = errno;
283 : }
284 : }
285 0 : break;
286 : }
287 : }
288 :
289 0 : if (cmp)
290 0 : *cmp = key_cmp;
291 0 : if (loops)
292 0 : *loops = loop_count;
293 :
294 0 : return ret;
295 : }
296 :
297 : /*
298 : * Binary search a char array containing sorted text records separated
299 : * by new-lines (or CRLF). Each record consists of a key and an
300 : * optional value following the key, separated from the key by unquoted
301 : * whitespace.
302 : *
303 : * All output arguments are optional.
304 : *
305 : * Returns 0 if key is found, -1 if not found, or an error code such as
306 : * ENOMEM in case of error.
307 : *
308 : * Inputs:
309 : *
310 : * @buf Char array pointer
311 : * @buf_sz Size of buf
312 : * @key Key to search for
313 : *
314 : * Outputs:
315 : *
316 : * @value Location where to put the value, if any (caller must free)
317 : * @location Record location if found else the location where the record
318 : * should be inserted (index into @buf)
319 : * @loops Location where to put a number of loops (or comparisons)
320 : * needed for the search (useful for benchmarking)
321 : */
322 : int
323 0 : _bsearch_text(const char *buf, size_t buf_sz, const char *key,
324 : char **value, size_t *location, size_t *loops)
325 : {
326 0 : return bsearch_common(buf, buf_sz, key, 1, value, location, NULL, loops);
327 : }
328 :
329 : #define MAX_BLOCK_SIZE (1024 * 1024)
330 : #define DEFAULT_MAX_FILE_SIZE (1024 * 1024)
331 : /*
332 : * Open a file for binary searching. The file will be read in entirely
333 : * if it is smaller than @max_sz, else a cache of @max_sz bytes will be
334 : * allocated.
335 : *
336 : * Returns 0 on success, else an error number or -1 if the file is empty.
337 : *
338 : * Inputs:
339 : *
340 : * @fname Name of file to open
341 : * @max_sz Maximum size of cache to allocate, in bytes (if zero, default)
342 : * @page_sz Page size (must be a power of two, larger than 256, smaller
343 : * than 1MB; if zero use default)
344 : *
345 : * Outputs:
346 : *
347 : * @bfh Handle for use with _bsearch_file() and _bsearch_file_close()
348 : * @reads Number of reads performed
349 : */
350 : int
351 0 : _bsearch_file_open(const char *fname, size_t max_sz, size_t page_sz,
352 : bsearch_file_handle *bfh, size_t *reads)
353 : {
354 0 : bsearch_file_handle new_bfh = NULL;
355 0 : struct stat st;
356 0 : size_t i;
357 0 : int fd;
358 0 : int ret;
359 :
360 0 : *bfh = NULL;
361 :
362 0 : if (reads)
363 0 : *reads = 0;
364 :
365 0 : fd = open(fname, O_RDONLY);
366 0 : if (fd == -1)
367 0 : return errno;
368 :
369 0 : if (fstat(fd, &st) == -1) {
370 0 : ret = errno;
371 0 : goto err;
372 : }
373 :
374 0 : if (st.st_size == 0) {
375 0 : ret = -1; /* no data -> no binary search */
376 0 : goto err;
377 : }
378 :
379 : /* Validate / default arguments */
380 0 : if (max_sz == 0)
381 0 : max_sz = DEFAULT_MAX_FILE_SIZE;
382 0 : for (i = page_sz; i; i >>= 1) {
383 : /* Make sure page_sz is a power of two */
384 0 : if ((i % 2) && (i >> 1)) {
385 0 : page_sz = 0;
386 0 : break;
387 : }
388 : }
389 0 : if (page_sz == 0)
390 : #ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
391 : page_sz = st.st_blksize;
392 : #else
393 0 : page_sz = 4096;
394 : #endif
395 0 : for (i = page_sz; i; i >>= 1) {
396 : /* Make sure page_sz is a power of two */
397 0 : if ((i % 2) && (i >> 1)) {
398 : /* Can't happen! Filesystems always use powers of two! */
399 0 : page_sz = 4096;
400 0 : break;
401 : }
402 : }
403 0 : if (page_sz > MAX_BLOCK_SIZE)
404 0 : page_sz = MAX_BLOCK_SIZE;
405 :
406 0 : new_bfh = calloc(1, sizeof (*new_bfh));
407 0 : if (new_bfh == NULL) {
408 0 : ret = ENOMEM;
409 0 : goto err;
410 : }
411 :
412 0 : new_bfh->fd = fd;
413 0 : new_bfh->page_sz = page_sz;
414 0 : new_bfh->file_sz = st.st_size;
415 :
416 0 : if (max_sz >= st.st_size) {
417 : /* Whole-file method */
418 0 : new_bfh->cache = malloc(st.st_size + 1);
419 0 : if (new_bfh->cache) {
420 0 : new_bfh->cache[st.st_size] = '\0';
421 0 : new_bfh->cache_sz = st.st_size;
422 0 : ret = read(fd, new_bfh->cache, st.st_size);
423 0 : if (ret < 0) {
424 0 : ret = errno;
425 0 : goto err;
426 : }
427 0 : if (ret != st.st_size) {
428 0 : ret = EIO; /* XXX ??? */
429 0 : goto err;
430 : }
431 0 : if (reads)
432 0 : *reads = 1;
433 0 : (void) close(fd);
434 0 : new_bfh->fd = -1;
435 0 : *bfh = new_bfh;
436 0 : return 0;
437 : }
438 : }
439 :
440 : /* Block-size method, or above malloc() failed */
441 0 : new_bfh->page = malloc(new_bfh->page_sz << 1);
442 0 : if (new_bfh->page == NULL) {
443 : /* Can't even allocate a single double-size page! */
444 0 : ret = ENOMEM;
445 0 : goto err;
446 : }
447 :
448 0 : new_bfh->cache_sz = max_sz < st.st_size ? max_sz : st.st_size;
449 0 : new_bfh->cache = malloc(new_bfh->cache_sz);
450 0 : *bfh = new_bfh;
451 :
452 : /*
453 : * malloc() may have failed because we were asking for a lot of
454 : * memory, but we may still be able to operate without a cache,
455 : * so let's not fail.
456 : */
457 0 : if (new_bfh->cache == NULL) {
458 0 : new_bfh->cache_sz = 0;
459 0 : return 0;
460 : }
461 :
462 : /* Initialize cache */
463 0 : for (i = 0; i < new_bfh->cache_sz; i += new_bfh->page_sz)
464 0 : new_bfh->cache[i] = '\0';
465 0 : return 0;
466 :
467 0 : err:
468 0 : (void) close(fd);
469 0 : if (new_bfh) {
470 0 : free(new_bfh->page);
471 0 : free(new_bfh->cache);
472 0 : free(new_bfh);
473 : }
474 0 : return ret;
475 : }
476 :
477 : /*
478 : * Indicate whether the given binary search file handle will be searched
479 : * with block-wise method.
480 : */
481 : void
482 0 : _bsearch_file_info(bsearch_file_handle bfh,
483 : size_t *page_sz, size_t *max_sz, int *blockwise)
484 : {
485 0 : if (page_sz)
486 0 : *page_sz = bfh->page_sz;
487 0 : if (max_sz)
488 0 : *max_sz = bfh->cache_sz;
489 0 : if (blockwise)
490 0 : *blockwise = (bfh->file_sz != bfh->cache_sz);
491 0 : }
492 :
493 : /*
494 : * Close the given binary file search handle.
495 : *
496 : * Inputs:
497 : *
498 : * @bfh Pointer to variable containing handle to close.
499 : */
500 : void
501 0 : _bsearch_file_close(bsearch_file_handle *bfh)
502 : {
503 0 : if (!*bfh)
504 0 : return;
505 0 : if ((*bfh)->fd >= 0)
506 0 : (void) close((*bfh)->fd);
507 0 : if ((*bfh)->page)
508 0 : free((*bfh)->page);
509 0 : if ((*bfh)->cache)
510 0 : free((*bfh)->cache);
511 0 : free(*bfh);
512 0 : *bfh = NULL;
513 : }
514 :
515 : /*
516 : * Private function to get a page from a cache. The cache is a char
517 : * array of 2^n - 1 double-size page worth of bytes, where n is the
518 : * number of tree levels that the cache stores. The cache can be
519 : * smaller than n implies.
520 : *
521 : * The page may or may not be valid. If the first byte of it is NUL
522 : * then it's not valid, else it is.
523 : *
524 : * Returns 1 if page is in cache and valid, 0 if the cache is too small
525 : * or the page is invalid. The page address is output in @buf if the
526 : * cache is large enough to contain it regardless of whether the page is
527 : * valid.
528 : *
529 : * Inputs:
530 : *
531 : * @bfh Binary search file handle
532 : * @level Level in the tree that we want a page for
533 : * @page_idx Page number in the given level (0..2^level - 1)
534 : *
535 : * Outputs:
536 : *
537 : * @buf Set to address of page if the cache is large enough
538 : */
539 : static int
540 0 : get_page_from_cache(bsearch_file_handle bfh, size_t level, size_t page_idx,
541 : char **buf)
542 : {
543 0 : size_t idx = 0;
544 0 : size_t page_sz;
545 :
546 0 : page_sz = bfh->page_sz << 1; /* we use double-size pages in the cache */
547 :
548 0 : *buf = NULL;
549 :
550 : /*
551 : * Compute index into cache. The cache is basically an array of
552 : * double-size pages. The first (zeroth) double-size page in the
553 : * cache will be the middle page of the file -- the root of the
554 : * tree. The next two double-size pages will be the left and right
555 : * pages of the second level in the tree. The next four double-size
556 : * pages will be the four pages at the next level. And so on for as
557 : * many pages as fit in the cache.
558 : *
559 : * The page index is the number of the page at the given level. We
560 : * then compute (2^level - 1 + page index) * 2page size, check that
561 : * we have that in the cache, check that the page has been read (it
562 : * doesn't start with NUL).
563 : */
564 0 : if (level)
565 0 : idx = (1 << level) - 1 + page_idx;
566 0 : if (((idx + 1) * page_sz * 2) > bfh->cache_sz)
567 0 : return 0;
568 :
569 0 : *buf = &bfh->cache[idx * page_sz * 2];
570 0 : if (bfh->cache[idx * page_sz * 2] == '\0')
571 0 : return 0; /* cache[idx] == NUL -> page not loaded in cache */
572 0 : return 1;
573 : }
574 :
575 : /*
576 : * Private function to read a page of @page_sz from @fd at offset @off
577 : * into @buf, outputing the number of bytes read, which will be the same
578 : * as @page_sz unless the page being read is the last page, in which
579 : * case the number of remaining bytes in the file will be output.
580 : *
581 : * Returns 0 on success or an errno value otherwise (EIO if reads are
582 : * short).
583 : *
584 : * Inputs:
585 : *
586 : * @bfh Binary search file handle
587 : * @level Level in the binary search tree that we're at
588 : * @page_idx Page "index" at the @level of the tree that we want
589 : * @page Actual page number that we want
590 : * want_double Whether we need a page or double page read
591 : *
592 : * Outputs:
593 : *
594 : * @buf Page read or cached
595 : * @bytes Bytes read (may be less than page or double page size in
596 : * the case of the last page, of course)
597 : */
598 : static int
599 0 : read_page(bsearch_file_handle bfh, size_t level, size_t page_idx, size_t page,
600 : int want_double, const char **buf, size_t *bytes)
601 : {
602 0 : int ret;
603 0 : off_t off;
604 0 : size_t expected;
605 0 : size_t wanted;
606 0 : char *page_buf;
607 :
608 : /* Figure out where we're reading and how much */
609 0 : off = page * bfh->page_sz;
610 0 : if (off < 0)
611 0 : return EOVERFLOW;
612 :
613 0 : wanted = bfh->page_sz << want_double;
614 0 : expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off;
615 :
616 0 : if (get_page_from_cache(bfh, level, page_idx, &page_buf)) {
617 0 : *buf = page_buf;
618 0 : *bytes = expected;
619 0 : return 0; /* found in cache */
620 : }
621 :
622 :
623 0 : *bytes = 0;
624 0 : *buf = NULL;
625 :
626 : /* OK, we have to read a page or double-size page */
627 :
628 0 : if (page_buf)
629 0 : want_double = 1; /* we'll be caching; we cache double-size pages */
630 : else
631 0 : page_buf = bfh->page; /* we won't cache this page */
632 :
633 0 : wanted = bfh->page_sz << want_double;
634 0 : expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off;
635 :
636 : #ifdef HAVE_PREAD
637 0 : ret = pread(bfh->fd, page_buf, expected, off);
638 : #else
639 : if (lseek(bfh->fd, off, SEEK_SET) == (off_t)-1)
640 : return errno;
641 : ret = read(bfh->fd, page_buf, expected);
642 : #endif
643 0 : if (ret < 0)
644 0 : return errno;
645 :
646 0 : if (ret != expected)
647 0 : return EIO; /* XXX ??? */
648 :
649 0 : *buf = page_buf;
650 0 : *bytes = expected;
651 0 : return 0;
652 : }
653 :
654 : /*
655 : * Perform a binary search of a file where each line is a record (LF and
656 : * CRLF supported). Each record consists of a key followed by an
657 : * optional value separated from the key by whitespace. Whitespace can
658 : * be quoted with backslashes. It's the caller's responsibility to
659 : * encode/decode keys/values if quoting is desired; newlines should be
660 : * encoded such that a newline does not appear in the result.
661 : *
662 : * The search is done with block-wise I/O (i.e., the whole file is not
663 : * read into memory).
664 : *
665 : * All output arguments are optional.
666 : *
667 : * Returns 0 if key is found, -1 if not found, or an error code such as
668 : * ENOMEM in case of error.
669 : *
670 : * NOTE: We could improve this by not freeing the buffer, instead
671 : * requiring that the caller provide it. Further, we could cache
672 : * the top N levels of [double-size] pages (2^N - 1 pages), which
673 : * should speed up most searches by reducing the number of reads
674 : * by N.
675 : *
676 : * Inputs:
677 : *
678 : * @fd File descriptor (file to search)
679 : * @page_sz Page size (if zero then the file's st_blksize will be used)
680 : * @key Key string to search for
681 : *
682 : * Outputs:
683 : *
684 : * @value Location to store a copy of the value (caller must free)
685 : * @location Record location if found else the location where the
686 : * record should be inserted (index into @buf)
687 : * @loops Location to store a count of bisections required for
688 : * search (useful for confirming logarithmic performance)
689 : * @reads Location to store a count of pages read during search
690 : * (useful for confirming logarithmic performance)
691 : */
692 : int
693 0 : _bsearch_file(bsearch_file_handle bfh, const char *key,
694 : char **value, size_t *location, size_t *loops, size_t *reads)
695 : {
696 0 : int ret;
697 0 : const char *buf;
698 0 : size_t buf_sz;
699 0 : size_t page, l, r;
700 0 : size_t my_reads = 0;
701 0 : size_t my_loops_total = 0;
702 0 : size_t my_loops;
703 0 : size_t level; /* level in the tree */
704 0 : size_t page_idx = 0; /* page number in the tree level */
705 0 : size_t buf_location;
706 0 : int cmp;
707 0 : int buf_ends_in_eol = 0;
708 0 : int buf_is_start = 0;
709 :
710 0 : if (reads)
711 0 : *reads = 0;
712 0 : if (value)
713 0 : *value = NULL;
714 0 : if (loops)
715 0 : *loops = 0;
716 :
717 : /* If whole file is in memory then search that and we're done */
718 0 : if (bfh->file_sz == bfh->cache_sz)
719 0 : return _bsearch_text(bfh->cache, bfh->cache_sz, key, value, location, loops);
720 :
721 : /* Else block-wise binary search */
722 :
723 0 : l = 0;
724 0 : r = (bfh->file_sz / bfh->page_sz) + 1;
725 0 : for (level = 0, page = r >> 1; page >= l && page < r ; level++) {
726 0 : ret = read_page(bfh, level, page_idx, page, 0, &buf, &buf_sz);
727 0 : if (ret != 0)
728 0 : return ret;
729 0 : my_reads++;
730 0 : if (buf[buf_sz - 1] == '\r' || buf[buf_sz - 1] == '\n')
731 0 : buf_ends_in_eol = 1;
732 : else
733 0 : buf_ends_in_eol = 0;
734 :
735 0 : buf_is_start = page == 0 ? 1 : 0;
736 0 : ret = bsearch_common(buf, (size_t)buf_sz, key, buf_is_start,
737 : value, &buf_location, &cmp, &my_loops);
738 0 : if (ret > 0)
739 0 : return ret;
740 : /* Found or no we update stats */
741 0 : my_loops_total += my_loops;
742 0 : if (loops)
743 0 : *loops = my_loops_total;
744 0 : if (reads)
745 0 : *reads = my_reads;
746 0 : if (location)
747 0 : *location = page * bfh->page_sz + buf_location;
748 0 : if (ret == 0)
749 0 : return 0; /* found! */
750 : /* Not found */
751 0 : if (cmp < 0) {
752 : /* Search left */
753 0 : page_idx <<= 1;
754 0 : r = page;
755 0 : page = l + ((r - l) >> 1);
756 0 : continue;
757 : } else {
758 : /*
759 : * Search right, but first search the current and next
760 : * blocks in case that the record we're looking for either
761 : * straddles the boundary between this and the next record,
762 : * or in case the record starts exactly at the next page.
763 : */
764 0 : heim_assert(cmp > 0, "cmp > 0");
765 :
766 0 : if (!buf_ends_in_eol || page == l || page == (r - 1)) {
767 0 : ret = read_page(bfh, level, page_idx, page, 1, &buf, &buf_sz);
768 0 : if (ret != 0)
769 0 : return ret;
770 0 : my_reads++;
771 :
772 0 : buf_is_start = page == l ? 1 : 0;
773 :
774 0 : ret = bsearch_common(buf, (size_t)buf_sz, key, buf_is_start,
775 : value, &buf_location, &cmp, &my_loops);
776 0 : if (ret > 0)
777 0 : return ret;
778 0 : my_loops_total += my_loops;
779 0 : if (loops)
780 0 : *loops = my_loops_total;
781 0 : if (reads)
782 0 : *reads = my_reads;
783 0 : if (location)
784 0 : *location = page * bfh->page_sz + buf_location;
785 0 : if (ret == 0)
786 0 : return 0;
787 : }
788 :
789 : /* Oh well, search right */
790 0 : if (l == page && r == (l + 1))
791 0 : break;
792 0 : page_idx = (page_idx << 1) + 1;
793 0 : l = page;
794 0 : page = l + ((r - l) >> 1);
795 0 : continue;
796 : }
797 : }
798 0 : return -1;
799 : }
800 :
801 :
802 : static int
803 0 : stdb_open(void *plug, const char *dbtype, const char *dbname,
804 : heim_dict_t options, void **db, heim_error_t *error)
805 : {
806 0 : bsearch_file_handle bfh;
807 0 : char *p;
808 0 : int ret;
809 :
810 0 : if (error)
811 0 : *error = NULL;
812 0 : if (dbname == NULL || *dbname == '\0') {
813 0 : if (error)
814 0 : *error = heim_error_create(EINVAL,
815 0 : N_("DB name required for sorted-text DB "
816 : "plugin", ""));
817 0 : return EINVAL;
818 : }
819 0 : p = strrchr(dbname, '.');
820 0 : if (p == NULL || strcmp(p, ".txt") != 0) {
821 0 : if (error)
822 0 : *error = heim_error_create(ENOTSUP,
823 0 : N_("Text file (name ending in .txt) "
824 : "required for sorted-text DB plugin",
825 : ""));
826 0 : return ENOTSUP;
827 : }
828 :
829 0 : ret = _bsearch_file_open(dbname, 0, 0, &bfh, NULL);
830 0 : if (ret)
831 0 : return ret;
832 :
833 0 : *db = bfh;
834 0 : return 0;
835 : }
836 :
837 : static int
838 0 : stdb_close(void *db, heim_error_t *error)
839 : {
840 0 : bsearch_file_handle bfh = db;
841 :
842 0 : if (error)
843 0 : *error = NULL;
844 0 : _bsearch_file_close(&bfh);
845 0 : return 0;
846 : }
847 :
848 : static heim_data_t
849 0 : stdb_copy_value(void *db, heim_string_t table, heim_data_t key,
850 : heim_error_t *error)
851 : {
852 0 : bsearch_file_handle bfh = db;
853 0 : const char *k;
854 0 : char *v = NULL;
855 0 : heim_data_t value;
856 0 : int ret;
857 :
858 0 : if (error)
859 0 : *error = NULL;
860 :
861 0 : if (table == NULL)
862 0 : table = HSTR("");
863 :
864 0 : if (table != HSTR(""))
865 0 : return NULL;
866 :
867 0 : if (heim_get_tid(key) == HEIM_TID_STRING)
868 0 : k = heim_string_get_utf8((heim_string_t)key);
869 : else
870 0 : k = (const char *)heim_data_get_ptr(key);
871 0 : ret = _bsearch_file(bfh, k, &v, NULL, NULL, NULL);
872 0 : if (ret == 0 && v == NULL)
873 0 : ret = -1; /* Quiet lint */
874 0 : if (ret != 0) {
875 0 : if (ret > 0 && error)
876 0 : *error = heim_error_create(ret, "%s", strerror(ret));
877 0 : return NULL;
878 : }
879 0 : value = heim_data_create(v, strlen(v));
880 0 : free(v);
881 : /* XXX Handle ENOMEM */
882 0 : return value;
883 : }
884 :
885 : struct heim_db_type heim_sorted_text_file_dbtype = {
886 : 1, stdb_open, NULL, stdb_close, NULL, NULL, NULL, NULL, NULL, NULL,
887 : stdb_copy_value, NULL, NULL, NULL
888 : };
|