Line data Source code
1 : /*
2 : Unix SMB/CIFS implementation.
3 :
4 : trivial database library
5 :
6 : Copyright (C) Andrew Tridgell 2005
7 :
8 : ** NOTE! The following LGPL license applies to the tdb
9 : ** library. This does NOT imply that all of Samba is released
10 : ** under the LGPL
11 :
12 : This library is free software; you can redistribute it and/or
13 : modify it under the terms of the GNU Lesser General Public
14 : License as published by the Free Software Foundation; either
15 : version 3 of the License, or (at your option) any later version.
16 :
17 : This library is distributed in the hope that it will be useful,
18 : but WITHOUT ANY WARRANTY; without even the implied warranty of
19 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 : Lesser General Public License for more details.
21 :
22 : You should have received a copy of the GNU Lesser General Public
23 : License along with this library; if not, see <http://www.gnu.org/licenses/>.
24 : */
25 :
26 : #include "tdb_private.h"
27 :
28 : /*
29 : transaction design:
30 :
31 : - only allow a single transaction at a time per database. This makes
32 : using the transaction API simpler, as otherwise the caller would
33 : have to cope with temporary failures in transactions that conflict
34 : with other current transactions
35 :
36 : - keep the transaction recovery information in the same file as the
37 : database, using a special 'transaction recovery' record pointed at
38 : by the header. This removes the need for extra journal files as
39 : used by some other databases
40 :
41 : - dynamically allocate the transaction recovery record, re-using it
42 : for subsequent transactions. If a larger record is needed then
43 : tdb_free() the old record to place it on the normal tdb freelist
44 : before allocating the new record
45 :
46 : - during transactions, keep a linked list of all writes that have
47 : been performed by intercepting all tdb_write() calls. The hooked
48 : transaction versions of tdb_read() and tdb_write() check this
49 : linked list and try to use the elements of the list in preference
50 : to the real database.
51 :
52 : - don't allow any locks to be held when a transaction starts,
53 : otherwise we can end up with deadlock (plus lack of lock nesting
54 : in posix locks would mean the lock is lost)
55 :
56 : - if the caller gains a lock during the transaction but doesn't
57 : release it then fail the commit
58 :
59 : - allow for nested calls to tdb_transaction_start(), re-using the
60 : existing transaction record. If the inner transaction is cancelled
61 : then a subsequent commit will fail
62 :
63 : - keep a mirrored copy of the tdb hash chain heads to allow for the
64 : fast hash heads scan on traverse, updating the mirrored copy in
65 : the transaction version of tdb_write
66 :
67 : - allow callers to mix transaction and non-transaction use of tdb,
68 : although once a transaction is started then an exclusive lock is
69 : gained until the transaction is committed or cancelled
70 :
71 : - the commit strategy involves first saving away all modified data
72 : into a linearised buffer in the transaction recovery area, then
73 : marking the transaction recovery area with a magic value to
74 : indicate a valid recovery record. In total 4 fsync/msync calls are
75 : needed per commit to prevent race conditions. It might be possible
76 : to reduce this to 3 or even 2 with some more work.
77 :
78 : - check for a valid recovery record on open of the tdb, while the
79 : open lock is held. Automatically recover from the transaction
80 : recovery area if needed, then continue with the open as
81 : usual. This allows for smooth crash recovery with no administrator
82 : intervention.
83 :
84 : - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
85 : still available, but no fsync/msync calls are made. This means we
86 : are still proof against a process dying during transaction commit,
87 : but not against machine reboot.
88 :
89 : - if TDB_ALLOW_NESTING is passed to flags in tdb open, or added using
90 : tdb_add_flags() transaction nesting is enabled.
91 : It resets the TDB_DISALLOW_NESTING flag, as both cannot be used together.
92 : The default is that transaction nesting is allowed.
93 : Note: this default may change in future versions of tdb.
94 :
95 : Beware: when transactions are nested, a transaction successfully
96 : completed with tdb_transaction_commit() can be silently rolled back later.
97 :
98 : - if TDB_DISALLOW_NESTING is passed to flags in tdb open, or added using
99 : tdb_add_flags() transaction nesting is disabled.
100 : It resets the TDB_ALLOW_NESTING flag, as both cannot be used together.
101 : An attempt to create a nested transaction will fail with TDB_ERR_NESTING.
102 : The default is that transaction nesting is allowed.
103 : Note: this default may change in future versions of tdb.
104 : */
105 :
106 :
107 : /*
108 : hold the context of any current transaction
109 : */
110 : struct tdb_transaction {
111 : /* we keep a mirrored copy of the tdb hash heads here so
112 : tdb_next_hash_chain() can operate efficiently */
113 : uint32_t *hash_heads;
114 :
115 : /* the original io methods - used to do IOs to the real db */
116 : const struct tdb_methods *io_methods;
117 :
118 : /* the list of transaction blocks. When a block is first
119 : written to, it gets created in this list */
120 : uint8_t **blocks;
121 : uint32_t num_blocks;
122 : uint32_t block_size; /* bytes in each block */
123 : uint32_t last_block_size; /* number of valid bytes in the last block */
124 :
125 : /* non-zero when an internal transaction error has
126 : occurred. All write operations will then fail until the
127 : transaction is ended */
128 : int transaction_error;
129 :
130 : /* when inside a transaction we need to keep track of any
131 : nested tdb_transaction_start() calls, as these are allowed,
132 : but don't create a new transaction */
133 : int nesting;
134 :
135 : /* set when a prepare has already occurred */
136 : bool prepared;
137 : tdb_off_t magic_offset;
138 :
139 : /* old file size before transaction */
140 : tdb_len_t old_map_size;
141 :
142 : /* did we expand in this transaction */
143 : bool expanded;
144 : };
145 :
146 :
147 : /*
148 : read while in a transaction. We need to check first if the data is in our list
149 : of transaction elements, then if not do a real read
150 : */
151 3027189020 : static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
152 : tdb_len_t len, int cv)
153 : {
154 325513155 : uint32_t blk;
155 :
156 : /* break it down into block sized ops */
157 3127283275 : while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
158 100094255 : tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
159 100094255 : if (transaction_read(tdb, off, buf, len2, cv) != 0) {
160 0 : return -1;
161 : }
162 100094255 : len -= len2;
163 100094255 : off += len2;
164 100094255 : buf = (void *)(len2 + (char *)buf);
165 : }
166 :
167 3027189020 : if (len == 0) {
168 335 : return 0;
169 : }
170 :
171 3027188685 : blk = off / tdb->transaction->block_size;
172 :
173 : /* see if we have it in the block list */
174 3027188685 : if (tdb->transaction->num_blocks <= blk ||
175 2232217479 : tdb->transaction->blocks[blk] == NULL) {
176 : /* nope, do a real read */
177 1867732479 : if (tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv) != 0) {
178 0 : goto fail;
179 : }
180 1785990719 : return 0;
181 : }
182 :
183 : /* it is in the block list. Now check for the last block */
184 1159456206 : if (blk == tdb->transaction->num_blocks-1) {
185 109225021 : if (len > tdb->transaction->last_block_size) {
186 0 : goto fail;
187 : }
188 : }
189 :
190 : /* now copy it out of this block */
191 1159456206 : memcpy(buf, tdb->transaction->blocks[blk] + (off % tdb->transaction->block_size), len);
192 1159456206 : if (cv) {
193 0 : tdb_convert(buf, len);
194 : }
195 915684811 : return 0;
196 :
197 0 : fail:
198 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%u len=%u\n", off, len));
199 0 : tdb->ecode = TDB_ERR_IO;
200 0 : tdb->transaction->transaction_error = 1;
201 0 : return -1;
202 : }
203 :
204 :
205 : /*
206 : write while in a transaction
207 : */
208 118851893 : static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
209 : const void *buf, tdb_len_t len)
210 : {
211 9263613 : uint32_t blk;
212 :
213 118851893 : if (buf == NULL) {
214 0 : return -1;
215 : }
216 :
217 : /* Only a commit is allowed on a prepared transaction */
218 118851893 : if (tdb->transaction->prepared) {
219 0 : tdb->ecode = TDB_ERR_EINVAL;
220 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: transaction already prepared, write not allowed\n"));
221 0 : tdb->transaction->transaction_error = 1;
222 0 : return -1;
223 : }
224 :
225 : /* if the write is to a hash head, then update the transaction
226 : hash heads */
227 118851893 : if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
228 32882901 : off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
229 11497352 : uint32_t chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
230 11497352 : memcpy(&tdb->transaction->hash_heads[chain], buf, len);
231 : }
232 :
233 : /* break it up into block sized chunks */
234 125044657 : while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
235 6192764 : tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
236 6192764 : if (transaction_write(tdb, off, buf, len2) != 0) {
237 0 : return -1;
238 : }
239 6192764 : len -= len2;
240 6192764 : off += len2;
241 6192764 : buf = (const void *)(len2 + (const char *)buf);
242 : }
243 :
244 118851893 : if (len == 0) {
245 1 : return 0;
246 : }
247 :
248 118851892 : blk = off / tdb->transaction->block_size;
249 118851892 : off = off % tdb->transaction->block_size;
250 :
251 118851892 : if (tdb->transaction->num_blocks <= blk) {
252 219926 : uint8_t **new_blocks;
253 : /* expand the blocks array */
254 6249979 : new_blocks = (uint8_t **)realloc(tdb->transaction->blocks,
255 6249979 : (blk+1)*sizeof(uint8_t *));
256 6249979 : if (new_blocks == NULL) {
257 0 : tdb->ecode = TDB_ERR_OOM;
258 0 : goto fail;
259 : }
260 6249979 : memset(&new_blocks[tdb->transaction->num_blocks], 0,
261 6249979 : (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
262 6249979 : tdb->transaction->blocks = new_blocks;
263 6249979 : tdb->transaction->num_blocks = blk+1;
264 6249979 : tdb->transaction->last_block_size = 0;
265 : }
266 :
267 : /* allocate and fill a block? */
268 118851892 : if (tdb->transaction->blocks[blk] == NULL) {
269 12133989 : tdb->transaction->blocks[blk] = (uint8_t *)calloc(tdb->transaction->block_size, 1);
270 12133989 : if (tdb->transaction->blocks[blk] == NULL) {
271 0 : tdb->ecode = TDB_ERR_OOM;
272 0 : tdb->transaction->transaction_error = 1;
273 0 : return -1;
274 : }
275 12133989 : if (tdb->transaction->old_map_size > blk * tdb->transaction->block_size) {
276 9684486 : tdb_len_t len2 = tdb->transaction->block_size;
277 9684486 : if (len2 + (blk * tdb->transaction->block_size) > tdb->transaction->old_map_size) {
278 74647 : len2 = tdb->transaction->old_map_size - (blk * tdb->transaction->block_size);
279 : }
280 9684486 : if (tdb->transaction->io_methods->tdb_read(tdb, blk * tdb->transaction->block_size,
281 9541432 : tdb->transaction->blocks[blk],
282 : len2, 0) != 0) {
283 0 : SAFE_FREE(tdb->transaction->blocks[blk]);
284 0 : tdb->ecode = TDB_ERR_IO;
285 0 : goto fail;
286 : }
287 9684486 : if (blk == tdb->transaction->num_blocks-1) {
288 3800476 : tdb->transaction->last_block_size = len2;
289 : }
290 : }
291 : }
292 :
293 : /* overwrite part of an existing block */
294 118851892 : memcpy(tdb->transaction->blocks[blk] + off, buf, len);
295 118851892 : if (blk == tdb->transaction->num_blocks-1) {
296 27181477 : if (len + off > tdb->transaction->last_block_size) {
297 2536310 : tdb->transaction->last_block_size = len + off;
298 : }
299 : }
300 :
301 109588279 : return 0;
302 :
303 0 : fail:
304 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%u len=%u\n",
305 : (blk*tdb->transaction->block_size) + off, len));
306 0 : tdb->transaction->transaction_error = 1;
307 0 : return -1;
308 : }
309 :
310 :
311 : /*
312 : write while in a transaction - this variant never expands the transaction blocks, it only
313 : updates existing blocks. This means it cannot change the recovery size
314 : */
315 12020014 : static int transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
316 : const void *buf, tdb_len_t len)
317 : {
318 179403 : uint32_t blk;
319 :
320 : /* break it up into block sized chunks */
321 21698983 : while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
322 9678969 : tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
323 9678969 : if (transaction_write_existing(tdb, off, buf, len2) != 0) {
324 0 : return -1;
325 : }
326 9678969 : len -= len2;
327 9678969 : off += len2;
328 9678969 : if (buf != NULL) {
329 9678969 : buf = (const void *)(len2 + (const char *)buf);
330 : }
331 : }
332 :
333 12020014 : if (len == 0 || buf == NULL) {
334 0 : return 0;
335 : }
336 :
337 12020014 : blk = off / tdb->transaction->block_size;
338 12020014 : off = off % tdb->transaction->block_size;
339 :
340 12020014 : if (tdb->transaction->num_blocks <= blk ||
341 6153491 : tdb->transaction->blocks[blk] == NULL) {
342 11768774 : return 0;
343 : }
344 :
345 75010 : if (blk == tdb->transaction->num_blocks-1 &&
346 4 : off + len > tdb->transaction->last_block_size) {
347 0 : if (off >= tdb->transaction->last_block_size) {
348 0 : return 0;
349 : }
350 0 : len = tdb->transaction->last_block_size - off;
351 : }
352 :
353 : /* overwrite part of an existing block */
354 75010 : memcpy(tdb->transaction->blocks[blk] + off, buf, len);
355 :
356 75010 : return 0;
357 : }
358 :
359 :
360 : /*
361 : accelerated hash chain head search, using the cached hash heads
362 : */
363 60264815 : static void transaction_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
364 : {
365 60264815 : uint32_t h = *chain;
366 410963894 : for (;h < tdb->hash_size;h++) {
367 : /* the +1 takes account of the freelist */
368 410888252 : if (0 != tdb->transaction->hash_heads[h+1]) {
369 59650749 : break;
370 : }
371 : }
372 60264815 : (*chain) = h;
373 60264815 : }
374 :
375 : /*
376 : out of bounds check during a transaction
377 : */
378 84794 : static int transaction_oob(struct tdb_context *tdb, tdb_off_t off,
379 : tdb_len_t len, int probe)
380 : {
381 : /*
382 : * This duplicates functionality from tdb_oob(). Don't remove:
383 : * we still have direct callers of tdb->methods->tdb_oob()
384 : * inside transaction.c.
385 : */
386 84794 : if (off + len >= off && off + len <= tdb->map_size) {
387 0 : return 0;
388 : }
389 84794 : tdb->ecode = TDB_ERR_IO;
390 84794 : return -1;
391 : }
392 :
393 : /*
394 : transaction version of tdb_expand().
395 : */
396 84776 : static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size,
397 : tdb_off_t addition)
398 : {
399 84776 : const char buf_zero[8192] = {0};
400 84776 : size_t buf_len = sizeof(buf_zero);
401 :
402 1354119 : while (addition > 0) {
403 1269343 : size_t n = MIN(addition, buf_len);
404 97731 : int ret;
405 :
406 1269343 : ret = transaction_write(tdb, size, buf_zero, n);
407 1269343 : if (ret != 0) {
408 0 : return ret;
409 : }
410 :
411 1269343 : addition -= n;
412 1269343 : size += n;
413 : }
414 :
415 84776 : tdb->transaction->expanded = true;
416 :
417 84776 : return 0;
418 : }
419 :
420 : static const struct tdb_methods transaction_methods = {
421 : transaction_read,
422 : transaction_write,
423 : transaction_next_hash_chain,
424 : transaction_oob,
425 : transaction_expand_file,
426 : };
427 :
428 : /*
429 : * Is a transaction currently active on this context?
430 : *
431 : */
432 998948031 : _PUBLIC_ bool tdb_transaction_active(struct tdb_context *tdb)
433 : {
434 998948031 : return (tdb->transaction != NULL);
435 : }
436 :
437 : /*
438 : start a tdb transaction. No token is returned, as only a single
439 : transaction is allowed to be pending per tdb_context
440 : */
441 6206990 : static int _tdb_transaction_start(struct tdb_context *tdb,
442 : enum tdb_lock_flags lockflags)
443 : {
444 : /* some sanity checks */
445 6206990 : if (tdb->read_only || (tdb->flags & TDB_INTERNAL)
446 6206990 : || tdb->traverse_read) {
447 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
448 0 : tdb->ecode = TDB_ERR_EINVAL;
449 0 : return -1;
450 : }
451 :
452 : /* cope with nested tdb_transaction_start() calls */
453 6206990 : if (tdb->transaction != NULL) {
454 2840232 : if (!(tdb->flags & TDB_ALLOW_NESTING)) {
455 1 : tdb->ecode = TDB_ERR_NESTING;
456 1 : return -1;
457 : }
458 2840231 : tdb->transaction->nesting++;
459 2840231 : TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n",
460 : tdb->transaction->nesting));
461 2840231 : return 0;
462 : }
463 :
464 3366758 : if (tdb_have_extra_locks(tdb)) {
465 : /* the caller must not have any locks when starting a
466 : transaction as otherwise we'll be screwed by lack
467 : of nested locks in posix */
468 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n"));
469 0 : tdb->ecode = TDB_ERR_LOCK;
470 0 : return -1;
471 : }
472 :
473 3366758 : if (tdb->travlocks.next != NULL) {
474 : /* you cannot use transactions inside a traverse (although you can use
475 : traverse inside a transaction) as otherwise you can end up with
476 : deadlock */
477 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n"));
478 0 : tdb->ecode = TDB_ERR_LOCK;
479 0 : return -1;
480 : }
481 :
482 3366758 : tdb->transaction = (struct tdb_transaction *)
483 3366758 : calloc(sizeof(struct tdb_transaction), 1);
484 3366758 : if (tdb->transaction == NULL) {
485 0 : tdb->ecode = TDB_ERR_OOM;
486 0 : return -1;
487 : }
488 :
489 : /* a page at a time seems like a reasonable compromise between compactness and efficiency */
490 3366758 : tdb->transaction->block_size = tdb->page_size;
491 :
492 : /* get the transaction write lock. This is a blocking lock. As
493 : discussed with Volker, there are a number of ways we could
494 : make this async, which we will probably do in the future */
495 3366758 : if (tdb_transaction_lock(tdb, F_WRLCK, lockflags) == -1) {
496 6 : SAFE_FREE(tdb->transaction->blocks);
497 6 : SAFE_FREE(tdb->transaction);
498 6 : if ((lockflags & TDB_LOCK_WAIT) == 0) {
499 1 : tdb->ecode = TDB_ERR_NOLOCK;
500 : } else {
501 5 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
502 : "tdb_transaction_start: "
503 : "failed to get transaction lock\n"));
504 : }
505 6 : return -1;
506 : }
507 :
508 : /* get a read lock from the freelist to the end of file. This
509 : is upgraded to a write lock during the commit */
510 3366752 : if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true) == -1) {
511 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
512 0 : goto fail_allrecord_lock;
513 : }
514 :
515 : /* setup a copy of the hash table heads so the hash scan in
516 : traverse can be fast */
517 3366752 : tdb->transaction->hash_heads = (uint32_t *)
518 3366752 : calloc(tdb->hash_size+1, sizeof(uint32_t));
519 3366752 : if (tdb->transaction->hash_heads == NULL) {
520 0 : tdb->ecode = TDB_ERR_OOM;
521 0 : goto fail;
522 : }
523 3366752 : if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
524 3343061 : TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
525 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n"));
526 0 : tdb->ecode = TDB_ERR_IO;
527 0 : goto fail;
528 : }
529 :
530 : /* make sure we know about any file expansions already done by
531 : anyone else */
532 3366752 : tdb_oob(tdb, tdb->map_size, 1, 1);
533 3366752 : tdb->transaction->old_map_size = tdb->map_size;
534 :
535 : /* finally hook the io methods, replacing them with
536 : transaction specific methods */
537 3366752 : tdb->transaction->io_methods = tdb->methods;
538 3366752 : tdb->methods = &transaction_methods;
539 :
540 : /* Trace at the end, so we get sequence number correct. */
541 23691 : tdb_trace(tdb, "tdb_transaction_start");
542 3366752 : return 0;
543 :
544 0 : fail:
545 0 : tdb_allrecord_unlock(tdb, F_RDLCK, false);
546 0 : fail_allrecord_lock:
547 0 : tdb_transaction_unlock(tdb, F_WRLCK);
548 0 : SAFE_FREE(tdb->transaction->blocks);
549 0 : SAFE_FREE(tdb->transaction->hash_heads);
550 0 : SAFE_FREE(tdb->transaction);
551 0 : return -1;
552 : }
553 :
554 6206989 : _PUBLIC_ int tdb_transaction_start(struct tdb_context *tdb)
555 : {
556 6206989 : return _tdb_transaction_start(tdb, TDB_LOCK_WAIT);
557 : }
558 :
559 1 : _PUBLIC_ int tdb_transaction_start_nonblock(struct tdb_context *tdb)
560 : {
561 1 : return _tdb_transaction_start(tdb, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
562 : }
563 :
564 : /*
565 : sync to disk
566 : */
567 4532037 : static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
568 : {
569 4532037 : if (tdb->flags & TDB_NOSYNC) {
570 4456821 : return 0;
571 : }
572 :
573 : #ifdef HAVE_FDATASYNC
574 8400 : if (fdatasync(tdb->fd) != 0) {
575 : #else
576 : if (fsync(tdb->fd) != 0) {
577 : #endif
578 0 : tdb->ecode = TDB_ERR_IO;
579 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
580 0 : return -1;
581 : }
582 : #ifdef HAVE_MMAP
583 8400 : if (tdb->map_ptr) {
584 8400 : tdb_off_t moffset = offset & ~(tdb->page_size-1);
585 8400 : if (msync(moffset + (char *)tdb->map_ptr,
586 8400 : length + (offset - moffset), MS_SYNC) != 0) {
587 0 : tdb->ecode = TDB_ERR_IO;
588 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
589 : strerror(errno)));
590 0 : return -1;
591 : }
592 : }
593 : #endif
594 8400 : return 0;
595 : }
596 :
597 :
598 3366767 : static int _tdb_transaction_cancel(struct tdb_context *tdb)
599 : {
600 23688 : uint32_t i;
601 3366767 : int ret = 0;
602 :
603 3366767 : if (tdb->transaction == NULL) {
604 4 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
605 4 : return -1;
606 : }
607 :
608 3366763 : if (tdb->transaction->nesting != 0) {
609 0 : tdb->transaction->transaction_error = 1;
610 0 : tdb->transaction->nesting--;
611 0 : return 0;
612 : }
613 :
614 3366763 : tdb->map_size = tdb->transaction->old_map_size;
615 :
616 : /* free all the transaction blocks */
617 7846569 : for (i=0;i<tdb->transaction->num_blocks;i++) {
618 4479806 : if ((tdb->transaction->blocks != NULL) &&
619 4479806 : tdb->transaction->blocks[i] != NULL) {
620 8902 : free(tdb->transaction->blocks[i]);
621 : }
622 : }
623 3366763 : SAFE_FREE(tdb->transaction->blocks);
624 :
625 3366763 : if (tdb->transaction->magic_offset) {
626 1133029 : const struct tdb_methods *methods = tdb->transaction->io_methods;
627 1133029 : const uint32_t invalid = TDB_RECOVERY_INVALID_MAGIC;
628 :
629 : /* remove the recovery marker */
630 2266013 : if (methods->tdb_write(tdb, tdb->transaction->magic_offset, &invalid, 4) == -1 ||
631 1132984 : transaction_sync(tdb, tdb->transaction->magic_offset, 4) == -1) {
632 39 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_cancel: failed to remove recovery magic\n"));
633 39 : ret = -1;
634 : }
635 : }
636 :
637 : /* This also removes the OPEN_LOCK, if we have it. */
638 3366757 : tdb_release_transaction_locks(tdb);
639 :
640 : /* restore the normal io methods */
641 3366748 : tdb->methods = tdb->transaction->io_methods;
642 :
643 3366748 : SAFE_FREE(tdb->transaction->hash_heads);
644 3366748 : SAFE_FREE(tdb->transaction);
645 :
646 3343060 : return ret;
647 : }
648 :
649 : /*
650 : cancel the current transaction
651 : */
652 288749 : _PUBLIC_ int tdb_transaction_cancel(struct tdb_context *tdb)
653 : {
654 1953 : tdb_trace(tdb, "tdb_transaction_cancel");
655 288749 : return _tdb_transaction_cancel(tdb);
656 : }
657 :
658 : /*
659 : work out how much space the linearised recovery data will consume
660 : */
661 1133082 : static bool tdb_recovery_size(struct tdb_context *tdb, tdb_len_t *result)
662 : {
663 1133082 : tdb_len_t recovery_size = 0;
664 16707 : uint32_t i;
665 :
666 1133082 : recovery_size = sizeof(uint32_t);
667 8409393723 : for (i=0;i<tdb->transaction->num_blocks;i++) {
668 2133104 : tdb_len_t block_size;
669 8408344518 : if (i * tdb->transaction->block_size >= tdb->transaction->old_map_size) {
670 79775 : break;
671 : }
672 8408260641 : if (tdb->transaction->blocks[i] == NULL) {
673 8398276690 : continue;
674 : }
675 9983951 : if (!tdb_add_len_t(recovery_size, 2*sizeof(tdb_off_t),
676 : &recovery_size)) {
677 0 : return false;
678 : }
679 9983951 : if (i == tdb->transaction->num_blocks-1) {
680 1049205 : block_size = tdb->transaction->last_block_size;
681 : } else {
682 8934746 : block_size = tdb->transaction->block_size;
683 : }
684 9983951 : if (!tdb_add_len_t(recovery_size, block_size,
685 : &recovery_size)) {
686 0 : return false;
687 : }
688 : }
689 :
690 1133082 : *result = recovery_size;
691 1133082 : return true;
692 : }
693 :
694 1133032 : int tdb_recovery_area(struct tdb_context *tdb,
695 : const struct tdb_methods *methods,
696 : tdb_off_t *recovery_offset,
697 : struct tdb_record *rec)
698 : {
699 16704 : int ret;
700 :
701 1133032 : if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, recovery_offset) == -1) {
702 0 : return -1;
703 : }
704 :
705 1133032 : if (*recovery_offset == 0) {
706 74976 : rec->rec_len = 0;
707 74976 : return 0;
708 : }
709 :
710 1058056 : if (methods->tdb_read(tdb, *recovery_offset, rec, sizeof(*rec),
711 1058056 : DOCONV()) == -1) {
712 0 : return -1;
713 : }
714 :
715 : /* ignore invalid recovery regions: can happen in crash */
716 1058056 : if (rec->magic != TDB_RECOVERY_MAGIC &&
717 1044526 : rec->magic != TDB_RECOVERY_INVALID_MAGIC) {
718 0 : *recovery_offset = 0;
719 0 : rec->rec_len = 0;
720 : }
721 :
722 1058056 : ret = methods->tdb_oob(tdb, *recovery_offset, rec->rec_len, 1);
723 1058056 : if (ret == -1) {
724 0 : *recovery_offset = 0;
725 0 : rec->rec_len = 0;
726 : }
727 :
728 1044526 : return 0;
729 : }
730 :
731 : /*
732 : allocate the recovery area, or use an existing recovery area if it is
733 : large enough
734 : */
735 1133026 : static int tdb_recovery_allocate(struct tdb_context *tdb,
736 : tdb_len_t *recovery_size,
737 : tdb_off_t *recovery_offset,
738 : tdb_len_t *recovery_max_size)
739 : {
740 16704 : struct tdb_record rec;
741 1133026 : const struct tdb_methods *methods = tdb->transaction->io_methods;
742 16704 : tdb_off_t recovery_head, new_end;
743 :
744 1133026 : if (tdb_recovery_area(tdb, methods, &recovery_head, &rec) == -1) {
745 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
746 0 : return -1;
747 : }
748 :
749 1133026 : if (!tdb_recovery_size(tdb, recovery_size)) {
750 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: "
751 : "overflow recovery size\n"));
752 0 : return -1;
753 : }
754 :
755 : /* Existing recovery area? */
756 1133026 : if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
757 : /* it fits in the existing area */
758 1057997 : *recovery_max_size = rec.rec_len;
759 1057997 : *recovery_offset = recovery_head;
760 1057997 : return 0;
761 : }
762 :
763 : /* If recovery area in middle of file, we need a new one. */
764 75029 : if (recovery_head == 0
765 59 : || recovery_head + sizeof(rec) + rec.rec_len != tdb->map_size) {
766 : /* we need to free up the old recovery area, then allocate a
767 : new one at the end of the file. Note that we cannot use
768 : tdb_allocate() to allocate the new one as that might return
769 : us an area that is being currently used (as of the start of
770 : the transaction) */
771 75026 : if (recovery_head) {
772 56 : if (tdb_free(tdb, recovery_head, &rec) == -1) {
773 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL,
774 : "tdb_recovery_allocate: failed to"
775 : " free previous recovery area\n"));
776 0 : return -1;
777 : }
778 :
779 : /* the tdb_free() call might have increased
780 : * the recovery size */
781 56 : if (!tdb_recovery_size(tdb, recovery_size)) {
782 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL,
783 : "tdb_recovery_allocate: "
784 : "overflow recovery size\n"));
785 0 : return -1;
786 : }
787 : }
788 :
789 : /* New head will be at end of file. */
790 75026 : recovery_head = tdb->map_size;
791 : }
792 :
793 : /* Now we know where it will be. */
794 75029 : *recovery_offset = recovery_head;
795 :
796 : /* Expand by more than we need, so we don't do it often. */
797 75029 : *recovery_max_size = tdb_expand_adjust(tdb->map_size,
798 : *recovery_size,
799 : tdb->page_size)
800 75029 : - sizeof(rec);
801 :
802 78206 : if (!tdb_add_off_t(recovery_head, sizeof(rec), &new_end) ||
803 75029 : !tdb_add_off_t(new_end, *recovery_max_size, &new_end)) {
804 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: "
805 : "overflow recovery area\n"));
806 0 : return -1;
807 : }
808 :
809 75029 : if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
810 75029 : new_end - tdb->transaction->old_map_size)
811 : == -1) {
812 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
813 0 : return -1;
814 : }
815 :
816 : /* remap the file (if using mmap) */
817 75029 : methods->tdb_oob(tdb, tdb->map_size, 1, 1);
818 :
819 : /* we have to reset the old map size so that we don't try to expand the file
820 : again in the transaction commit, which would destroy the recovery area */
821 75029 : tdb->transaction->old_map_size = tdb->map_size;
822 :
823 : /* write the recovery header offset and sync - we can sync without a race here
824 : as the magic ptr in the recovery record has not been set */
825 75029 : CONVERT(recovery_head);
826 75029 : if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD,
827 : &recovery_head, sizeof(tdb_off_t)) == -1) {
828 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
829 0 : return -1;
830 : }
831 75023 : if (transaction_write_existing(tdb, TDB_RECOVERY_HEAD, &recovery_head, sizeof(tdb_off_t)) == -1) {
832 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
833 0 : return -1;
834 : }
835 :
836 71846 : return 0;
837 : }
838 :
839 :
840 : /*
841 : setup the recovery data that will be used on a crash during commit
842 : */
843 1133026 : static int transaction_setup_recovery(struct tdb_context *tdb,
844 : tdb_off_t *magic_offset)
845 : {
846 16704 : tdb_len_t recovery_size;
847 16704 : unsigned char *data, *p;
848 1133026 : const struct tdb_methods *methods = tdb->transaction->io_methods;
849 16704 : struct tdb_record *rec;
850 16704 : tdb_off_t recovery_offset, recovery_max_size;
851 1133026 : tdb_off_t old_map_size = tdb->transaction->old_map_size;
852 16704 : uint32_t magic, tailer;
853 16704 : uint32_t i;
854 :
855 : /*
856 : check that the recovery area has enough space
857 : */
858 1133026 : if (tdb_recovery_allocate(tdb, &recovery_size,
859 : &recovery_offset, &recovery_max_size) == -1) {
860 0 : return -1;
861 : }
862 :
863 1133020 : rec = malloc(recovery_size + sizeof(*rec));
864 1133020 : if (rec == NULL) {
865 0 : tdb->ecode = TDB_ERR_OOM;
866 0 : return -1;
867 : }
868 :
869 1133020 : memset(rec, 0, sizeof(*rec));
870 :
871 1133020 : rec->magic = TDB_RECOVERY_INVALID_MAGIC;
872 1133020 : rec->data_len = recovery_size;
873 1133020 : rec->rec_len = recovery_max_size;
874 1133020 : rec->key_len = old_map_size;
875 1133020 : CONVERT(*rec);
876 :
877 1133020 : data = (unsigned char *)rec;
878 :
879 : /* build the recovery data into a single blob to allow us to do a single
880 : large write, which should be more efficient */
881 1133020 : p = data + sizeof(*rec);
882 8411100601 : for (i=0;i<tdb->transaction->num_blocks;i++) {
883 2314663 : tdb_off_t offset;
884 2314663 : tdb_len_t length;
885 :
886 8409967581 : if (tdb->transaction->blocks[i] == NULL) {
887 8400291882 : continue;
888 : }
889 :
890 12125202 : offset = i * tdb->transaction->block_size;
891 12125202 : length = tdb->transaction->block_size;
892 12125202 : if (i == tdb->transaction->num_blocks-1) {
893 1133020 : length = tdb->transaction->last_block_size;
894 : }
895 :
896 12125202 : if (offset >= old_map_size) {
897 2449503 : continue;
898 : }
899 9675699 : if (offset + length > tdb->transaction->old_map_size) {
900 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n"));
901 0 : free(data);
902 0 : tdb->ecode = TDB_ERR_CORRUPT;
903 0 : return -1;
904 : }
905 9675699 : memcpy(p, &offset, 4);
906 9675699 : memcpy(p+4, &length, 4);
907 9675699 : if (DOCONV()) {
908 0 : tdb_convert(p, 8);
909 : }
910 : /* the recovery area contains the old data, not the
911 : new data, so we have to call the original tdb_read
912 : method to get it */
913 9675699 : if (methods->tdb_read(tdb, offset, p + 8, length, 0) != 0) {
914 0 : free(data);
915 0 : tdb->ecode = TDB_ERR_IO;
916 0 : return -1;
917 : }
918 9675699 : p += 8 + length;
919 : }
920 :
921 : /* and the tailer */
922 1133020 : tailer = sizeof(*rec) + recovery_max_size;
923 1133020 : memcpy(p, &tailer, 4);
924 1133020 : if (DOCONV()) {
925 0 : tdb_convert(p, 4);
926 : }
927 :
928 : /* write the recovery data to the recovery area */
929 1133020 : if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
930 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n"));
931 0 : free(data);
932 0 : tdb->ecode = TDB_ERR_IO;
933 0 : return -1;
934 : }
935 1133014 : if (transaction_write_existing(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
936 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery data\n"));
937 0 : free(data);
938 0 : tdb->ecode = TDB_ERR_IO;
939 0 : return -1;
940 : }
941 :
942 : /* as we don't have ordered writes, we have to sync the recovery
943 : data before we update the magic to indicate that the recovery
944 : data is present */
945 1133014 : if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
946 0 : free(data);
947 0 : return -1;
948 : }
949 :
950 1133014 : free(data);
951 :
952 1133014 : magic = TDB_RECOVERY_MAGIC;
953 1133014 : CONVERT(magic);
954 :
955 1133014 : *magic_offset = recovery_offset + offsetof(struct tdb_record, magic);
956 :
957 1133014 : if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
958 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n"));
959 0 : tdb->ecode = TDB_ERR_IO;
960 0 : return -1;
961 : }
962 1133008 : if (transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
963 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery magic\n"));
964 0 : tdb->ecode = TDB_ERR_IO;
965 0 : return -1;
966 : }
967 :
968 : /* ensure the recovery magic marker is on disk */
969 1133008 : if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
970 0 : return -1;
971 : }
972 :
973 1116304 : return 0;
974 : }
975 :
976 2993390 : static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
977 : {
978 20860 : const struct tdb_methods *methods;
979 :
980 2993390 : if (tdb->transaction == NULL) {
981 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: no transaction\n"));
982 0 : return -1;
983 : }
984 :
985 2993390 : if (tdb->transaction->prepared) {
986 0 : tdb->ecode = TDB_ERR_EINVAL;
987 0 : _tdb_transaction_cancel(tdb);
988 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction already prepared\n"));
989 0 : return -1;
990 : }
991 :
992 2993390 : if (tdb->transaction->transaction_error) {
993 0 : tdb->ecode = TDB_ERR_IO;
994 0 : _tdb_transaction_cancel(tdb);
995 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction error pending\n"));
996 0 : return -1;
997 : }
998 :
999 :
1000 2993390 : if (tdb->transaction->nesting != 0) {
1001 0 : return 0;
1002 : }
1003 :
1004 : /* check for a null transaction */
1005 2993390 : if (tdb->transaction->blocks == NULL) {
1006 1856206 : return 0;
1007 : }
1008 :
1009 1133028 : methods = tdb->transaction->io_methods;
1010 :
1011 : /* if there are any locks pending then the caller has not
1012 : nested their locks properly, so fail the transaction */
1013 1133028 : if (tdb_have_extra_locks(tdb)) {
1014 0 : tdb->ecode = TDB_ERR_LOCK;
1015 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: locks pending on commit\n"));
1016 0 : _tdb_transaction_cancel(tdb);
1017 0 : return -1;
1018 : }
1019 :
1020 : /* upgrade the main transaction lock region to a write lock */
1021 1133028 : if (tdb_allrecord_upgrade(tdb) == -1) {
1022 2 : if (tdb->ecode == TDB_ERR_RDONLY && tdb->read_only) {
1023 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
1024 : "tdb_transaction_prepare_commit: "
1025 : "failed to upgrade hash locks: "
1026 : "database is read only\n"));
1027 2 : } else if (tdb->ecode == TDB_ERR_RDONLY
1028 0 : && tdb->traverse_read) {
1029 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
1030 : "tdb_transaction_prepare_commit: "
1031 : "failed to upgrade hash locks: "
1032 : "a database traverse is in progress\n"));
1033 : } else {
1034 2 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
1035 : "tdb_transaction_prepare_commit: "
1036 : "failed to upgrade hash locks: %s\n",
1037 : tdb_errorstr(tdb)));
1038 : }
1039 2 : _tdb_transaction_cancel(tdb);
1040 2 : return -1;
1041 : }
1042 :
1043 : /* get the open lock - this prevents new users attaching to the database
1044 : during the commit */
1045 1133026 : if (tdb_nest_lock(tdb, OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) {
1046 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to get open lock\n"));
1047 0 : _tdb_transaction_cancel(tdb);
1048 0 : return -1;
1049 : }
1050 :
1051 : /* write the recovery data to the end of the file */
1052 1133026 : if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
1053 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: failed to setup recovery data\n"));
1054 0 : _tdb_transaction_cancel(tdb);
1055 0 : return -1;
1056 : }
1057 :
1058 1133008 : tdb->transaction->prepared = true;
1059 :
1060 : /* expand the file to the new size if needed */
1061 1133008 : if (tdb->map_size != tdb->transaction->old_map_size) {
1062 9198 : if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
1063 8217 : tdb->map_size -
1064 8217 : tdb->transaction->old_map_size) == -1) {
1065 0 : tdb->ecode = TDB_ERR_IO;
1066 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: expansion failed\n"));
1067 0 : _tdb_transaction_cancel(tdb);
1068 0 : return -1;
1069 : }
1070 9198 : tdb->map_size = tdb->transaction->old_map_size;
1071 9198 : methods->tdb_oob(tdb, tdb->map_size, 1, 1);
1072 : }
1073 :
1074 : /* Keep the open lock until the actual commit */
1075 :
1076 1116304 : return 0;
1077 : }
1078 :
1079 : /*
1080 : prepare to commit the current transaction
1081 : */
1082 2828928 : _PUBLIC_ int tdb_transaction_prepare_commit(struct tdb_context *tdb)
1083 : {
1084 16715 : tdb_trace(tdb, "tdb_transaction_prepare_commit");
1085 2828928 : return _tdb_transaction_prepare_commit(tdb);
1086 : }
1087 :
1088 : /* A repack is worthwhile if the largest is less than half total free. */
1089 83877 : static bool repack_worthwhile(struct tdb_context *tdb)
1090 : {
1091 4100 : tdb_off_t ptr;
1092 4100 : struct tdb_record rec;
1093 83877 : tdb_len_t total = 0, largest = 0;
1094 :
1095 83877 : if (tdb_ofs_read(tdb, FREELIST_TOP, &ptr) == -1) {
1096 0 : return false;
1097 : }
1098 :
1099 374568 : while (ptr != 0 && tdb_rec_free_read(tdb, ptr, &rec) == 0) {
1100 290691 : total += rec.rec_len;
1101 290691 : if (rec.rec_len > largest) {
1102 80086 : largest = rec.rec_len;
1103 : }
1104 290691 : ptr = rec.next;
1105 : }
1106 :
1107 83877 : return total > largest * 2;
1108 : }
1109 :
1110 : /*
1111 : commit the current transaction
1112 : */
1113 5918285 : _PUBLIC_ int tdb_transaction_commit(struct tdb_context *tdb)
1114 : {
1115 102092 : const struct tdb_methods *methods;
1116 102092 : uint32_t i;
1117 5918285 : bool need_repack = false;
1118 :
1119 5918285 : if (tdb->transaction == NULL) {
1120 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
1121 0 : return -1;
1122 : }
1123 :
1124 102092 : tdb_trace(tdb, "tdb_transaction_commit");
1125 :
1126 5918285 : if (tdb->transaction->transaction_error) {
1127 0 : tdb->ecode = TDB_ERR_IO;
1128 0 : _tdb_transaction_cancel(tdb);
1129 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
1130 0 : return -1;
1131 : }
1132 :
1133 :
1134 5918285 : if (tdb->transaction->nesting != 0) {
1135 2840231 : tdb->transaction->nesting--;
1136 2840231 : return 0;
1137 : }
1138 :
1139 : /* check for a null transaction */
1140 3078054 : if (tdb->transaction->blocks == NULL) {
1141 1945033 : _tdb_transaction_cancel(tdb);
1142 1945033 : return 0;
1143 : }
1144 :
1145 1133021 : if (!tdb->transaction->prepared) {
1146 164462 : int ret = _tdb_transaction_prepare_commit(tdb);
1147 164444 : if (ret)
1148 2 : return ret;
1149 : }
1150 :
1151 1133001 : methods = tdb->transaction->io_methods;
1152 :
1153 : /* perform all the writes */
1154 8411096931 : for (i=0;i<tdb->transaction->num_blocks;i++) {
1155 2314663 : tdb_off_t offset;
1156 2314663 : tdb_len_t length;
1157 :
1158 8409963948 : if (tdb->transaction->blocks[i] == NULL) {
1159 8397838843 : continue;
1160 : }
1161 :
1162 12125105 : offset = i * tdb->transaction->block_size;
1163 12125105 : length = tdb->transaction->block_size;
1164 12125105 : if (i == tdb->transaction->num_blocks-1) {
1165 1132989 : length = tdb->transaction->last_block_size;
1166 : }
1167 :
1168 12125105 : if (methods->tdb_write(tdb, offset, tdb->transaction->blocks[i], length) == -1) {
1169 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));
1170 :
1171 : /* we've overwritten part of the data and
1172 : possibly expanded the file, so we need to
1173 : run the crash recovery code */
1174 0 : tdb->methods = methods;
1175 0 : tdb_transaction_recover(tdb);
1176 :
1177 0 : _tdb_transaction_cancel(tdb);
1178 :
1179 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n"));
1180 0 : return -1;
1181 : }
1182 14105683 : SAFE_FREE(tdb->transaction->blocks[i]);
1183 : }
1184 :
1185 : /* Do this before we drop lock or blocks. */
1186 1132983 : if (tdb->transaction->expanded) {
1187 83877 : need_repack = repack_worthwhile(tdb);
1188 : }
1189 :
1190 1132983 : SAFE_FREE(tdb->transaction->blocks);
1191 1132983 : tdb->transaction->num_blocks = 0;
1192 :
1193 : /* ensure the new data is on disk */
1194 1132983 : if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
1195 0 : return -1;
1196 : }
1197 :
1198 : /*
1199 : TODO: maybe write to some dummy hdr field, or write to magic
1200 : offset without mmap, before the last sync, instead of the
1201 : utime() call
1202 : */
1203 :
1204 : /* on some systems (like Linux 2.6.x) changes via mmap/msync
1205 : don't change the mtime of the file, this means the file may
1206 : not be backed up (as tdb rounding to block sizes means that
1207 : file size changes are quite rare too). The following forces
1208 : mtime changes when a transaction completes */
1209 : #ifdef HAVE_UTIME
1210 1132983 : utime(tdb->name, NULL);
1211 : #endif
1212 :
1213 : /* use a transaction cancel to free memory and remove the
1214 : transaction locks */
1215 1132983 : _tdb_transaction_cancel(tdb);
1216 :
1217 1132968 : if (need_repack) {
1218 12 : int ret = tdb_repack(tdb);
1219 12 : if (ret != 0) {
1220 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL,
1221 : __location__ " Failed to repack database (not fatal)\n"));
1222 : }
1223 : /*
1224 : * Ignore the error.
1225 : *
1226 : * Why?
1227 : *
1228 : * We just committed to the DB above, so anything
1229 : * written during the transaction is committed, the
1230 : * caller needs to know that the long-term state was
1231 : * successfully modified.
1232 : *
1233 : * tdb_repack is an optimization that can fail for
1234 : * reasons like lock ordering and we cannot recover
1235 : * the transaction lock at this point, having released
1236 : * it above.
1237 : *
1238 : * If we return a failure the caller thinks the
1239 : * transaction was rolled back.
1240 : */
1241 : }
1242 :
1243 1116264 : return 0;
1244 : }
1245 :
1246 :
1247 : /*
1248 : recover from an aborted transaction. Must be called with exclusive
1249 : database write access already established (including the open
1250 : lock to prevent new processes attaching)
1251 : */
1252 1269488 : int tdb_transaction_recover(struct tdb_context *tdb)
1253 : {
1254 30598 : tdb_off_t recovery_head, recovery_eof;
1255 30598 : unsigned char *data, *p;
1256 1269488 : uint32_t zero = 0;
1257 30598 : struct tdb_record rec;
1258 :
1259 : /* find the recovery area */
1260 1269488 : if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
1261 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n"));
1262 0 : tdb->ecode = TDB_ERR_IO;
1263 0 : return -1;
1264 : }
1265 :
1266 1269488 : if (recovery_head == 0) {
1267 : /* we have never allocated a recovery record */
1268 736929 : return 0;
1269 : }
1270 :
1271 : /* read the recovery record */
1272 512291 : if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
1273 512291 : sizeof(rec), DOCONV()) == -1) {
1274 40 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n"));
1275 40 : tdb->ecode = TDB_ERR_IO;
1276 40 : return -1;
1277 : }
1278 :
1279 512251 : if (rec.magic != TDB_RECOVERY_MAGIC) {
1280 : /* there is no valid recovery data */
1281 501897 : return 0;
1282 : }
1283 :
1284 24 : if (tdb->read_only) {
1285 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n"));
1286 0 : tdb->ecode = TDB_ERR_CORRUPT;
1287 0 : return -1;
1288 : }
1289 :
1290 24 : recovery_eof = rec.key_len;
1291 :
1292 24 : data = (unsigned char *)malloc(rec.data_len);
1293 24 : if (data == NULL) {
1294 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n"));
1295 0 : tdb->ecode = TDB_ERR_OOM;
1296 0 : return -1;
1297 : }
1298 :
1299 : /* read the full recovery data */
1300 24 : if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
1301 : rec.data_len, 0) == -1) {
1302 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n"));
1303 0 : tdb->ecode = TDB_ERR_IO;
1304 0 : free(data);
1305 0 : return -1;
1306 : }
1307 :
1308 : /* recover the file data */
1309 24 : p = data;
1310 96 : while (p+8 < data + rec.data_len) {
1311 0 : uint32_t ofs, len;
1312 72 : if (DOCONV()) {
1313 0 : tdb_convert(p, 8);
1314 : }
1315 72 : memcpy(&ofs, p, 4);
1316 72 : memcpy(&len, p+4, 4);
1317 :
1318 72 : if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
1319 0 : free(data);
1320 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %u bytes at offset %u\n", len, ofs));
1321 0 : tdb->ecode = TDB_ERR_IO;
1322 0 : return -1;
1323 : }
1324 72 : p += 8 + len;
1325 : }
1326 :
1327 24 : free(data);
1328 :
1329 24 : if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
1330 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n"));
1331 0 : tdb->ecode = TDB_ERR_IO;
1332 0 : return -1;
1333 : }
1334 :
1335 : /* if the recovery area is after the recovered eof then remove it */
1336 24 : if (recovery_eof <= recovery_head) {
1337 24 : if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
1338 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n"));
1339 0 : tdb->ecode = TDB_ERR_IO;
1340 0 : return -1;
1341 : }
1342 : }
1343 :
1344 : /* remove the recovery magic */
1345 24 : if (tdb_ofs_write(tdb, recovery_head + offsetof(struct tdb_record, magic),
1346 : &zero) == -1) {
1347 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n"));
1348 0 : tdb->ecode = TDB_ERR_IO;
1349 0 : return -1;
1350 : }
1351 :
1352 24 : if (transaction_sync(tdb, 0, recovery_eof) == -1) {
1353 0 : TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n"));
1354 0 : tdb->ecode = TDB_ERR_IO;
1355 0 : return -1;
1356 : }
1357 :
1358 24 : TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %u byte database\n",
1359 : recovery_eof));
1360 :
1361 : /* all done */
1362 24 : return 0;
1363 : }
1364 :
1365 : /* Any I/O failures we say "needs recovery". */
1366 517063354 : bool tdb_needs_recovery(struct tdb_context *tdb)
1367 : {
1368 26306127 : tdb_off_t recovery_head;
1369 26306127 : struct tdb_record rec;
1370 :
1371 : /* find the recovery area */
1372 517063354 : if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
1373 0 : return true;
1374 : }
1375 :
1376 517063354 : if (recovery_head == 0) {
1377 : /* we have never allocated a recovery record */
1378 330061107 : return false;
1379 : }
1380 :
1381 : /* read the recovery record */
1382 165757196 : if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
1383 165757196 : sizeof(rec), DOCONV()) == -1) {
1384 40 : return true;
1385 : }
1386 :
1387 165757156 : return (rec.magic == TDB_RECOVERY_MAGIC);
1388 : }
|