Line data Source code
1 : /*
2 : Unix SMB/CIFS implementation.
3 :
4 : trivial database library
5 :
6 : Copyright (C) Andrew Tridgell 1999-2005
7 : Copyright (C) Paul `Rusty' Russell 2000
8 : Copyright (C) Jeremy Allison 2000-2003
9 :
10 : ** NOTE! The following LGPL license applies to the tdb
11 : ** library. This does NOT imply that all of Samba is released
12 : ** under the LGPL
13 :
14 : This library is free software; you can redistribute it and/or
15 : modify it under the terms of the GNU Lesser General Public
16 : License as published by the Free Software Foundation; either
17 : version 3 of the License, or (at your option) any later version.
18 :
19 : This library is distributed in the hope that it will be useful,
20 : but WITHOUT ANY WARRANTY; without even the implied warranty of
21 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 : Lesser General Public License for more details.
23 :
24 : You should have received a copy of the GNU Lesser General Public
25 : License along with this library; if not, see <http://www.gnu.org/licenses/>.
26 : */
27 :
28 : #include "tdb_private.h"
29 :
30 48176 : _PUBLIC_ void tdb_setalarm_sigptr(struct tdb_context *tdb, volatile sig_atomic_t *ptr)
31 : {
32 48176 : tdb->interrupt_sig_ptr = ptr;
33 48176 : }
34 :
35 415677364 : static int fcntl_lock(struct tdb_context *tdb,
36 : int rw, off_t off, off_t len, bool waitflag)
37 : {
38 10855065 : struct flock fl;
39 10855065 : int cmd;
40 :
41 : #ifdef USE_TDB_MUTEX_LOCKING
42 : {
43 10855065 : int ret;
44 415677364 : if (tdb_mutex_lock(tdb, rw, off, len, waitflag, &ret)) {
45 90627836 : return ret;
46 : }
47 : }
48 : #endif
49 :
50 325049528 : fl.l_type = rw;
51 325049528 : fl.l_whence = SEEK_SET;
52 325049528 : fl.l_start = off;
53 325049528 : fl.l_len = len;
54 325049528 : fl.l_pid = 0;
55 :
56 325049528 : cmd = waitflag ? F_SETLKW : F_SETLK;
57 :
58 325049528 : return fcntl(tdb->fd, cmd, &fl);
59 : }
60 :
61 304680602 : static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
62 : {
63 8291253 : struct flock fl;
64 : #if 0 /* Check they matched up locks and unlocks correctly. */
65 : char line[80];
66 : FILE *locks;
67 : bool found = false;
68 :
69 : locks = fopen("/proc/locks", "r");
70 :
71 : while (fgets(line, 80, locks)) {
72 : char *p;
73 : int type, start, l;
74 :
75 : /* eg. 1: FLOCK ADVISORY WRITE 2440 08:01:2180826 0 EOF */
76 : p = strchr(line, ':') + 1;
77 : if (strncmp(p, " POSIX ADVISORY ", strlen(" POSIX ADVISORY ")))
78 : continue;
79 : p += strlen(" FLOCK ADVISORY ");
80 : if (strncmp(p, "READ ", strlen("READ ")) == 0)
81 : type = F_RDLCK;
82 : else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
83 : type = F_WRLCK;
84 : else
85 : abort();
86 : p += 6;
87 : if (atoi(p) != getpid())
88 : continue;
89 : p = strchr(strchr(p, ' ') + 1, ' ') + 1;
90 : start = atoi(p);
91 : p = strchr(p, ' ') + 1;
92 : if (strncmp(p, "EOF", 3) == 0)
93 : l = 0;
94 : else
95 : l = atoi(p) - start + 1;
96 :
97 : if (off == start) {
98 : if (len != l) {
99 : fprintf(stderr, "Len %u should be %u: %s",
100 : (int)len, l, line);
101 : abort();
102 : }
103 : if (type != rw) {
104 : fprintf(stderr, "Type %s wrong: %s",
105 : rw == F_RDLCK ? "READ" : "WRITE", line);
106 : abort();
107 : }
108 : found = true;
109 : break;
110 : }
111 : }
112 :
113 : if (!found) {
114 : fprintf(stderr, "Unlock on %u@%u not found!\n",
115 : (int)off, (int)len);
116 : abort();
117 : }
118 :
119 : fclose(locks);
120 : #endif
121 :
122 : #ifdef USE_TDB_MUTEX_LOCKING
123 : {
124 8291253 : int ret;
125 304680602 : if (tdb_mutex_unlock(tdb, rw, off, len, &ret)) {
126 90625104 : return ret;
127 : }
128 : }
129 : #endif
130 :
131 214055498 : fl.l_type = F_UNLCK;
132 214055498 : fl.l_whence = SEEK_SET;
133 214055498 : fl.l_start = off;
134 214055498 : fl.l_len = len;
135 214055498 : fl.l_pid = 0;
136 :
137 214055498 : return fcntl(tdb->fd, F_SETLKW, &fl);
138 : }
139 :
140 : /*
141 : * Calculate the lock offset for a list
142 : *
143 : * list -1 is the freelist, otherwise a hash chain.
144 : *
145 : * Note that we consistently (but without real reason) lock hash chains at an
146 : * offset that is 4 bytes below the real offset of the corresponding list head
147 : * in the db.
148 : *
149 : * This is the memory layout of the hashchain array:
150 : *
151 : * FREELIST_TOP + 0 = freelist
152 : * FREELIST_TOP + 4 = hashtable list 0
153 : * FREELIST_TOP + 8 = hashtable list 1
154 : * ...
155 : *
156 : * Otoh lock_offset computes:
157 : *
158 : * freelist = FREELIST_TOP - 4
159 : * list 0 = FREELIST_TOP + 0
160 : * list 1 = FREELIST_TOP + 4
161 : * ...
162 : *
163 : * Unfortunately we can't change this calculation in order to align the locking
164 : * offset with the memory layout, as that would make the locking incompatible
165 : * between different tdb versions.
166 : */
167 2041569227 : static tdb_off_t lock_offset(int list)
168 : {
169 2041569227 : return FREELIST_TOP + 4*list;
170 : }
171 :
172 : /* a byte range locking function - return 0 on success
173 : this functions locks/unlocks "len" byte at the specified offset.
174 :
175 : On error, errno is also set so that errors are passed back properly
176 : through tdb_open().
177 :
178 : note that a len of zero means lock to end of file
179 : */
180 438708110 : int tdb_brlock(struct tdb_context *tdb,
181 : int rw_type, tdb_off_t offset, size_t len,
182 : enum tdb_lock_flags flags)
183 : {
184 12706699 : int ret;
185 :
186 438708110 : if (tdb->flags & TDB_NOLOCK) {
187 21179107 : return 0;
188 : }
189 :
190 415677369 : if (flags & TDB_LOCK_MARK_ONLY) {
191 5 : return 0;
192 : }
193 :
194 415677364 : if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
195 0 : tdb->ecode = TDB_ERR_RDONLY;
196 0 : return -1;
197 : }
198 :
199 10855065 : do {
200 415677364 : ret = fcntl_lock(tdb, rw_type, offset, len,
201 404822299 : flags & TDB_LOCK_WAIT);
202 : /* Check for a sigalarm break. */
203 415677364 : if (ret == -1 && errno == EINTR &&
204 0 : tdb->interrupt_sig_ptr &&
205 0 : *tdb->interrupt_sig_ptr) {
206 0 : break;
207 : }
208 415677364 : } while (ret == -1 && errno == EINTR);
209 :
210 415677364 : if (ret == -1) {
211 60369 : tdb->ecode = TDB_ERR_LOCK;
212 : /* Generic lock error. errno set by fcntl.
213 : * EAGAIN is an expected return from non-blocking
214 : * locks. */
215 60369 : if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
216 0 : TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %u rw_type=%d flags=%d len=%zu\n",
217 : tdb->fd, offset, rw_type, flags, len));
218 : }
219 60369 : return -1;
220 : }
221 404762922 : return 0;
222 : }
223 :
224 327711341 : int tdb_brunlock(struct tdb_context *tdb,
225 : int rw_type, tdb_off_t offset, size_t len)
226 : {
227 10142887 : int ret;
228 :
229 327711341 : if (tdb->flags & TDB_NOLOCK) {
230 21179105 : return 0;
231 : }
232 :
233 8291253 : do {
234 304680602 : ret = fcntl_unlock(tdb, rw_type, offset, len);
235 304680593 : } while (ret == -1 && errno == EINTR);
236 :
237 304680593 : if (ret == -1) {
238 147 : TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brunlock failed (fd=%d) at offset %u rw_type=%u len=%zu\n",
239 : tdb->fd, offset, rw_type, len));
240 : }
241 296389340 : return ret;
242 : }
243 :
244 : /*
245 : * Do a tdb_brlock in a loop. Some OSes (such as solaris) have too
246 : * conservative deadlock detection and claim a deadlock when progress can be
247 : * made. For those OSes we may loop for a while.
248 : */
249 :
250 1133028 : static int tdb_brlock_retry(struct tdb_context *tdb,
251 : int rw_type, tdb_off_t offset, size_t len,
252 : enum tdb_lock_flags flags)
253 : {
254 1133028 : int count = 1000;
255 :
256 1133028 : while (count--) {
257 16704 : struct timeval tv;
258 16704 : int ret;
259 :
260 1133028 : ret = tdb_brlock(tdb, rw_type, offset, len, flags);
261 1133028 : if (ret == 0) {
262 1133026 : return 0;
263 : }
264 2 : if (errno != EDEADLK) {
265 2 : break;
266 : }
267 : /* sleep for as short a time as we can - more portable than usleep() */
268 0 : tv.tv_sec = 0;
269 0 : tv.tv_usec = 1;
270 0 : select(0, NULL, NULL, NULL, &tv);
271 : }
272 2 : return -1;
273 : }
274 :
275 : /*
276 : upgrade a read lock to a write lock.
277 : */
278 1133028 : int tdb_allrecord_upgrade(struct tdb_context *tdb)
279 : {
280 16704 : int ret;
281 :
282 1133028 : if (tdb->allrecord_lock.count != 1) {
283 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
284 : "tdb_allrecord_upgrade failed: count %u too high\n",
285 : tdb->allrecord_lock.count));
286 0 : tdb->ecode = TDB_ERR_LOCK;
287 0 : return -1;
288 : }
289 :
290 1133028 : if (tdb->allrecord_lock.off != 1) {
291 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
292 : "tdb_allrecord_upgrade failed: already upgraded?\n"));
293 0 : tdb->ecode = TDB_ERR_LOCK;
294 0 : return -1;
295 : }
296 :
297 1133028 : if (tdb_have_mutexes(tdb)) {
298 2 : ret = tdb_mutex_allrecord_upgrade(tdb);
299 2 : if (ret == -1) {
300 0 : goto fail;
301 : }
302 2 : ret = tdb_brlock_retry(tdb, F_WRLCK, lock_offset(tdb->hash_size),
303 : 0, TDB_LOCK_WAIT|TDB_LOCK_PROBE);
304 2 : if (ret == -1) {
305 0 : tdb_mutex_allrecord_downgrade(tdb);
306 : }
307 : } else {
308 1133026 : ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0,
309 : TDB_LOCK_WAIT|TDB_LOCK_PROBE);
310 : }
311 :
312 1133028 : if (ret == 0) {
313 1133026 : tdb->allrecord_lock.ltype = F_WRLCK;
314 1133026 : tdb->allrecord_lock.off = 0;
315 1133026 : return 0;
316 : }
317 2 : fail:
318 2 : TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n"));
319 2 : return -1;
320 : }
321 :
322 583300502 : static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb,
323 : tdb_off_t offset)
324 : {
325 : int i;
326 :
327 895439306 : for (i=0; i<tdb->num_lockrecs; i++) {
328 590906989 : if (tdb->lockrecs[i].off == offset) {
329 287106925 : return &tdb->lockrecs[i];
330 : }
331 : }
332 296193577 : return NULL;
333 : }
334 :
335 : /* lock an offset in the database. */
336 509343263 : int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype,
337 : enum tdb_lock_flags flags)
338 : {
339 25268802 : struct tdb_lock_type *new_lck;
340 :
341 509343263 : if (offset >= lock_offset(tdb->hash_size)) {
342 0 : tdb->ecode = TDB_ERR_LOCK;
343 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid offset %u for ltype=%d\n",
344 : offset, ltype));
345 0 : return -1;
346 : }
347 509343263 : if (tdb->flags & TDB_NOLOCK)
348 248355351 : return 0;
349 :
350 242157120 : new_lck = find_nestlock(tdb, offset);
351 242157120 : if (new_lck) {
352 52079308 : if ((new_lck->ltype == F_RDLCK) && (ltype == F_WRLCK)) {
353 1 : if (!tdb_have_mutexes(tdb)) {
354 0 : int ret;
355 : /*
356 : * Upgrade the underlying fcntl
357 : * lock. Mutexes don't do readlocks,
358 : * so this only applies to fcntl
359 : * locking.
360 : */
361 1 : ret = tdb_brlock(tdb, ltype, offset, 1, flags);
362 1 : if (ret != 0) {
363 1 : return ret;
364 : }
365 : }
366 0 : new_lck->ltype = F_WRLCK;
367 : }
368 : /*
369 : * Just increment the in-memory struct, posix locks
370 : * don't stack.
371 : */
372 52079307 : new_lck->count++;
373 52079307 : return 0;
374 : }
375 :
376 190077812 : if (tdb->num_lockrecs == tdb->lockrecs_array_length) {
377 2694333 : new_lck = (struct tdb_lock_type *)realloc(
378 2694333 : tdb->lockrecs,
379 2694333 : sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
380 2694333 : if (new_lck == NULL) {
381 0 : errno = ENOMEM;
382 0 : return -1;
383 : }
384 2694333 : tdb->lockrecs_array_length = tdb->num_lockrecs+1;
385 2694333 : tdb->lockrecs = new_lck;
386 : }
387 :
388 : /* Since fcntl locks don't nest, we do a lock for the first one,
389 : and simply bump the count for future ones */
390 190077812 : if (tdb_brlock(tdb, ltype, offset, 1, flags)) {
391 52601 : return -1;
392 : }
393 :
394 190024227 : new_lck = &tdb->lockrecs[tdb->num_lockrecs];
395 :
396 190024227 : new_lck->off = offset;
397 190024227 : new_lck->count = 1;
398 190024227 : new_lck->ltype = ltype;
399 190024227 : tdb->num_lockrecs++;
400 :
401 190024227 : return 0;
402 : }
403 :
404 64 : static int tdb_lock_and_recover(struct tdb_context *tdb)
405 : {
406 0 : int ret;
407 :
408 : /* We need to match locking order in transaction commit. */
409 64 : if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT)) {
410 0 : return -1;
411 : }
412 :
413 64 : if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT)) {
414 0 : tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
415 0 : return -1;
416 : }
417 :
418 64 : ret = tdb_transaction_recover(tdb);
419 :
420 64 : tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1);
421 64 : tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
422 :
423 64 : return ret;
424 : }
425 :
426 471867100 : static bool have_data_locks(const struct tdb_context *tdb)
427 : {
428 : int i;
429 :
430 602993955 : for (i = 0; i < tdb->num_lockrecs; i++) {
431 195213142 : if (tdb->lockrecs[i].off >= lock_offset(-1))
432 87858525 : return true;
433 : }
434 384008575 : return false;
435 : }
436 :
437 : /*
438 : * A allrecord lock allows us to avoid per chain locks. Check if the allrecord
439 : * lock is strong enough.
440 : */
441 2509688642 : static int tdb_lock_covered_by_allrecord_lock(struct tdb_context *tdb,
442 : int ltype)
443 : {
444 2509688642 : if (ltype == F_RDLCK) {
445 : /*
446 : * The allrecord_lock is equal (F_RDLCK) or stronger
447 : * (F_WRLCK). Pass.
448 : */
449 1978169320 : return 0;
450 : }
451 :
452 511481712 : if (tdb->allrecord_lock.ltype == F_RDLCK) {
453 : /*
454 : * We ask for ltype==F_WRLCK, but the allrecord_lock
455 : * is too weak. We can't upgrade here, so fail.
456 : */
457 0 : tdb->ecode = TDB_ERR_LOCK;
458 0 : return -1;
459 : }
460 :
461 : /*
462 : * Asking for F_WRLCK, allrecord is F_WRLCK as well. Pass.
463 : */
464 497842266 : return 0;
465 : }
466 :
467 1751722009 : static int tdb_lock_list(struct tdb_context *tdb, int list, int ltype,
468 : enum tdb_lock_flags waitflag)
469 : {
470 41849116 : int ret;
471 1751722009 : bool check = false;
472 :
473 1751722009 : if (tdb->allrecord_lock.count) {
474 1261664044 : return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
475 : }
476 :
477 : /*
478 : * Check for recoveries: Someone might have kill -9'ed a process
479 : * during a commit.
480 : */
481 496877688 : check = !have_data_locks(tdb);
482 496877688 : ret = tdb_nest_lock(tdb, lock_offset(list), ltype, waitflag);
483 :
484 496877688 : if (ret == 0 && check && tdb_needs_recovery(tdb)) {
485 16 : tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
486 :
487 16 : if (tdb_lock_and_recover(tdb) == -1) {
488 0 : return -1;
489 : }
490 16 : return tdb_lock_list(tdb, list, ltype, waitflag);
491 : }
492 471867084 : return ret;
493 : }
494 :
495 : /* lock a list in the database. list -1 is the alloc list */
496 1750238096 : int tdb_lock(struct tdb_context *tdb, int list, int ltype)
497 : {
498 41839486 : int ret;
499 :
500 1750238096 : ret = tdb_lock_list(tdb, list, ltype, TDB_LOCK_WAIT);
501 1750238096 : if (ret) {
502 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d "
503 : "ltype=%d (%s)\n", list, ltype, strerror(errno)));
504 : }
505 1750238096 : return ret;
506 : }
507 :
508 : /* lock a list in the database. list -1 is the alloc list. non-blocking lock */
509 : _PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
510 1483897 : _PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype)
511 : {
512 1483897 : return tdb_lock_list(tdb, list, ltype, TDB_LOCK_NOWAIT);
513 : }
514 :
515 :
516 510293207 : int tdb_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype,
517 : bool mark_lock)
518 : {
519 510293207 : int ret = -1;
520 25384426 : struct tdb_lock_type *lck;
521 :
522 510293207 : if (tdb->flags & TDB_NOLOCK)
523 254203722 : return 0;
524 :
525 : /* Sanity checks */
526 237089493 : if (offset >= lock_offset(tdb->hash_size)) {
527 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: offset %u invalid (%d)\n", offset, tdb->hash_size));
528 0 : return ret;
529 : }
530 :
531 237089493 : lck = find_nestlock(tdb, offset);
532 237089493 : if ((lck == NULL) || (lck->count == 0)) {
533 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n"));
534 0 : return -1;
535 : }
536 :
537 237089493 : if (lck->count > 1) {
538 52079307 : lck->count--;
539 52079307 : return 0;
540 : }
541 :
542 : /*
543 : * This lock has count==1 left, so we need to unlock it in the
544 : * kernel. We don't bother with decrementing the in-memory array
545 : * element, we're about to overwrite it with the last array element
546 : * anyway.
547 : */
548 :
549 185010186 : if (mark_lock) {
550 2 : ret = 0;
551 : } else {
552 185010184 : ret = tdb_brunlock(tdb, ltype, offset, 1);
553 : }
554 :
555 : /*
556 : * Shrink the array by overwriting the element just unlocked with the
557 : * last array element.
558 : */
559 185010186 : *lck = tdb->lockrecs[--tdb->num_lockrecs];
560 :
561 : /*
562 : * We don't bother with realloc when the array shrinks, but if we have
563 : * a completely idle tdb we should get rid of the locked array.
564 : */
565 :
566 185010186 : if (ret)
567 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: An error occurred unlocking!\n"));
568 179307162 : return ret;
569 : }
570 :
571 : _PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
572 1751719255 : _PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
573 : {
574 : /* a global lock allows us to avoid per chain locks */
575 1751719255 : if (tdb->allrecord_lock.count) {
576 1261664044 : return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
577 : }
578 :
579 496874934 : return tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
580 : }
581 :
582 : /*
583 : get the transaction lock
584 : */
585 9086150 : int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
586 : enum tdb_lock_flags lockflags)
587 : {
588 9086150 : return tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags);
589 : }
590 :
591 : /*
592 : release the transaction lock
593 : */
594 5719390 : int tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
595 : {
596 5719390 : return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false);
597 : }
598 :
599 : /* Returns 0 if all done, -1 if error, 1 if ok. */
600 109282623 : static int tdb_allrecord_check(struct tdb_context *tdb, int ltype,
601 : enum tdb_lock_flags flags, bool upgradable)
602 : {
603 : /* There are no locks on read-only dbs */
604 109282623 : if (tdb->read_only || tdb->traverse_read) {
605 0 : tdb->ecode = TDB_ERR_LOCK;
606 0 : return -1;
607 : }
608 :
609 109282623 : if (tdb->allrecord_lock.count &&
610 116 : tdb->allrecord_lock.ltype == (uint32_t)ltype) {
611 116 : tdb->allrecord_lock.count++;
612 116 : return 0;
613 : }
614 :
615 109282507 : if (tdb->allrecord_lock.count) {
616 : /* a global lock of a different type exists */
617 0 : tdb->ecode = TDB_ERR_LOCK;
618 0 : return -1;
619 : }
620 :
621 109282507 : if (tdb_have_extra_locks(tdb)) {
622 : /* can't combine global and chain locks */
623 58 : tdb->ecode = TDB_ERR_LOCK;
624 58 : return -1;
625 : }
626 :
627 109282449 : if (upgradable && ltype != F_RDLCK) {
628 : /* tdb error: you can't upgrade a write lock! */
629 0 : tdb->ecode = TDB_ERR_LOCK;
630 0 : return -1;
631 : }
632 106748560 : return 1;
633 : }
634 :
635 : /* We only need to lock individual bytes, but Linux merges consecutive locks
636 : * so we lock in contiguous ranges. */
637 109294884 : static int tdb_chainlock_gradual(struct tdb_context *tdb,
638 : int ltype, enum tdb_lock_flags flags,
639 : size_t off, size_t len)
640 : {
641 2533905 : int ret;
642 109294884 : enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);
643 :
644 109294884 : if (len <= 4) {
645 : /* Single record. Just do blocking lock. */
646 7227 : return tdb_brlock(tdb, ltype, off, len, flags);
647 : }
648 :
649 : /* First we try non-blocking. */
650 109287657 : ret = tdb_brlock(tdb, ltype, off, len, nb_flags);
651 109287657 : if (ret == 0) {
652 106747539 : return 0;
653 : }
654 :
655 : /* Try locking first half, then second. */
656 6225 : ret = tdb_chainlock_gradual(tdb, ltype, flags, off, len / 2);
657 6225 : if (ret == -1)
658 2 : return -1;
659 :
660 6223 : ret = tdb_chainlock_gradual(tdb, ltype, flags,
661 6215 : off + len / 2, len - len / 2);
662 6223 : if (ret == -1) {
663 0 : tdb_brunlock(tdb, ltype, off, len / 2);
664 0 : return -1;
665 : }
666 6215 : return 0;
667 : }
668 :
669 : /* lock/unlock entire database. It can only be upgradable if you have some
670 : * other way of guaranteeing exclusivity (ie. transaction write lock).
671 : * We do the locking gradually to avoid being starved by smaller locks. */
672 109282623 : int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
673 : enum tdb_lock_flags flags, bool upgradable)
674 : {
675 2533896 : int ret;
676 :
677 109282623 : switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) {
678 58 : case -1:
679 58 : return -1;
680 116 : case 0:
681 116 : return 0;
682 : }
683 :
684 : /* We cover two kinds of locks:
685 : * 1) Normal chain locks. Taken for almost all operations.
686 : * 2) Individual records locks. Taken after normal or free
687 : * chain locks.
688 : *
689 : * It is (1) which cause the starvation problem, so we're only
690 : * gradual for that. */
691 :
692 109282449 : if (tdb_have_mutexes(tdb)) {
693 13 : ret = tdb_mutex_allrecord_lock(tdb, ltype, flags);
694 : } else {
695 109282436 : ret = tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP,
696 109282436 : tdb->hash_size * 4);
697 : }
698 :
699 109282449 : if (ret == -1) {
700 3 : return -1;
701 : }
702 :
703 : /* Grab individual record locks. */
704 109282446 : if (tdb_brlock(tdb, ltype, lock_offset(tdb->hash_size), 0,
705 : flags) == -1) {
706 0 : if (tdb_have_mutexes(tdb)) {
707 0 : tdb_mutex_allrecord_unlock(tdb);
708 : } else {
709 0 : tdb_brunlock(tdb, ltype, FREELIST_TOP,
710 0 : tdb->hash_size * 4);
711 : }
712 0 : return -1;
713 : }
714 :
715 109282446 : tdb->allrecord_lock.count = 1;
716 : /* If it's upgradable, it's actually exclusive so we can treat
717 : * it as a write lock. */
718 109282446 : tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
719 109282446 : tdb->allrecord_lock.off = upgradable;
720 :
721 109282446 : if (tdb_needs_recovery(tdb)) {
722 48 : bool mark = flags & TDB_LOCK_MARK_ONLY;
723 48 : tdb_allrecord_unlock(tdb, ltype, mark);
724 48 : if (mark) {
725 0 : tdb->ecode = TDB_ERR_LOCK;
726 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
727 : "tdb_lockall_mark cannot do recovery\n"));
728 0 : return -1;
729 : }
730 48 : if (tdb_lock_and_recover(tdb) == -1) {
731 40 : return -1;
732 : }
733 8 : return tdb_allrecord_lock(tdb, ltype, flags, upgradable);
734 : }
735 :
736 106748509 : return 0;
737 : }
738 :
739 :
740 :
741 : /* unlock entire db */
742 109282110 : int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock)
743 : {
744 : /* There are no locks on read-only dbs */
745 109282110 : if (tdb->read_only || tdb->traverse_read) {
746 0 : tdb->ecode = TDB_ERR_LOCK;
747 0 : return -1;
748 : }
749 :
750 109282110 : if (tdb->allrecord_lock.count == 0) {
751 0 : tdb->ecode = TDB_ERR_LOCK;
752 0 : return -1;
753 : }
754 :
755 : /* Upgradable locks are marked as write locks. */
756 109282110 : if (tdb->allrecord_lock.ltype != (uint32_t)ltype
757 8 : && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
758 0 : tdb->ecode = TDB_ERR_LOCK;
759 0 : return -1;
760 : }
761 :
762 109282110 : if (tdb->allrecord_lock.count > 1) {
763 116 : tdb->allrecord_lock.count--;
764 116 : return 0;
765 : }
766 :
767 109281994 : if (!mark_lock) {
768 2533870 : int ret;
769 :
770 109281992 : if (tdb_have_mutexes(tdb)) {
771 7 : ret = tdb_mutex_allrecord_unlock(tdb);
772 7 : if (ret == 0) {
773 7 : ret = tdb_brunlock(tdb, ltype,
774 7 : lock_offset(tdb->hash_size),
775 : 0);
776 : }
777 : } else {
778 109281985 : ret = tdb_brunlock(tdb, ltype, FREELIST_TOP, 0);
779 : }
780 :
781 109281989 : if (ret != 0) {
782 45 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed "
783 : "(%s)\n", strerror(errno)));
784 45 : return -1;
785 : }
786 : }
787 :
788 109281946 : tdb->allrecord_lock.count = 0;
789 109281946 : tdb->allrecord_lock.ltype = 0;
790 :
791 109281946 : return 0;
792 : }
793 :
794 : /* lock entire database with write lock */
795 744 : _PUBLIC_ int tdb_lockall(struct tdb_context *tdb)
796 : {
797 18 : tdb_trace(tdb, "tdb_lockall");
798 744 : return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
799 : }
800 :
801 : /* lock entire database with write lock - mark only */
802 2 : _PUBLIC_ int tdb_lockall_mark(struct tdb_context *tdb)
803 : {
804 0 : tdb_trace(tdb, "tdb_lockall_mark");
805 2 : return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY, false);
806 : }
807 :
808 : /* unlock entire database with write lock - unmark only */
809 2 : _PUBLIC_ int tdb_lockall_unmark(struct tdb_context *tdb)
810 : {
811 0 : tdb_trace(tdb, "tdb_lockall_unmark");
812 2 : return tdb_allrecord_unlock(tdb, F_WRLCK, true);
813 : }
814 :
815 : /* lock entire database with write lock - nonblocking variant */
816 2 : _PUBLIC_ int tdb_lockall_nonblock(struct tdb_context *tdb)
817 : {
818 2 : int ret = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false);
819 0 : tdb_trace_ret(tdb, "tdb_lockall_nonblock", ret);
820 2 : return ret;
821 : }
822 :
823 : /* unlock entire database with write lock */
824 668 : _PUBLIC_ int tdb_unlockall(struct tdb_context *tdb)
825 : {
826 16 : tdb_trace(tdb, "tdb_unlockall");
827 668 : return tdb_allrecord_unlock(tdb, F_WRLCK, false);
828 : }
829 :
830 : /* lock entire database with read lock */
831 105915108 : _PUBLIC_ int tdb_lockall_read(struct tdb_context *tdb)
832 : {
833 2510187 : tdb_trace(tdb, "tdb_lockall_read");
834 105915108 : return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
835 : }
836 :
837 : /* lock entire database with read lock - nonblock variant */
838 0 : _PUBLIC_ int tdb_lockall_read_nonblock(struct tdb_context *tdb)
839 : {
840 0 : int ret = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false);
841 0 : tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", ret);
842 0 : return ret;
843 : }
844 :
845 : /* unlock entire database with read lock */
846 105914638 : _PUBLIC_ int tdb_unlockall_read(struct tdb_context *tdb)
847 : {
848 2510173 : tdb_trace(tdb, "tdb_unlockall_read");
849 105914638 : return tdb_allrecord_unlock(tdb, F_RDLCK, false);
850 : }
851 :
852 : /* lock/unlock one hash chain. This is meant to be used to reduce
853 : contention - it cannot guarantee how many records will be locked */
854 10107515 : _PUBLIC_ int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
855 : {
856 10107515 : int ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
857 156614 : tdb_trace_1rec(tdb, "tdb_chainlock", key);
858 10107515 : return ret;
859 : }
860 :
861 : /* lock/unlock one hash chain, non-blocking. This is meant to be used
862 : to reduce contention - it cannot guarantee how many records will be
863 : locked */
864 9 : _PUBLIC_ int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
865 : {
866 9 : int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
867 0 : tdb_trace_1rec_ret(tdb, "tdb_chainlock_nonblock", key, ret);
868 9 : return ret;
869 : }
870 :
871 : /* mark a chain as locked without actually locking it. Warning! use with great caution! */
872 2 : _PUBLIC_ int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
873 : {
874 2 : int ret = tdb_nest_lock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
875 : F_WRLCK, TDB_LOCK_MARK_ONLY);
876 0 : tdb_trace_1rec(tdb, "tdb_chainlock_mark", key);
877 2 : return ret;
878 : }
879 :
880 : /* unmark a chain as locked without actually locking it. Warning! use with great caution! */
881 2 : _PUBLIC_ int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
882 : {
883 0 : tdb_trace_1rec(tdb, "tdb_chainlock_unmark", key);
884 2 : return tdb_nest_unlock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
885 : F_WRLCK, true);
886 : }
887 :
888 10107513 : _PUBLIC_ int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
889 : {
890 156613 : tdb_trace_1rec(tdb, "tdb_chainunlock", key);
891 10107513 : return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
892 : }
893 :
894 238 : _PUBLIC_ int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
895 : {
896 0 : int ret;
897 238 : ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
898 0 : tdb_trace_1rec(tdb, "tdb_chainlock_read", key);
899 238 : return ret;
900 : }
901 :
902 236 : _PUBLIC_ int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
903 : {
904 0 : tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
905 236 : return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
906 : }
907 :
908 0 : _PUBLIC_ int tdb_chainlock_read_nonblock(struct tdb_context *tdb, TDB_DATA key)
909 : {
910 0 : int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
911 0 : tdb_trace_1rec_ret(tdb, "tdb_chainlock_read_nonblock", key, ret);
912 0 : return ret;
913 : }
914 :
915 : /* record lock stops delete underneath */
916 569219423 : int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
917 : {
918 569219423 : if (tdb->allrecord_lock.count) {
919 538183733 : return 0;
920 : }
921 27427073 : return off ? tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0;
922 : }
923 :
924 : /*
925 : Write locks override our own fcntl readlocks, so check it here.
926 : Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
927 : an error to fail to get the lock here.
928 : */
929 2989040 : int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
930 : {
931 215275 : struct tdb_traverse_lock *i;
932 2989040 : if (tdb == NULL) {
933 0 : return -1;
934 : }
935 6053667 : for (i = &tdb->travlocks; i; i = i->next)
936 3151841 : if (i->off == off)
937 48855 : return -1;
938 2901826 : if (tdb->allrecord_lock.count) {
939 1793088 : if (tdb->allrecord_lock.ltype == F_WRLCK) {
940 1619642 : return 0;
941 : }
942 0 : return -1;
943 : }
944 1108738 : return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
945 : }
946 :
947 2901271 : int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
948 : {
949 2901271 : if (tdb->allrecord_lock.count) {
950 1619642 : return 0;
951 : }
952 1108183 : return tdb_brunlock(tdb, F_WRLCK, off, 1);
953 : }
954 :
955 : /* fcntl locks don't stack: avoid unlocking someone else's */
956 569219453 : int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
957 : {
958 5462099 : struct tdb_traverse_lock *i;
959 569219453 : uint32_t count = 0;
960 :
961 569219453 : if (tdb->allrecord_lock.count) {
962 538183733 : return 0;
963 : }
964 :
965 27427107 : if (off == 0)
966 12 : return 0;
967 82281023 : for (i = &tdb->travlocks; i; i = i->next)
968 54853950 : if (i->off == off)
969 27427075 : count++;
970 27427073 : return (count == 1 ? tdb_brunlock(tdb, F_RDLCK, off, 1) : 0);
971 : }
972 :
973 115010994 : bool tdb_have_extra_locks(struct tdb_context *tdb)
974 : {
975 115010994 : unsigned int extra = tdb->num_lockrecs;
976 :
977 : /* A transaction holds the lock for all records. */
978 115010994 : if (!tdb->transaction && tdb->allrecord_lock.count) {
979 0 : return true;
980 : }
981 :
982 : /* We always hold the active lock if CLEAR_IF_FIRST. */
983 115010994 : if (find_nestlock(tdb, ACTIVE_LOCK)) {
984 556489 : extra--;
985 : }
986 :
987 : /* In a transaction, we expect to hold the transaction lock */
988 115051390 : if (tdb->transaction && find_nestlock(tdb, TRANSACTION_LOCK)) {
989 4499789 : extra--;
990 : }
991 :
992 115010994 : return extra;
993 : }
994 :
995 : /* The transaction code uses this to remove all locks. */
996 3366757 : void tdb_release_transaction_locks(struct tdb_context *tdb)
997 : {
998 23688 : int i;
999 3366757 : unsigned int active = 0;
1000 :
1001 3366757 : if (tdb->allrecord_lock.count != 0) {
1002 3366751 : tdb_allrecord_unlock(tdb, tdb->allrecord_lock.ltype, false);
1003 3366748 : tdb->allrecord_lock.count = 0;
1004 : }
1005 :
1006 7871533 : for (i=0;i<tdb->num_lockrecs;i++) {
1007 4504785 : struct tdb_lock_type *lck = &tdb->lockrecs[i];
1008 :
1009 : /* Don't release the active lock! Copy it to first entry. */
1010 4504785 : if (lck->off == ACTIVE_LOCK) {
1011 5002 : tdb->lockrecs[active++] = *lck;
1012 : } else {
1013 4499783 : tdb_brunlock(tdb, lck->ltype, lck->off, 1);
1014 : }
1015 : }
1016 3366748 : tdb->num_lockrecs = active;
1017 3366748 : }
1018 :
1019 : /* Following functions are added specifically to support CTDB. */
1020 :
1021 : /* Don't do actual fcntl locking, just mark tdb locked */
1022 : _PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb);
1023 0 : _PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb)
1024 : {
1025 0 : return tdb_transaction_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY);
1026 : }
1027 :
1028 : /* Don't do actual fcntl unlocking, just mark tdb unlocked */
1029 : _PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb);
1030 0 : _PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb)
1031 : {
1032 0 : return tdb_nest_unlock(tdb, TRANSACTION_LOCK, F_WRLCK, true);
1033 : }
|