Line data Source code
1 : /*
2 : Unix SMB/CIFS implementation.
3 : Samba utility functions
4 : Copyright (C) Andrew Tridgell 1992-2001
5 : Copyright (C) Simo Sorce 2001
6 : Copyright (C) Andrew Bartlett 2011
7 : Copyright (C) Jeremy Allison 1992-2007
8 : Copyright (C) Martin Pool 2003
9 : Copyright (C) James Peach 2006
10 :
11 : This program is free software; you can redistribute it and/or modify
12 : it under the terms of the GNU General Public License as published by
13 : the Free Software Foundation; either version 3 of the License, or
14 : (at your option) any later version.
15 :
16 : This program is distributed in the hope that it will be useful,
17 : but WITHOUT ANY WARRANTY; without even the implied warranty of
18 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 : GNU General Public License for more details.
20 :
21 : You should have received a copy of the GNU General Public License
22 : along with this program. If not, see <http://www.gnu.org/licenses/>.
23 : */
24 :
25 : #include "replace.h"
26 : #include "system/locale.h"
27 : #include "charset.h"
28 : #include "lib/util/fault.h"
29 : #include "lib/util/tsort.h"
30 :
31 : #ifdef strcasecmp
32 : #undef strcasecmp
33 : #endif
34 : #ifdef strncasecmp
35 : #undef strncasecmp
36 : #endif
37 :
38 :
39 : /**
40 : Case insensitive string comparison, handle specified for testing
41 : **/
42 348998918 : _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
43 : const char *s1, const char *s2)
44 : {
45 348998918 : codepoint_t c1=0, c2=0;
46 348998918 : codepoint_t u1=0, u2=0;
47 348998918 : codepoint_t l1=0, l2=0;
48 2223030 : size_t size1, size2;
49 :
50 : /* handle null ptr comparisons to simplify the use in qsort */
51 348998918 : if (s1 == s2) return 0;
52 348998171 : if (s1 == NULL) return -1;
53 348998169 : if (s2 == NULL) return 1;
54 :
55 1166294483 : while (*s1 && *s2) {
56 1148644543 : c1 = next_codepoint_handle(iconv_handle, s1, &size1);
57 1148644543 : c2 = next_codepoint_handle(iconv_handle, s2, &size2);
58 :
59 1148644543 : if (c1 == INVALID_CODEPOINT ||
60 3962851 : c2 == INVALID_CODEPOINT) {
61 9 : return strcasecmp(s1, s2);
62 : }
63 :
64 1148644534 : s1 += size1;
65 1148644534 : s2 += size2;
66 :
67 1148644534 : if (c1 == c2) {
68 815671284 : continue;
69 : }
70 :
71 332973250 : u1 = toupper_m(c1);
72 332973250 : u2 = toupper_m(c2);
73 332973250 : if (u1 == u2) {
74 1625032 : continue;
75 : }
76 :
77 331348218 : l1 = tolower_m(c1);
78 331348218 : l2 = tolower_m(c2);
79 331348218 : if (l1 == l2) {
80 0 : continue;
81 : }
82 :
83 331348218 : return NUMERIC_CMP(l1, l2);
84 : }
85 :
86 17649940 : return NUMERIC_CMP(*s1, *s2);
87 : }
88 :
89 : /**
90 : Case insensitive string comparison
91 : **/
92 348998900 : _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
93 : {
94 348998900 : struct smb_iconv_handle *iconv_handle = get_iconv_handle();
95 348998900 : return strcasecmp_m_handle(iconv_handle, s1, s2);
96 : }
97 :
98 : /**
99 : Case insensitive string comparison, length limited, handle specified for
100 : testing
101 : **/
102 7562928 : _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
103 : const char *s1, const char *s2, size_t n)
104 : {
105 7562928 : codepoint_t c1=0, c2=0;
106 7562928 : codepoint_t u1=0, u2=0;
107 7562928 : codepoint_t l1=0, l2=0;
108 8512 : size_t size1, size2;
109 :
110 : /* handle null ptr comparisons to simplify the use in qsort */
111 7562928 : if (s1 == s2) return 0;
112 7562632 : if (s1 == NULL) return -1;
113 7562631 : if (s2 == NULL) return 1;
114 :
115 19024766 : while (*s1 && *s2 && n) {
116 18181337 : n--;
117 :
118 18181337 : c1 = next_codepoint_handle(iconv_handle, s1, &size1);
119 18181337 : c2 = next_codepoint_handle(iconv_handle, s2, &size2);
120 :
121 18181337 : if (c1 == INVALID_CODEPOINT ||
122 25188 : c2 == INVALID_CODEPOINT) {
123 : /*
124 : * n was specified in characters,
125 : * now we must convert it to bytes.
126 : * As bytes are the smallest
127 : * character unit, the following
128 : * increment and strncasecmp is always
129 : * safe.
130 : *
131 : * The source string was already known
132 : * to be n characters long, so we are
133 : * guaranteed to be able to look at the
134 : * (n remaining + size1) bytes from the
135 : * s1 position).
136 : */
137 1 : n += size1;
138 1 : return strncasecmp(s1, s2, n);
139 : }
140 :
141 18181336 : s1 += size1;
142 18181336 : s2 += size2;
143 :
144 18181336 : if (c1 == c2) {
145 11441557 : continue;
146 : }
147 :
148 6739779 : u1 = toupper_m(c1);
149 6739779 : u2 = toupper_m(c2);
150 6739779 : if (u1 == u2) {
151 20579 : continue;
152 : }
153 :
154 6719200 : l1 = tolower_m(c1);
155 6719200 : l2 = tolower_m(c2);
156 6719200 : if (l1 == l2) {
157 0 : continue;
158 : }
159 :
160 6719200 : return NUMERIC_CMP(l1, l2);
161 : }
162 :
163 843429 : if (n == 0) {
164 834409 : return 0;
165 : }
166 :
167 6772 : return NUMERIC_CMP(*s1, *s2);
168 : }
169 :
170 : /**
171 : Case insensitive string comparison, length limited
172 : **/
173 7562916 : _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
174 : {
175 7562916 : struct smb_iconv_handle *iconv_handle = get_iconv_handle();
176 7562916 : return strncasecmp_m_handle(iconv_handle, s1, s2, n);
177 : }
178 :
179 : /**
180 : * Compare 2 strings.
181 : *
182 : * @note The comparison is case-insensitive.
183 : **/
184 100140 : _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
185 : {
186 100140 : return strcasecmp_m(s1,s2) == 0;
187 : }
188 :
/**
 * Test two strings for equality (case sensitive).
 *
 * Two NULL pointers are equal; a NULL pointer never equals a
 * non-NULL string.
 **/
_PUBLIC_ bool strcsequal(const char *s1,const char *s2)
{
	if (s1 == NULL || s2 == NULL) {
		/* only equal when both are NULL */
		return s1 == s2;
	}

	/* identical pointers need no byte comparison */
	return (s1 == s2) || (strcmp(s1, s2) == 0);
}
201 :
202 : /**
203 : * Calculate the number of units (8 or 16-bit, depending on the
204 : * destination charset) that would be needed to convert the input
205 : * string, which is expected to be in src_charset encoding, to the
206 : * destination charset (which should be a unicode charset).
207 : */
208 41242405 : _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
209 : const char *s, charset_t src_charset, charset_t dst_charset)
210 : {
211 41242405 : size_t count = 0;
212 :
213 : #ifdef DEVELOPER
214 41242405 : switch (dst_charset) {
215 0 : case CH_DOS:
216 : case CH_UNIX:
217 0 : smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
218 40384469 : default:
219 41242405 : break;
220 : }
221 :
222 41242405 : switch (src_charset) {
223 0 : case CH_UTF16LE:
224 : case CH_UTF16BE:
225 0 : smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
226 40384469 : default:
227 41242405 : break;
228 : }
229 : #endif
230 41242405 : if (!s) {
231 66154 : return 0;
232 : }
233 :
234 1179144529 : while (*s && !(((uint8_t)*s) & 0x80)) {
235 1137972046 : s++;
236 1137972046 : count++;
237 : }
238 :
239 41172483 : if (!*s) {
240 40306454 : return count;
241 : }
242 :
243 575596 : while (*s) {
244 3536 : size_t c_size;
245 563707 : codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
246 : src_charset, &c_size);
247 563707 : s += c_size;
248 :
249 563707 : switch (dst_charset) {
250 555682 : case CH_UTF16LE:
251 : case CH_UTF16BE:
252 : case CH_UTF16MUNGED:
253 555682 : if (c < 0x10000) {
254 : /* Unicode char fits into 16 bits. */
255 492815 : count += 1;
256 : } else {
257 : /* Double-width unicode char - 32 bits. */
258 62867 : count += 2;
259 : }
260 553391 : break;
261 8025 : case CH_UTF8:
262 : /*
263 : * this only checks ranges, and does not
264 : * check for invalid codepoints
265 : */
266 8025 : if (c < 0x80) {
267 6116 : count += 1;
268 1909 : } else if (c < 0x800) {
269 871 : count += 2;
270 1038 : } else if (c < 0x10000) {
271 1038 : count += 3;
272 : } else {
273 0 : count += 4;
274 : }
275 6780 : break;
276 0 : default:
277 : /*
278 : * non-unicode encoding:
279 : * assume that each codepoint fits into
280 : * one unit in the destination encoding.
281 : */
282 0 : count += 1;
283 : }
284 : }
285 :
286 11861 : return count;
287 : }
288 :
289 : /**
290 : * Calculate the number of units (8 or 16-bit, depending on the
291 : * destination charset) that would be needed to convert the input
292 : * string, which is expected to be in src_charset encoding, to the
293 : * destination charset (which should be a unicode charset).
294 : */
295 41242393 : _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
296 : {
297 41242393 : struct smb_iconv_handle *ic = get_iconv_handle();
298 41242393 : return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
299 : }
300 :
301 25530081 : _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
302 : const charset_t dst_charset)
303 : {
304 25530081 : if (!s) {
305 94436 : return 0;
306 : }
307 25435289 : return strlen_m_ext(s, src_charset, dst_charset) + 1;
308 : }
309 :
310 932098 : _PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
311 : const charset_t src_charset,
312 : const charset_t dst_charset)
313 : {
314 1952 : size_t len;
315 932098 : if (!s) {
316 972 : return 0;
317 : }
318 931125 : len = strlen_m_ext(s, src_charset, dst_charset);
319 931125 : if (len == 0) {
320 608631 : return 0;
321 : }
322 :
323 322265 : return len+1;
324 : }
325 :
326 : /**
327 : * Calculate the number of 16-bit units that would be needed to convert
328 : * the input string, which is expected to be in CH_UNIX encoding, to UTF16.
329 : *
330 : * This will be the same as the number of bytes in a string for single
331 : * byte strings, but will be different for multibyte.
332 : */
333 14875973 : _PUBLIC_ size_t strlen_m(const char *s)
334 : {
335 14875973 : return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
336 : }
337 :
338 : /**
339 : Work out the number of multibyte chars in a string, including the NULL
340 : terminator.
341 : **/
342 2254882 : _PUBLIC_ size_t strlen_m_term(const char *s)
343 : {
344 2254882 : return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
345 : }
346 :
347 : /*
348 : * Weird helper routine for the winreg pipe: If nothing is around, return 0,
349 : * if a string is there, include the terminator.
350 : */
351 :
352 932098 : _PUBLIC_ size_t strlen_m_term_null(const char *s)
353 : {
354 932098 : return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
355 : }
356 :
357 : /**
358 : Strchr and strrchr_m are a bit complex on general multi-byte strings.
359 : **/
360 319574479 : _PUBLIC_ char *strchr_m(const char *src, char c)
361 : {
362 2047577 : const char *s;
363 319574479 : struct smb_iconv_handle *ic = get_iconv_handle();
364 319574479 : if (src == NULL) {
365 0 : return NULL;
366 : }
367 : /* characters below 0x3F are guaranteed to not appear in
368 : non-initial position in multi-byte charsets */
369 319574479 : if ((c & 0xC0) == 0) {
370 94384474 : return strchr(src, c);
371 : }
372 :
373 : /* this is quite a common operation, so we want it to be
374 : fast. We optimise for the ascii case, knowing that all our
375 : supported multi-byte character sets are ascii-compatible
376 : (ie. they match for the first 128 chars) */
377 :
378 1566113932 : for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
379 1340988275 : if (*s == c)
380 64348 : return discard_const_p(char, s);
381 : }
382 :
383 225125657 : if (!*s)
384 223909844 : return NULL;
385 :
386 : #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
387 : /* With compose characters we must restart from the beginning. JRA. */
388 : s = src;
389 : #endif
390 :
391 4 : while (*s) {
392 3 : size_t size;
393 3 : codepoint_t c2 = next_codepoint_handle(ic, s, &size);
394 3 : if (c2 == c) {
395 0 : return discard_const_p(char, s);
396 : }
397 3 : s += size;
398 : }
399 :
400 0 : return NULL;
401 : }
402 :
403 : /**
404 : * Multibyte-character version of strrchr
405 : */
406 7845894 : _PUBLIC_ char *strrchr_m(const char *s, char c)
407 : {
408 38671 : struct smb_iconv_handle *ic;
409 7845894 : char *ret = NULL;
410 :
411 7845894 : if (s == NULL) {
412 0 : return NULL;
413 : }
414 :
415 : /* characters below 0x3F are guaranteed to not appear in
416 : non-initial position in multi-byte charsets */
417 7845894 : if ((c & 0xC0) == 0) {
418 7794119 : return strrchr(s, c);
419 : }
420 :
421 : /* this is quite a common operation, so we want it to be
422 : fast. We optimise for the ascii case, knowing that all our
423 : supported multi-byte character sets are ascii-compatible
424 : (ie. they match for the first 128 chars). Also, in Samba
425 : we only search for ascii characters in 'c' and that
426 : in all mb character sets with a compound character
427 : containing c, if 'c' is not a match at position
428 : p, then p[-1] > 0x7f. JRA. */
429 :
430 : {
431 51775 : size_t len = strlen(s);
432 51775 : const char *cp = s;
433 51775 : bool got_mb = false;
434 :
435 51775 : if (len == 0)
436 106 : return NULL;
437 51669 : cp += (len - 1);
438 1694 : do {
439 342614 : if (c == *cp) {
440 : /* Could be a match. Part of a multibyte ? */
441 33992 : if ((cp > s) &&
442 32124 : (((unsigned char)cp[-1]) & 0x80)) {
443 : /* Yep - go slow :-( */
444 0 : got_mb = true;
445 0 : break;
446 : }
447 : /* No - we have a match ! */
448 33811 : return discard_const_p(char , cp);
449 : }
450 308622 : } while (cp-- != s);
451 17642 : if (!got_mb)
452 17642 : return NULL;
453 : }
454 :
455 0 : ic = get_iconv_handle();
456 :
457 0 : while (*s) {
458 0 : size_t size;
459 0 : codepoint_t c2 = next_codepoint_handle(ic, s, &size);
460 0 : if (c2 == c) {
461 0 : ret = discard_const_p(char, s);
462 : }
463 0 : s += size;
464 : }
465 :
466 0 : return ret;
467 : }
468 :
469 : /**
470 : return True if any (multi-byte) character is lower case
471 : */
472 35 : _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
473 : const char *string)
474 : {
475 963 : while (*string) {
476 950 : size_t c_size;
477 950 : codepoint_t s;
478 950 : codepoint_t t;
479 :
480 950 : s = next_codepoint_handle(ic, string, &c_size);
481 950 : string += c_size;
482 :
483 950 : t = toupper_m(s);
484 :
485 950 : if (s != t) {
486 22 : return true; /* that means it has lower case chars */
487 : }
488 : }
489 :
490 0 : return false;
491 : }
492 :
493 17 : _PUBLIC_ bool strhaslower(const char *string)
494 : {
495 17 : struct smb_iconv_handle *ic = get_iconv_handle();
496 17 : return strhaslower_handle(ic, string);
497 : }
498 :
499 : /**
500 : return True if any (multi-byte) character is upper case
501 : */
502 35 : _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
503 : const char *string)
504 : {
505 954 : while (*string) {
506 941 : size_t c_size;
507 941 : codepoint_t s;
508 941 : codepoint_t t;
509 :
510 941 : s = next_codepoint_handle(ic, string, &c_size);
511 941 : string += c_size;
512 :
513 941 : t = tolower_m(s);
514 :
515 941 : if (s != t) {
516 22 : return true; /* that means it has upper case chars */
517 : }
518 : }
519 :
520 0 : return false;
521 : }
522 :
523 17 : _PUBLIC_ bool strhasupper(const char *string)
524 : {
525 17 : struct smb_iconv_handle *ic = get_iconv_handle();
526 17 : return strhasupper_handle(ic, string);
527 : }
528 :
529 : /***********************************************************************
530 : strstr_m - We convert via ucs2 for now.
531 : ***********************************************************************/
532 :
533 2491051 : char *strstr_m(const char *src, const char *findstr)
534 : {
535 2491051 : TALLOC_CTX *mem_ctx = NULL;
536 9793 : smb_ucs2_t *p;
537 9793 : smb_ucs2_t *src_w, *find_w;
538 9793 : const char *s;
539 9793 : char *s2;
540 2491051 : char *retp = NULL;
541 2491051 : size_t converted_size, findstr_len = 0;
542 :
543 : /* for correctness */
544 2491051 : if (!findstr[0]) {
545 0 : return discard_const_p(char, src);
546 : }
547 :
548 : /* Samba does single character findstr calls a *lot*. */
549 2491049 : if (findstr[1] == '\0')
550 106875 : return strchr_m(src, *findstr);
551 :
552 : /* We optimise for the ascii case, knowing that all our
553 : supported multi-byte character sets are ascii-compatible
554 : (ie. they match for the first 128 chars) */
555 :
556 47490108 : for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
557 45925833 : if (*s == *findstr) {
558 2464860 : if (!findstr_len)
559 1459322 : findstr_len = strlen(findstr);
560 :
561 2464860 : if (strncmp(s, findstr, findstr_len) == 0) {
562 819899 : return discard_const_p(char, s);
563 : }
564 : }
565 : }
566 :
567 1564275 : if (!*s)
568 1560141 : return NULL;
569 :
570 : #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
571 : /* 'make check' fails unless we do this */
572 :
573 : /* With compose characters we must restart from the beginning. JRA. */
574 9 : s = src;
575 : #endif
576 :
577 : /*
578 : * Use get_iconv_handle() just as a non-NULL talloc ctx. In
579 : * case we leak memory, this should then be more obvious in
580 : * the talloc report.
581 : */
582 9 : mem_ctx = talloc_new(get_iconv_handle());
583 9 : if (mem_ctx == NULL) {
584 0 : return NULL;
585 : }
586 :
587 9 : if (!push_ucs2_talloc(mem_ctx, &src_w, src, &converted_size)) {
588 0 : goto done;
589 : }
590 :
591 9 : if (!push_ucs2_talloc(mem_ctx, &find_w, findstr, &converted_size)) {
592 3 : goto done;
593 : }
594 :
595 6 : p = strstr_w(src_w, find_w);
596 :
597 6 : if (!p) {
598 3 : goto done;
599 : }
600 :
601 3 : *p = 0;
602 3 : if (!pull_ucs2_talloc(mem_ctx, &s2, src_w, &converted_size)) {
603 0 : goto done;
604 : }
605 3 : retp = discard_const_p(char, (s+strlen(s2)));
606 9 : done:
607 9 : TALLOC_FREE(mem_ctx);
608 9 : return retp;
609 : }
|