Fix for negated class escape misinterpretation in regex character classes. See the following bug entry: http://sourceforge.net/tracker/index.php?func=detail&aid=1760116&group_id=11005&atid=111005 [ 1760116 ] Negated escape sequences misinterpreted in character class. This fix simply adds a few more character tables so that the characters of a negated charset can be added to the []-bracketed custom charset, as is already the case for the positive charset escapes. Interestingly, I notice that (?n\W) does not match newlines, whereas with my patch (?n[\W]) does, which is rather inconsistent. The same is true of \L and \D. Also, \y will match newline even without (?n ) around it. I believe these to be faults. diff -u nedit_official/source/regularExp.c nedit_mod/source/regularExp.c --- nedit_official/source/regularExp.c 2006-08-13 23:47:45.000000000 +0200 +++ nedit_mod/source/regularExp.c 2007-07-26 12:58:20.852041600 +0200 @@ -486,6 +486,14 @@ static unsigned char ASCII_Digits [] = "0123456789"; /* Same for all */ /* locales. */ + +static unsigned char NWhite_Space [ALNUM_CHAR_SIZE]; /* negated classes */ +static unsigned char NWord_Char [ALNUM_CHAR_SIZE]; +static unsigned char NLetter_Char [ALNUM_CHAR_SIZE]; + +static unsigned char NASCII_Digits [ALNUM_CHAR_SIZE];/* Same for all */ + /* locales. */ + static int Is_Case_Insensitive; static int Match_Newline; @@ -2277,97 +2285,70 @@ return ret_val; /* Just checking if this is a valid shortcut escape. */ } - switch (c) { - case 'd': - case 'D': - if (emit == EMIT_CLASS_BYTES) { - class = ASCII_Digits; - } else if (emit == EMIT_NODE) { - ret_val = (islower (c) ? emit_node (DIGIT) - : emit_node (NOT_DIGIT)); - } - - break; - - case 'l': - case 'L': - if (emit == EMIT_CLASS_BYTES) { - class = Letter_Char; - } else if (emit == EMIT_NODE) { - ret_val = (islower (c) ? 
emit_node (LETTER) - : emit_node (NOT_LETTER)); - } - - break; - - case 's': - case 'S': - if (emit == EMIT_CLASS_BYTES) { - if (Match_Newline) emit_byte ('\n'); - - class = White_Space; - } else if (emit == EMIT_NODE) { - if (Match_Newline) { - ret_val = (islower (c) ? emit_node (SPACE_NL) - : emit_node (NOT_SPACE_NL)); - } else { - ret_val = (islower (c) ? emit_node (SPACE) - : emit_node (NOT_SPACE)); - } - } - - break; - - case 'w': - case 'W': - if (emit == EMIT_CLASS_BYTES) { - class = Word_Char; - } else if (emit == EMIT_NODE) { - ret_val = (islower (c) ? emit_node (WORD_CHAR) - : emit_node (NOT_WORD_CHAR)); - } - - break; - - /* Since the delimiter table is not available at regex compile time \B, - \Y and \Y can only generate a node. At run time, the delimiter table - will be available for these nodes to use. */ - - case 'y': - - if (emit == EMIT_NODE) { - ret_val = emit_node (IS_DELIM); - } else { - REG_FAIL ("internal error #5 `shortcut_escape\'"); - } - - break; - - case 'Y': - - if (emit == EMIT_NODE) { - ret_val = emit_node (NOT_DELIM); - } else { - REG_FAIL ("internal error #6 `shortcut_escape\'"); + if (emit == EMIT_CLASS_BYTES) { + /* we need to add '\n' for classes non-digit \D, non-letter \L, + non-word \W and whitespace \s if matching newlines */ + if (Match_Newline) { + switch (c) { + case 'D': + case 'L': + case 's': + case 'W': + emit_byte ('\n'); + break; + default: + break; } + } + /* now provide all the extra characters required for the class */ + switch (c) { + case 'd': class = ASCII_Digits; break; + case 'D': class = NASCII_Digits; break; + case 'l': class = Letter_Char; break; + case 'L': class = NLetter_Char; break; + case 's': class = White_Space; break; + case 'S': class = NWhite_Space; break; + case 'w': class = Word_Char; break; + case 'W': class = NWord_Char; break; + + /* Since the delimiter table is not available at regex compile time + \B, \Y and \Y can only generate a node. 
At run time, the delimiter + table will be available for these nodes to use. */ + case 'y': REG_FAIL ("internal error #5 `shortcut_escape\'"); + case 'Y': REG_FAIL ("internal error #6 `shortcut_escape\'"); + case 'B': REG_FAIL ("internal error #7 `shortcut_escape\'"); + default: REG_FAIL ("internal error #8 `shortcut_escape\'"); + } + } + else if (emit == EMIT_NODE) { + switch (c) { + case 'd': ret_val = emit_node (DIGIT); break; + case 'D': ret_val = emit_node (NOT_DIGIT); break; + + case 'l': ret_val = emit_node (LETTER); break; + case 'L': ret_val = emit_node (NOT_LETTER); break; + + case 's': ret_val = Match_Newline ? emit_node (SPACE_NL) + : emit_node (SPACE); + break; + case 'S': ret_val = Match_Newline ? emit_node (NOT_SPACE_NL) + : emit_node (NOT_SPACE); + break; - break; + case 'w': ret_val = emit_node (WORD_CHAR); break; + case 'W': ret_val = emit_node (NOT_WORD_CHAR); break; - case 'B': + case 'y': ret_val = emit_node (IS_DELIM); break; + case 'Y': ret_val = emit_node (NOT_DELIM); break; - if (emit == EMIT_NODE) { - ret_val = emit_node (NOT_BOUNDARY); - } else { - REG_FAIL ("internal error #7 `shortcut_escape\'"); - } + case 'B': ret_val = emit_node (NOT_BOUNDARY); break; - break; - - default: - /* We get here if there isn't a case for every character in - the string "codes" */ + default: + /* We get here if there isn't a case for every character in + the string "codes" */ - REG_FAIL ("internal error #8 `shortcut_escape\'"); + REG_FAIL ("internal error #8 `shortcut_escape\'"); + } } if (emit == EMIT_NODE && c != 'B') { @@ -2935,46 +2916,73 @@ static int initialized = 0; static int underscore = (int) '_'; - int i, word_count, letter_count, space_count; + int i, word_count, letter_count, space_count, + nword_count, nletter_count, nspace_count, ndigit_count; if (!initialized) { initialized = 1; /* Only need to generate character sets once. 
*/ word_count = 0; letter_count = 0; space_count = 0; + nword_count = 0; + nletter_count = 0; + nspace_count = 0; + ndigit_count = 0; + + /* for every 8-bit value except '\0' == 0 */ + for (i = 1; i <= (int)UCHAR_MAX; i++) { + /* Note: Whether or not newline is considered to be a member of a class + is handled by switches within the original regex and is thus omitted + here. */ + if (i == (int) '\n') + continue; - for (i = 1; i < (int)UCHAR_MAX; i++) { if (isalnum (i) || i == underscore) { Word_Char [word_count++] = (unsigned char) i; + } else { + NWord_Char [nword_count++] = (unsigned char) i; } if (isalpha (i)) { Letter_Char [letter_count++] = (unsigned char) i; + } else { + NLetter_Char [nletter_count++] = (unsigned char) i; } - /* Note: Whether or not newline is considered to be whitespace is - handled by switches within the original regex and is thus omitted - here. */ - - if (isspace (i) && (i != (int) '\n')) { + if (isspace (i)) { White_Space [space_count++] = (unsigned char) i; + } else { + NWhite_Space [nspace_count++] = (unsigned char) i; + } + + if (strchr ((char *) ASCII_Digits, i) == NULL) { + NASCII_Digits[ndigit_count++] = (unsigned char) i; } /* Make sure arrays are big enough. ("- 2" because of zero array origin and we need to leave room for the NULL terminator.) 
*/ - if (word_count > (ALNUM_CHAR_SIZE - 2) || - space_count > (WHITE_SPACE_SIZE - 2) || - letter_count > (ALNUM_CHAR_SIZE - 2)) { + if (word_count > (ALNUM_CHAR_SIZE - 2) || + space_count > (WHITE_SPACE_SIZE - 2) || + letter_count > (ALNUM_CHAR_SIZE - 2) || + nword_count > (ALNUM_CHAR_SIZE - 2) || + nspace_count > (ALNUM_CHAR_SIZE - 2) || + nletter_count > (ALNUM_CHAR_SIZE - 2) || + ndigit_count > (ALNUM_CHAR_SIZE - 2)) { reg_error ("internal error #9 `init_ansi_classes\'"); return (0); } } - Word_Char [word_count] = '\0'; - Letter_Char [word_count] = '\0'; - White_Space [space_count] = '\0'; + Word_Char [word_count] = '\0'; + Letter_Char [letter_count] = '\0'; + White_Space [space_count] = '\0'; + + NWord_Char [nword_count] = '\0'; + NLetter_Char [nletter_count] = '\0'; + NWhite_Space [nspace_count] = '\0'; + NASCII_Digits [ndigit_count] = '\0'; } return (1);