/** * @file regex_match.c * @author Ambroz Bizjak <ambrop7@gmail.com> * * @section LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * @section DESCRIPTION * * Regular expression matching module. * * Synopsis: * regex_match(string input, string regex) * * Variables: * succeeded - "true" or "false", indicating whether input matched regex * matchN - for N=0,1,2,..., the matching data for the N-th subexpression * (match0 = whole match) * * Description: * Matches 'input' with the POSIX extended regular expression 'regex'. * 'regex' must be a string without null bytes, but 'input' can contain null bytes. * However, it's difficult, if not impossible, to actually match nulls with the regular * expression. * The input and regex strings are interpreted according to the POSIX regex functions * (regcomp(), regexec()); in particular, the current locale setting affects the * interpretation. * * Synopsis: * regex_replace(string input, list(string) regex, list(string) replace) * * Variables: * string (empty) - transformed input * * Description: * Replaces matching parts of a string. Replacement is performed by repetedly matching * the remaining part of the string with all regular expressions. On each step, out of * all regular expressions that match the remainder of the string, the one whose match * starts at the least position wins, and the matching part is replaced with the * replacement string corresponding to this regular expression. The process continues * from the end of the just-replaced portion until no more regular expressions match. * If multiple regular expressions match at the least position, the one that appears * first in the 'regex' argument wins. */ #include <stdlib.h> #include <string.h> #include <limits.h> #include <regex.h> #include <misc/string_begins_with.h> #include <misc/parse_number.h> #include <misc/expstring.h> #include <misc/debug.h> #include <misc/balloc.h> #include <ncd/NCDModule.h> #include <ncd/extra/value_utils.h> #include <generated/blog_channel_ncd_regex_match.h> #define ModuleLog(i, ...) NCDModuleInst_Backend_Log((i), BLOG_CURRENT_CHANNEL, __VA_ARGS__) #define MAX_MATCHES 64 struct instance { NCDModuleInst *i; const char *input; size_t input_len; int succeeded; int num_matches; regmatch_t matches[MAX_MATCHES]; }; struct replace_instance { NCDModuleInst *i; char *output; size_t output_len; }; static void func_new (void *vo, NCDModuleInst *i, const struct NCDModuleInst_new_params *params) { struct instance *o = vo; o->i = i; // read arguments NCDValRef input_arg; NCDValRef regex_arg; if (!NCDVal_ListRead(params->args, 2, &input_arg, ®ex_arg)) { ModuleLog(o->i, BLOG_ERROR, "wrong arity"); goto fail0; } if (!NCDVal_IsString(input_arg) || !NCDVal_IsStringNoNulls(regex_arg)) { ModuleLog(o->i, BLOG_ERROR, "wrong type"); goto fail0; } o->input = NCDVal_StringData(input_arg); o->input_len = NCDVal_StringLength(input_arg); // make sure we don't overflow regoff_t if (o->input_len > INT_MAX) { ModuleLog(o->i, BLOG_ERROR, "input string too long"); goto fail0; } // null terminate regex NCDValNullTermString regex_nts; if (!NCDVal_StringNullTerminate(regex_arg, ®ex_nts)) { ModuleLog(i, BLOG_ERROR, "NCDVal_StringNullTerminate failed"); goto fail0; } // compile regex regex_t preg; int ret = regcomp(&preg, regex_nts.data, REG_EXTENDED); NCDValNullTermString_Free(®ex_nts); if (ret != 0) { ModuleLog(o->i, BLOG_ERROR, "regcomp failed (error=%d)", ret); goto fail0; } // execute match o->matches[0].rm_so = 0; o->matches[0].rm_eo = o->input_len; o->succeeded = (regexec(&preg, o->input, MAX_MATCHES, o->matches, REG_STARTEND) == 0); // free regex regfree(&preg); // signal up NCDModuleInst_Backend_Up(o->i); return; fail0: NCDModuleInst_Backend_DeadError(i); } static int func_getvar (void *vo, const char *name, NCDValMem *mem, NCDValRef *out) { struct instance *o = vo; if (!strcmp(name, "succeeded")) { *out = ncd_make_boolean(mem, o->succeeded, o->i->params->iparams->string_index); return 1; } size_t pos; uintmax_t n; if ((pos = string_begins_with(name, "match")) && parse_unsigned_integer(name + pos, &n)) { if (o->succeeded && n < MAX_MATCHES && o->matches[n].rm_so >= 0) { regmatch_t *m = &o->matches[n]; ASSERT(m->rm_so <= o->input_len) ASSERT(m->rm_eo >= m->rm_so) ASSERT(m->rm_eo <= o->input_len) size_t len = m->rm_eo - m->rm_so; *out = NCDVal_NewStringBin(mem, (uint8_t *)o->input + m->rm_so, len); return 1; } } return 0; } static void replace_func_new (void *vo, NCDModuleInst *i, const struct NCDModuleInst_new_params *params) { struct replace_instance *o = vo; o->i = i; // read arguments NCDValRef input_arg; NCDValRef regex_arg; NCDValRef replace_arg; if (!NCDVal_ListRead(params->args, 3, &input_arg, ®ex_arg, &replace_arg)) { ModuleLog(i, BLOG_ERROR, "wrong arity"); goto fail1; } if (!NCDVal_IsString(input_arg) || !NCDVal_IsList(regex_arg) || !NCDVal_IsList(replace_arg)) { ModuleLog(i, BLOG_ERROR, "wrong type"); goto fail1; } // check number of regex/replace if (NCDVal_ListCount(regex_arg) != NCDVal_ListCount(replace_arg)) { ModuleLog(i, BLOG_ERROR, "number of regex's is not the same as number of replacements"); goto fail1; } size_t num_regex = NCDVal_ListCount(regex_arg); // allocate array for compiled regex's regex_t *regs = BAllocArray(num_regex, sizeof(regs[0])); if (!regs) { ModuleLog(i, BLOG_ERROR, "BAllocArray failed"); goto fail1; } size_t num_done_regex = 0; // compile regex's, check arguments while (num_done_regex < num_regex) { NCDValRef regex = NCDVal_ListGet(regex_arg, num_done_regex); NCDValRef replace = NCDVal_ListGet(replace_arg, num_done_regex); if (!NCDVal_IsStringNoNulls(regex) || !NCDVal_IsString(replace)) { ModuleLog(i, BLOG_ERROR, "wrong regex/replace type for pair %zu", num_done_regex); goto fail2; } // null terminate regex NCDValNullTermString regex_nts; if (!NCDVal_StringNullTerminate(regex, ®ex_nts)) { ModuleLog(i, BLOG_ERROR, "NCDVal_StringNullTerminate failed"); goto fail2; } int res = regcomp(®s[num_done_regex], regex_nts.data, REG_EXTENDED); NCDValNullTermString_Free(®ex_nts); if (res != 0) { ModuleLog(i, BLOG_ERROR, "regcomp failed for pair %zu (error=%d)", num_done_regex, res); goto fail2; } num_done_regex++; } // init output string ExpString out; if (!ExpString_Init(&out)) { ModuleLog(i, BLOG_ERROR, "ExpString_Init failed"); goto fail2; } // input state const char *in = NCDVal_StringData(input_arg); size_t in_pos = 0; size_t in_len = NCDVal_StringLength(input_arg); // process input while (in_pos < in_len) { // find first match int have_match = 0; size_t match_regex = 0; // to remove warning regmatch_t match = {0, 0}; // to remove warning for (size_t j = 0; j < num_regex; j++) { regmatch_t this_match; this_match.rm_so = 0; this_match.rm_eo = in_len - in_pos; if (regexec(®s[j], in + in_pos, 1, &this_match, REG_STARTEND) == 0 && (!have_match || this_match.rm_so < match.rm_so)) { have_match = 1; match_regex = j; match = this_match; } } // if no match, append remaining data and finish if (!have_match) { if (!ExpString_AppendBinary(&out, (const uint8_t *)in + in_pos, in_len - in_pos)) { ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinary failed"); goto fail3; } break; } // append data before match if (!ExpString_AppendBinary(&out, (const uint8_t *)in + in_pos, match.rm_so)) { ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinary failed"); goto fail3; } // append replacement data NCDValRef replace = NCDVal_ListGet(replace_arg, match_regex); if (!ExpString_AppendBinary(&out, (const uint8_t *)NCDVal_StringData(replace), NCDVal_StringLength(replace))) { ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinary failed"); goto fail3; } in_pos += match.rm_eo; } // set output o->output = ExpString_Get(&out); o->output_len = ExpString_Length(&out); // free compiled regex's while (num_done_regex-- > 0) { regfree(®s[num_done_regex]); } // free array BFree(regs); // signal up NCDModuleInst_Backend_Up(i); return; fail3: ExpString_Free(&out); fail2: while (num_done_regex-- > 0) { regfree(®s[num_done_regex]); } BFree(regs); fail1: NCDModuleInst_Backend_DeadError(i); } static void replace_func_die (void *vo) { struct replace_instance *o = vo; // free output BFree(o->output); NCDModuleInst_Backend_Dead(o->i); } static int replace_func_getvar (void *vo, const char *name, NCDValMem *mem, NCDValRef *out) { struct replace_instance *o = vo; if (!strcmp(name, "")) { *out = NCDVal_NewStringBin(mem, (uint8_t *)o->output, o->output_len); return 1; } return 0; } static struct NCDModule modules[] = { { .type = "regex_match", .func_new2 = func_new, .func_getvar = func_getvar, .alloc_size = sizeof(struct instance) }, { .type = "regex_replace", .func_new2 = replace_func_new, .func_die = replace_func_die, .func_getvar = replace_func_getvar, .alloc_size = sizeof(struct replace_instance) }, { .type = NULL } }; const struct NCDModuleGroup ncdmodule_regex_match = { .modules = modules };