C Regular Expressions: Extracting the Actual Matches

前端 未结 2 1824
不思量自难忘°
不思量自难忘° 2020-12-15 13:30

I am using regular expressions in C (using the "regex.h" library). After setting up the standard calls (and checks) for regcomp(...) and regexec(...), I can only manage t…

相关标签:
2条回答
  • 2020-12-15 13:39

    Since g++ regex is bugged until who knows when, you can use my code instead (License: AGPL, no warranty, your own risk, ...)

    /**
     * regexp (License: AGPL3 or higher)
     *
     * Compiles an extended POSIX regular expression, runs it once against
     * str, and returns private copies of the overall match and of every
     * submatch. Neither re nor str is modified.
     *
     * @param re extended POSIX regular expression
     * @param nmatch maximum number of matches to report (the full match
     *        counts as one); negative values are treated as zero
     * @param str string to match
     * @return NULL if an argument is NULL, the regex does not compile, str
     *         does not match, or memory runs out. Otherwise an array of
     *         nmatch+1 char pointers: element 0 is the string storage that
     *         backs every other element, element 1 is the string matching
     *         the full regex, then come the submatches. Groups that did not
     *         take part in the match are returned as "". You have to free()
     *         the first element (string storage) and then the array itself.
     */
    char **regexp(char *re, int nmatch, char *str) {
      char **result = NULL;
      char *storage;
      char *cursor;
      regex_t regex;
      regmatch_t *match = NULL;
      size_t count = nmatch > 0 ? (size_t)nmatch : 0;
      size_t total = 1;   /* storage[0] holds a shared "" for unmatched groups */
      size_t len;
      size_t i;

      if (!re || !str) {
        return NULL;
      }

      if (regcomp(&regex, re, REG_EXTENDED)!=0) {
        fprintf(stderr, "Failed to compile regex '%s'\n", re);
        return NULL;
      }

      /* calloc() checks count*size for overflow; always ask for at least one
         element so that a NULL return unambiguously means failure. */
      match=calloc(count ? count : 1, sizeof(*match));
      result=calloc(count+1, sizeof(*result));
      if (!match || !result) {
        fprintf(stderr, "Out of memory !");
        goto fail;
      }

      if (regexec(&regex,str,count,match,0)) {
    #ifdef DEBUG
        fprintf(stderr, "String '%s' does not match regex '%s'\n",str,re);
    #endif
        goto fail;
      }

      /* First pass: size one buffer big enough for every NUL-terminated copy. */
      for (i=0; i<count; ++i) {
        if (match[i].rm_so>=0) {
          total+=(size_t)(match[i].rm_eo-match[i].rm_so)+1;
        }
      }

      storage=malloc(total);
      if (!storage) {
        fprintf(stderr, "Out of memory !");
        goto fail;
      }
      storage[0]='\0';
      cursor=storage+1;
      result[0]=storage;

      /* Second pass: copy each match out of str. Copying (instead of writing
         NULs into a duplicate of str) keeps overlapping matches intact: a
         submatch always lies inside the full match. */
      for (i=0; i<count; ++i) {
        if (match[i].rm_so>=0) {
          len=(size_t)(match[i].rm_eo-match[i].rm_so);
          memcpy(cursor, str+match[i].rm_so, len);
          cursor[len]='\0';
          result[i+1]=cursor;
    #ifdef DEBUG
          printf("%s\n",cursor);
    #endif
          cursor+=len+1;
        } else {
          result[i+1]=storage;   /* the shared empty string */
        }
      }

      free(match);
      regfree(&regex);
      return result;

    fail:
      free(result);
      free(match);
      regfree(&regex);
      return NULL;
    }
    
    0 讨论(0)
  • 2020-12-15 13:54

    There are quite a lot of regular expression packages, but yours seems to match the one in POSIX: regcomp() etc.

    The two structures it defines in <regex.h> are:

    • regex_t containing at least size_t re_nsub, the number of parenthesized subexpressions.

    • regmatch_t containing at least regoff_t rm_so, the byte offset from start of string to start of substring, and regoff_t rm_eo, the byte offset from start of string of the first character after the end of substring.

    Note that 'offsets' are not pointers but indexes into the character array.

    The execution function is:

    • int regexec(const regex_t *restrict preg, const char *restrict string, size_t nmatch, regmatch_t pmatch[restrict], int eflags);

    Your printing code should be:

    // Visit match slots 0 through r.re_nsub inclusive and, for each one, format
    // the matched text into matches[ind] and print it.
    // Presumably r is the compiled regex_t, m the regmatch_t array filled in by
    // regexec(), and p the string that was searched -- their declarations are
    // not part of this fragment; confirm against the question's code.
    // NOTE(review): no check here for a slot with rm_so == -1 (POSIX uses that
    // for a group that did not participate in the match) -- confirm whether the
    // pattern in use can produce one.
    for (int i = 0; i <= r.re_nsub; i++)
    {
        // Byte offsets into p: start of the match and one past its end.
        int start = m[i].rm_so;
        int finish = m[i].rm_eo;
    //  strcpy(matches[ind], ("%.*s\n", (finish - start), p + start));  // Based on question
        // "%.*s" takes the length as an int argument, so exactly
        // (finish - start) bytes starting at p + start are formatted.
        sprintf(matches[ind], "%.*s\n", (finish - start), p + start);   // More plausible code
        printf("Storing:  %.*s\n", (finish - start), matches[ind]);     // Print once
        ind++;
        printf("%.*s\n", (finish - start), p + start);                  // Why print twice?
    }
    

    Note that the code should be upgraded to ensure that the string copy (via sprintf()) does not overflow the target string — maybe by using snprintf() instead of sprintf(). It is also a good idea to mark the start and end of a string in the printing. For example:

        printf("<<%.*s>>\n", (finish - start), p + start);
    

    This makes it a whole heap easier to see spaces etc.

    [In future, please attempt to provide an MCVE (Minimal, Complete, Verifiable Example) or SSCCE (Short, Self-Contained, Correct Example) so that people can help more easily.]

    This is an SSCCE that I created, probably in response to another SO question in 2010. It is one of a number of programs I keep that I call 'vignettes'; little programs that show the essence of some feature (such as POSIX regexes, in this case). I find them useful as memory joggers.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <errno.h>
    #include <regex.h>
    
    #define tofind    "^DAEMONS=\\(([^)]*)\\)[ \t]*$"
    
    /*
     * Scan a file line by line for the pattern in `tofind` and print, for each
     * matching line, the line itself, the full match, the captured text, and
     * each space-separated name inside the capture.
     *
     * The file defaults to /etc/rc.conf; argv[1], when given, overrides it.
     * Returns EXIT_FAILURE if the regex does not compile or the file cannot be
     * opened, EXIT_SUCCESS otherwise. The compiled regex and the file are
     * released on every path after they have been acquired.
     */
    int main(int argc, char **argv)
    {
        FILE *fp;
        char line[1024];
        regex_t re;
        regmatch_t rm[2];
        //this file has this line "DAEMONS=(sysklogd network sshd !netfs !crond)"
        const char *filename = "/etc/rc.conf";

        if (argc > 1)
            filename = argv[1];

        if (regcomp(&re, tofind, REG_EXTENDED) != 0)
        {
            fprintf(stderr, "Failed to compile regex '%s'\n", tofind);
            return EXIT_FAILURE;
        }
        printf("Regex: %s\n", tofind);
        printf("Number of captured expressions: %zu\n", re.re_nsub);

        fp = fopen(filename, "r");
        if (fp == NULL)
        {
            // Capture errno before any other library call can overwrite it.
            int saved_errno = errno;
            fprintf(stderr, "Failed to open file %s (%d: %s)\n", filename, saved_errno, strerror(saved_errno));
            regfree(&re);
            return EXIT_FAILURE;
        }

        while (fgets(line, sizeof(line), fp) != NULL)
        {
            // Drop the trailing newline (if any) so that '$' can match.
            line[strcspn(line, "\n")] = '\0';
            if (regexec(&re, line, 2, rm, 0) != 0)
                continue;

            printf("<<%s>>\n", line);
            // Complete match
            printf("Line: <<%.*s>>\n", (int)(rm[0].rm_eo - rm[0].rm_so), line + rm[0].rm_so);
            // Match captured in (...) - the \( and \) match literal parenthesis
            printf("Text: <<%.*s>>\n", (int)(rm[1].rm_eo - rm[1].rm_so), line + rm[1].rm_so);

            // Split the captured text on spaces, never reading past its end.
            char *src = line + rm[1].rm_so;
            char *end = line + rm[1].rm_eo;
            while (src < end)
            {
                size_t len = strcspn(src, " ");
                if (src + len > end)
                    len = (size_t)(end - src);
                printf("Name: <<%.*s>>\n", (int)len, src);
                src += len;
                src += strspn(src, " ");
            }
        }

        fclose(fp);
        regfree(&re);
        return EXIT_SUCCESS;
    }
    

    This was designed to find a particular line starting DAEMONS= in a file /etc/rc.conf (but you can specify an alternative file name on the command line). You can adapt it to your purposes easily enough.

    0 讨论(0)
提交回复
热议问题