Logo Search packages:      
Sourcecode: x11-apps version File versions  Download package

rep.h

/*
 * Copyright (c) 2002 by The XFree86 Project, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Except as contained in this notice, the name of the XFree86 Project shall
 * not be used in advertising or otherwise to promote the sale, use or other
 * dealings in this Software without prior written authorization from the
 * XFree86 Project.
 *
 * Author: Paulo C├ęsar Pereira de Andrade
 */

/* $XFree86: xc/programs/xedit/lisp/re/rep.h,v 1.2 2002/11/15 07:01:33 paulo Exp $ */

#include "re.h"

#ifndef _rep_h
#define _rep_h

/*
 * Local defines
 */

#ifdef MIN
#undef MIN
#endif
#define MIN(a, b) ((a) < (b) ? (a) : (b))

#ifdef MAX
#undef MAX
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/*  This value can not be larger than 255, a depth value is the nesting of
 * repetition operations and alternatives. The number of nested parenthesis
 * does not matter, but a repetition on the pattern inside the parenthesis
 * does. Note also that you cannot have more than 9 parenthesis pairs in
 * an expression.
 *  Depth is always at least 1. So for MAX_DEPTH 8, it is only allowed
 * 7 complex repetitions. A complex repetition is a dot followed by an
 * repetition operator. It is called a complex repetition because dot
 * matches anything but the empty string, so the engine needs to test
 * all possible combinations until the end of the string is found.
 *  Repetitions like .* use one depth until the end of the string is found,
 * for example a.*b.*c.*d has depth 4, while a*b*c*d has depth 2.
 */
#define MAX_DEPTH 8

/*  Minimum number of strings to generate a "large" string list, that is,
 * sort the strings and allocate 512 extra bytes to map the first string
 * with a given initial byte. */
#define LARGE_STL_COUNT 16

/*
 * Local types
 */
/* Intermediate compilation types declaration */
      /* (r)egular (e)xpression (c)ompile (c)a(se) */
typedef struct _rec_cse rec_cse;

      /* (r)egular (e)xpression (c)ompile (r)a(ng)e */
typedef struct _rec_rng rec_rng;

      /* (r)egular (e)xpression (c)ompile (pat)tern */
typedef struct _rec_pat rec_pat;

      /* (r)egular (e)xpression (c)ompile (rep)etition */
typedef struct _rec_rep rec_rep;

      /* (r)egular (e)xpression (c)ompile (gr)ou(p) */
typedef struct _rec_grp rec_grp;

      /* (r)egular (e)xpression (c)ompile (alt)ernatives */
typedef struct _rec_alt rec_alt;


/* Optimization types */
      /* (r)egular (e)xpression (c)ompile (st)ring (l)ist */
typedef struct _rec_stl rec_stl;

/* Final compilation and execution types */
      /* (re)gular expression (inf)ormation */
typedef struct _re_inf re_inf;

      /* (re)gular expression (eng)ine */
typedef struct _re_eng re_eng;


/* Codes used by the engine */
typedef enum {
    /* Grouping */
    Re_Open,                  /* ( */
    Re_Close,                 /* ) */
    Re_Update,                /* Like Re_Close, but is inside a loop */

    /* Alternatives */
    Re_Alt,             /* Start alternative list, + next offset */
    Re_AltNext,               /* Next alternative, + next offset */
    Re_AltDone,               /* Finish alternative list */

    /* Repetition */
    Re_AnyTimes,        /* * */
    Re_Maybe,                 /* ? */
    Re_AtLeast,               /* +, at least one */

    /* Repetition like */
    Re_AnyAnyTimes,           /* .*<re> */
    Re_AnyMaybe,        /* .?<re> */
    Re_AnyAtLeast,            /* .+<re> */

    Re_AnyEatAnyTimes,        /* Expression ends with .* */
    Re_AnyEatMaybe,           /* Expression ends with .? */
    Re_AnyEatAtLeast,         /* Expression ends with .+ */

    /* Repetition with arguments */
    Re_Exact,                 /* {e} */
    Re_Min,             /* {n,} */
    Re_Max,             /* {,m} */
    Re_MinMax,                /* {n,m} */

    /* Repetition helper instruction */
    Re_RepJump,               /* Special code, go back to repetition */
    Re_RepLongJump,           /* Jump needs two bytes */
      /*  After the repetition data, all repetitions have an offset
       * to the code after the repetition */

    /* Matching */
    Re_Any,             /* . */
    Re_Odigit,                /* \o */
    Re_OdigitNot,       /* \O */
    Re_Digit,                 /* \d */
    Re_DigitNot,        /* \D */
    Re_Xdigit,                /* \x */
    Re_XdigitNot,       /* \x */
    Re_Space,                 /* \s */
    Re_SpaceNot,        /* \S */
    Re_Tab,             /* \t */
    Re_Newline,               /* \n */
    Re_Lower,                 /* \l */
    Re_Upper,                 /* \u */
    Re_Alnum,                 /* \w */
    Re_AlnumNot,        /* \W */
    Re_Control,               /* \c */
    Re_ControlNot,            /* \C */
    Re_Bol,             /* ^ */
    Re_Eol,             /* $ */
    Re_Bow,             /* < */
    Re_Eow,             /* > */

    /* Range matching information */
    Re_Range,                 /* + 256 bytes */
    Re_RangeNot,        /* + 256 bytes */

    /* Matching with arguments */
    Re_Literal,               /* + character */
    Re_CaseLiteral,           /* + lower + upper */
    Re_LiteralNot,            /* + character */
    Re_CaseLiteralNot,        /* + lower + upper */
    Re_String,                /* + length + string */
    Re_CaseString,            /* + length + string in format lower-upper */

    /* These are useful to start matching, or when RE_NOSPEC is used. */
    Re_SearchLiteral,
    Re_SearchCaseLiteral,
    Re_SearchString,
    Re_SearchCaseString,

    Re_StringList,            /* + total-length + lengths + strings */
    Re_CaseStringList,        /* + total-length + lengths + strings */

    Re_LargeStringList,       /* + total-length + lengths + map + strings */
    Re_LargeCaseStringList,   /* + total-length + lengths + map + strings */

    /* Backreference */
    Re_Backref,               /* + reference number */

    /* The last codes */
    Re_DoneIf,                /* Done if at end of input */
    Re_MaybeDone,       /* Done */
    Re_Done             /* If this code found, finished execution */
} ReCode;


/* (r)egular (e)xpresssion (pat)rern (t)ype */
typedef enum _rec_pat_t {
    Rep_Literal               = Re_Literal,
    Rep_CaseLiteral           = Re_CaseLiteral,
    Rep_LiteralNot            = Re_LiteralNot,
    Rep_CaseLiteralNot        = Re_CaseLiteralNot,
    Rep_Range                 = Re_Range,
    Rep_RangeNot        = Re_RangeNot,
    Rep_String                = Re_String,
    Rep_CaseString            = Re_CaseString,
    Rep_SearchLiteral         = Re_SearchLiteral,
    Rep_SearchCaseLiteral     = Re_SearchCaseLiteral,
    Rep_SearchString          = Re_SearchString,
    Rep_SearchCaseString      = Re_SearchCaseString,
    Rep_Any             = Re_Any,
    Rep_AnyAnyTimes           = Re_AnyAnyTimes,
    Rep_AnyEatAnyTimes        = Re_AnyEatAnyTimes,
    Rep_AnyMaybe        = Re_AnyMaybe,
    Rep_AnyEatMaybe           = Re_AnyEatMaybe,
    Rep_AnyAtLeast            = Re_AnyAtLeast,
    Rep_AnyEatAtLeast         = Re_AnyEatAtLeast,
    Rep_Odigit                = Re_Odigit,
    Rep_OdigitNot       = Re_OdigitNot,
    Rep_Digit                 = Re_Digit,
    Rep_DigitNot        = Re_DigitNot,
    Rep_Xdigit                = Re_Xdigit,
    Rep_XdigitNot       = Re_XdigitNot,
    Rep_Space                 = Re_Space,
    Rep_SpaceNot        = Re_SpaceNot,
    Rep_Tab             = Re_Tab,
    Rep_Newline               = Re_Newline,
    Rep_Lower                 = Re_Lower,
    Rep_Upper                 = Re_Upper,
    Rep_Alnum                 = Re_Alnum,
    Rep_AlnumNot        = Re_AlnumNot,
    Rep_Control               = Re_Control,
    Rep_ControlNot            = Re_ControlNot,
    Rep_Bol             = Re_Bol,
    Rep_Eol             = Re_Eol,
    Rep_Bow             = Re_Bow,
    Rep_Eow             = Re_Eow,
    Rep_Backref               = Re_Backref,
    Rep_StringList            = Re_StringList,
    Rep_Group                 = Re_Open
} rec_pat_t;


/* (r)egular (e)xpression (rep)etition (t)ype */
typedef enum _rec_rep_t {
    Rer_AnyTimes        = Re_AnyTimes,
    Rer_AtLeast               = Re_AtLeast,
    Rer_Maybe                 = Re_Maybe,
    Rer_Exact                 = Re_Exact,
    Rer_Min             = Re_Min,
    Rer_Max             = Re_Max,
    Rer_MinMax                = Re_MinMax
} rec_rep_t;


/*  Decide at re compilation time what is lowercase and what is uppercase */
struct _rec_cse {
    unsigned char lower;
    unsigned char upper;
};


/*  A rec_rng is used only during compilation, just a character map */
struct _rec_rng {
    unsigned char range[256];
};


/*  A rec_pat is used only during compilation, and can be viewed as
 * a regular expression element like a match to any character, a match
 * to the beginning or end of the line, etc.
 *  It is implemented as a linked list, and does not have nesting.
 *  The data field can contain:
 *    chr:  the value of a single character to match.
 *    cse:  the upper and lower case value of a character to match.
 *    rng:  a character map to match or not match.
 *    str:  a simple string or a string where every two bytes
 *          represents the character to match, in lower/upper
 *          case sequence.
 *  The rep field is not used for strings, strings are broken in the
 * last character in this case. That is, strings are just a concatenation
 * of several character matches.
 */
struct _rec_pat {
    rec_pat_t type;
    rec_pat *next, *prev;     /* Linked list information */
    union {
      unsigned char chr;
      rec_cse cse;
      rec_rng *rng;
      rec_grp *grp;
      unsigned char *str;
      rec_stl *stl;
    } data;
    rec_rep *rep;       /* Pattern repetition information */
};


/*  A rec_rep is used only during compilation, and can be viewed as:
 *
 *    ? or * or + or {<e>} or {<m>,} or {,<M>} or {<m>,<M>}
 *
 * where <e> is "exact", <m> is "minimum" and <M> is "maximum".
 *  In the compiled step it can also be just a NULL pointer, that
 * is actually equivalent to {1}.
 */
struct _rec_rep {
    rec_rep_t type;
    short mine;               /* minimum or exact number of matches */
    short maxc;               /* maximum number of matches */
};


/*  A rec_alt is used only during compilation, and can be viewed as:
 *
 *    <re>|<re>
 *
 * where <re> is any regular expression. The expressions are nested
 * using the grp field of the rec_pat structure.
 */
struct _rec_alt {
    rec_alt *next, *prev;     /* Linked list information */
    rec_pat *pat;
};


/*  A rec_grp is a place holder for expressions enclosed in parenthesis
 * and is linked to the compilation data by an rec_pat structure. */
struct _rec_grp {
    rec_pat *parent;          /* Reference to parent pattern */
    rec_alt *alt;       /* The pattern information */
    rec_alt *palt;            /* Parent alternative */
    rec_grp *pgrp;            /* Nested groups */
    int comp;                 /* (comp)lex repetition pattern inside group */
};


/* Optimization compilation types definition */
      /* (r)egular (e)xpression (c)ompile (st)ring (l)ist (t)ype */
typedef enum {
    Resl_StringList           = Re_StringList,
    Resl_CaseStringList       = Re_CaseStringList
} rec_stl_t;

struct _rec_stl {
    rec_stl_t type;
    int nstrs;                /* Number of strings in list */
    int tlen;                 /* Total length of all strings */
    unsigned char *lens;      /* Vector of string lengths */
    unsigned char **strs;     /* The strings */
};


/*
 * Prototypes
 */
      /* rep.c */
rec_alt *irec_comp(const char*, const char*, int, int*);
void irec_free_alt(rec_alt*);

      /* reo.c */
int orec_comp(rec_alt*, int);
void orec_free_stl(rec_stl*);

#endif /* _rep_h */

Generated by  Doxygen 1.6.0   Back to index