Logo Search packages:      
Sourcecode: x11-apps version File versions  Download package

reo.c

/*
 * Copyright (c) 2002 by The XFree86 Project, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Except as contained in this notice, the name of the XFree86 Project shall
 * not be used in advertising or otherwise to promote the sale, use or other
 * dealings in this Software without prior written authorization from the
 * XFree86 Project.
 *
 * Author: Paulo C├ęsar Pereira de Andrade
 */

/* $XFree86: xc/programs/xedit/lisp/re/reo.c,v 1.8 2002/09/29 02:55:01 paulo Exp $ */

#include "rep.h"

/*
 *  This file is a placeholder to add code to analyse and optimize the
 * intermediate data structure generated in rep.c.
 *  Character ranges are optimized while being generated.
 */

/*
 * Types
 */
typedef struct _orec_inf {
    rec_alt *alt;             /* Main alternatives list */
    rec_grp *grp;             /* Current group pointer */
    int flags;
    int ecode;
} orec_inf;

/*
 * Prototypes
 */
static int orec_alt(orec_inf*, rec_alt*);
static int orec_pat(orec_inf*, rec_pat*);
static int orec_grp(orec_inf*, rec_grp*);
static int orec_pat_bad_rpt(orec_inf*, rec_pat*);
static int orec_pat_bad_forward_rpt(orec_inf*, rec_pat*);
static int orec_pat_rng(orec_inf*, rec_pat*);
static int orec_pat_cse(orec_inf*, rec_pat*);
static int orec_pat_cse_can(orec_inf*, rec_pat*);
static int orec_str_list(orec_inf*, rec_alt*, int, int);

/*
 * Initialization
 */
extern unsigned char re__alnum[256];
extern unsigned char re__odigit[256];
extern unsigned char re__ddigit[256];
extern unsigned char re__xdigit[256];
extern unsigned char re__control[256];

/*
 * Implementation
 */
int
orec_comp(rec_alt *alt, int flags)
{
    orec_inf inf;

    inf.alt = alt;
    inf.grp = NULL;
    inf.flags = flags;
    inf.ecode = 0;

    orec_alt(&inf, alt);

    return (inf.ecode);
}

void
orec_free_stl(rec_stl *stl)
{
    int i;

    for (i = 0; i < stl->nstrs; i++) {
      if (stl->lens[i] > 2)
          free(stl->strs[i]);
    }

    free(stl->lens);
    free(stl->strs);
    free(stl);
}


static int
orec_alt(orec_inf *inf, rec_alt *alt)
{
    if (alt) {
      rec_alt *ptr = alt;
      int ret, count = 0, str = 1, cstr = 1, lits = 0, clits = 0;

      /* Check if can build a string list */
      if (ptr->next) {
          /* If more than one alternative */
          while (ptr && (str || cstr)) {
            if (ptr->pat == NULL || ptr->pat->rep != NULL) {
                cstr = str = 0;
                break;
            }
            if ((inf->flags & RE_ICASE)) {
                if (!(ret = orec_pat_cse_can(inf, ptr->pat))) {
                  cstr = str = 0;
                  break;
                }
                if (ret == 1)
                  ++lits;
                else if (ret == 2)
                  ++clits;
            }
            else if (ptr->pat->next == NULL) {
                if (ptr->pat->type != Rep_String) {
                  if (ptr->pat->type != Rep_Literal) {
                      str = 0;
                      if (ptr->pat->type != Rep_CaseString) {
                        if (ptr->pat->type != Rep_CaseLiteral)
                            cstr = 0;
                        else
                            ++clits;
                      }
                      else if (strlen((char*)ptr->pat->data.str) >= 255)
                        str = cstr = 0;
                  }
                  else
                      ++lits;
                }
                else if (strlen((char*)ptr->pat->data.str) >= 255)
                  str = cstr = 0;
            }
            else {
                str = cstr = 0;
                break;
            }
            if (++count >= 255)
                str = cstr = 0;
            ptr = ptr->next;
          }

          if (str || cstr) {
            if (inf->flags & RE_ICASE) {
                for (ptr = alt; ptr; ptr = ptr->next) {
                  if (orec_pat_cse(inf, ptr->pat))
                      return (inf->ecode);
                }
                str = 0;
            }
            return (orec_str_list(inf, alt, str, count));
          }
      }
      else if (alt == inf->alt && alt->pat && alt->pat->rep == NULL) {
          /* If the toplevel single alternative */
          switch (alt->pat->type) {
            /*  One of these will always be true for RE_NOSPEC,
             * but can also be optimized for simple patterns */
            case Rep_Literal:
                alt->pat->type = Rep_SearchLiteral;
                break;
            case Rep_CaseLiteral:
                alt->pat->type = Rep_SearchCaseLiteral;
                break;
            case Rep_String:
                alt->pat->type = Rep_SearchString;
                break;
            case Rep_CaseString:
                alt->pat->type = Rep_SearchCaseString;
                break;
            default:
                break;
          }
      }

      while (alt) {
          orec_pat(inf, alt->pat);
          alt = alt->next;
      }
    }

    return (inf->ecode);
}

static int
orec_pat(orec_inf *inf, rec_pat *pat)
{
    rec_pat *next;

    while (pat) {
      switch (pat->type) {
          case Rep_AnyAnyTimes:
            if (pat->next == NULL) {
                rec_grp *grp = inf->grp;

                next = NULL;
                while (grp) {
                  next = grp->parent->next;
                  /* Cannot check if is .*$ as the input
                   * may be a substring */
                  if (next)
                      break;
                  grp = grp->pgrp;
                }
                if (next == NULL) {
                  /*    <re>.*    */
                  pat->type = Rep_AnyEatAnyTimes;
                  grp = inf->grp;
                  while (grp) {
                      --grp->comp;
                      next = grp->parent->next;
                      if (next)
                        break;
                      grp = grp->pgrp;
                  }
                }
                else if (orec_pat_bad_rpt(inf, next))
                  return (inf->ecode);
            }
            else if (orec_pat_bad_rpt(inf, pat->next))
                return (inf->ecode);
            break;
          case Rep_AnyMaybe:
            if (pat->next == NULL) {
                rec_grp *grp = inf->grp;

                next = NULL;
                while (grp) {
                  next = grp->parent->next;
                  if (next)
                      break;
                  grp = grp->pgrp;
                }
                if (next == NULL) {
                  /*    <re>.?    */
                  pat->type = Rep_AnyEatMaybe;
                  grp = inf->grp;
                  while (grp) {
                      --grp->comp;
                      next = grp->parent->next;
                      if (next)
                        break;
                      grp = grp->pgrp;
                  }
                }
                else if (orec_pat_bad_rpt(inf, next))
                  return (inf->ecode);
            }
            else if (orec_pat_bad_rpt(inf, pat->next))
                return (inf->ecode);
            break;
          case Rep_AnyAtLeast:
            if (pat->next == NULL) {
                rec_grp *grp = inf->grp;

                next = NULL;
                while (grp) {
                  next = grp->parent->next;
                  if (next)
                      break;
                  grp = grp->pgrp;
                }
                if (next == NULL) {
                  /*    <re>.+    */
                  pat->type = Rep_AnyEatAtLeast;
                  grp = inf->grp;
                  while (grp) {
                      --grp->comp;
                      next = grp->parent->next;
                      if (next)
                        break;
                      grp = grp->pgrp;
                  }
                }
                else if (orec_pat_bad_rpt(inf, next))
                  return (inf->ecode);
            }
            else if (orec_pat_bad_rpt(inf, pat->next))
                return (inf->ecode);
            break;
          case Rep_Range:
          case Rep_RangeNot:
            orec_pat_rng(inf, pat);
            break;
          case Rep_Group:
            orec_grp(inf, pat->data.grp);
            break;
          default:
            break;
      }
      pat = pat->next;
    }

    return (inf->ecode);
}

static int
orec_pat_bad_rpt(orec_inf *inf, rec_pat *pat)
{
    switch (pat->type) {
      /* Not really an error, but aren't supported by the library.
       * Includes:  .*.*, .+<re>?  .*<re>*, (.*)(<re>*), etc.
       */

      /* Not a repetition, but mathes anything... */
      case Rep_Any:

      /* Zero length matches */
      case Rep_Eol:
          if (!(inf->flags & RE_NEWLINE))
            break;
      case Rep_Bol:
      case Rep_Bow:
      case Rep_Eow:

      /* Repetitions */
      case Rep_AnyAnyTimes:
      case Rep_AnyMaybe:
      case Rep_AnyAtLeast:
          inf->ecode = RE_BADRPT;
          break;

      /* Check if the first group element is a complex pattern */
      case Rep_Group:
          if (pat->rep == NULL) {
            if (pat->data.grp->alt) {
                for (pat = pat->data.grp->alt->pat; pat; pat = pat->next) {
                  if (orec_pat_bad_rpt(inf, pat))
                      break;
                }
            }
            break;
          }
          /*FALLTHROUGH*/
      default:
          if (pat->rep)
            inf->ecode = RE_BADRPT;
          break;
    }

    if (!inf->ecode && pat && pat->next)
      orec_pat_bad_forward_rpt(inf, pat->next);

    return (inf->ecode);
}

static int
orec_pat_bad_forward_rpt(orec_inf *inf, rec_pat *pat)
{
    if (pat->rep) {
      switch (pat->rep->type) {
          case Rer_MinMax:
            if (pat->rep->mine > 0)
                break;
          case Rer_AnyTimes:
          case Rer_Maybe:
          case Rer_Max:
            inf->ecode = RE_BADRPT;
          default:
            break;
      }
    }
    else if (pat->type == Rep_Group &&
           pat->data.grp->alt &&
           pat->data.grp->alt->pat)
      orec_pat_bad_forward_rpt(inf, pat->data.grp->alt->pat);

    return (inf->ecode);
}

static int
orec_grp(orec_inf *inf, rec_grp *grp)
{
    rec_grp *prev = inf->grp;

    inf->grp = grp;
    orec_alt(inf, grp->alt);
    /* Could also just say: inf->grp = grp->gparent */
    inf->grp = prev;

    return (inf->ecode);
}

static int
orec_pat_rng(orec_inf *inf, rec_pat *pat)
{
    int i, j[2], count;
    rec_pat_t type = pat->type;
    unsigned char *range = pat->data.rng->range;

    for (i = count = j[0] = j[1] = 0; i < 256; i++) {
      if (range[i]) {
          if (count == 2) {
            ++count;
            break;
          }
          j[count++] = i;
      }
    }

    if (count == 1 ||
      (count == 2 &&
       ((islower(j[0]) && toupper(j[0]) == j[1]) ||
        (isupper(j[0]) && tolower(j[0]) == j[1])))) {
      free(pat->data.rng);
      if (count == 1) {
          pat->data.chr = j[0];
          pat->type = type == Rep_Range ? Rep_Literal : Rep_LiteralNot;
      }
      else {
          pat->data.cse.upper = j[0];
          pat->data.cse.lower = j[1];
          pat->type = type == Rep_Range ? Rep_CaseLiteral : Rep_CaseLiteralNot;
      }
    }
    else {
      if (memcmp(re__alnum, range, 256) == 0)
          type = type == Rep_Range ? Rep_Alnum : Rep_AlnumNot;
      else if (memcmp(re__odigit, range, 256) == 0)
          type = type == Rep_Range ? Rep_Odigit : Rep_OdigitNot;
      else if (memcmp(re__ddigit, range, 256) == 0)
          type = type == Rep_Range ? Rep_Digit : Rep_DigitNot;
      else if (memcmp(re__xdigit, range, 256) == 0)
          type = type == Rep_Range ? Rep_Xdigit : Rep_XdigitNot;
      else if (memcmp(re__control, range, 256) == 0)
          type = type == Rep_Range ? Rep_Control : Rep_ControlNot;

      if (type != pat->type) {
          free(pat->data.rng);
          pat->type = type;
      }
    }

    return (inf->ecode);
}

/*  Join patterns if required, will only fail on memory allocation failure:
 */
static int
orec_pat_cse(orec_inf *inf, rec_pat *pat)
{
    rec_pat_t type;
    int i, len, length;
    rec_pat *ptr, *next;
    unsigned char *str, *tofree;

    if (pat->next == NULL && pat->type == Rep_CaseString)
      return (inf->ecode);

    type = Rep_CaseString;

    /* First calculate how many bytes will be required */
    for (ptr = pat, length = 1; ptr; ptr = ptr->next) {
      switch (ptr->type) {
          case Rep_Literal:
            length += 2;
            break;
          case Rep_String:
            length += strlen((char*)ptr->data.str) << 1;
            break;
          case Rep_CaseLiteral:
            length += 2;
            break;
          case Rep_CaseString:
            length += strlen((char*)ptr->data.str);
            break;
          default:
            break;
      }
    }

    if ((str = malloc(length)) == NULL)
      return (inf->ecode = RE_ESPACE);

    for (ptr = pat, length = 0; ptr; ptr = next) {
      tofree = NULL;
      next = ptr->next;
      switch (ptr->type) {
          case Rep_Literal:
            str[length++] = ptr->data.chr;
            str[length++] = ptr->data.chr;
            break;
          case Rep_String:
            tofree = ptr->data.str;
            len = strlen((char*)tofree);
            for (i = 0; i < len; i++) {
                str[length++] = tofree[i];
                str[length++] = tofree[i];
            }
            break;
          case Rep_CaseLiteral:
            str[length++] = ptr->data.cse.lower;
            str[length++] = ptr->data.cse.upper;
            break;
          case Rep_CaseString:
            tofree = ptr->data.str;
            len = strlen((char*)tofree);
            memcpy(str + length, tofree, len);
            length += len;
            break;
          default:
            break;
      }
      if (tofree)
          free(tofree);
      if (ptr != pat)
          free(ptr);
    }
    str[length] = '\0';

    pat->type = type;
    pat->data.str = str;
    pat->next = NULL;

    return (inf->ecode);
}

/*  Return 0 if the patterns in the list cannot be merged, 1 if will
 * be a simple string, 2 if a case string.
 *  This is useful when building an alternative list that is composed
 * only of strings, but the regex is case insensitive, in wich case
 * the first pass may have splited some patterns, but if it is a member
 * of an alternatives list, the cost of using a string list is smaller */
static int
orec_pat_cse_can(orec_inf *inf, rec_pat *pat)
{
    int ret;

    if (pat == NULL)
      return (0);

    for (ret = 1; pat; pat = pat->next) {
      if (pat->rep)
          return (0);
      switch (pat->type) {
          case Rep_Literal:
          case Rep_String:
            break;
          case Rep_CaseLiteral:
          case Rep_CaseString:
            ret = 2;
            break;
          default:
            return (0);
      }
    }

    return (ret);
}


/*  XXX If everything is a (case) byte, the pattern should be
 * [abcde] instead of a|b|c|d|e (or [aAbBcCdDeE] instead of aA|bB|cC|dD|eE)
 * as a string list works fine, but as a character range
 * should be faster, and maybe could be converted here. But not
 * very important, if performance is required, it should have already
 * been done in the pattern.
 */
static int
orec_str_list(orec_inf *inf, rec_alt *alt, int str, int count)
{
    rec_stl *stl;
    rec_pat *pat;
    rec_alt *ptr, *next;
    int i, j, tlen, len, is;

    if ((stl = calloc(1, sizeof(rec_stl))) == NULL)
      return (inf->ecode = RE_ESPACE);

    if ((stl->lens = malloc(sizeof(unsigned char) * count)) == NULL) {
      free(stl);
      return (inf->ecode = RE_ESPACE);
    }

    if ((stl->strs = malloc(sizeof(char*) * count)) == NULL) {
      free(stl->lens);
      free(stl);
      return (inf->ecode = RE_ESPACE);
    }

    if ((pat = calloc(1, sizeof(rec_pat))) == NULL) {
      free(stl->strs);
      free(stl->lens);
      free(stl);
      return (inf->ecode = RE_ESPACE);
    }

    pat->data.stl = stl;
    pat->type = Rep_StringList;
    stl->type = str ? Resl_StringList : Resl_CaseStringList;
    for (i = tlen = 0, ptr = alt; i < count; i++) {
      next = ptr->next;
      switch (ptr->pat->type) {
          case Rep_Literal:
            is = len = 1;
            break;
          case Rep_CaseLiteral:
            is = len = 2;
            break;
          default:
            is = 0;
            len = strlen((char*)ptr->pat->data.str);
            break;
      }
      tlen += len;
      stl->lens[i] = len;
      if (!is) {
          if (len > 2)
            stl->strs[i] = ptr->pat->data.str;
          else {
            if (len == 1)
                stl->strs[i] = (void*)(long)(ptr->pat->data.str[0]);
            else
                stl->strs[i] = (void*)(long)
                           (ptr->pat->data.str[0] |
                            ((int)ptr->pat->data.str[1] << 8));
            free(ptr->pat->data.str);
          }
      }
      else {
          if (is == 1)
            stl->strs[i] = (void*)(long)ptr->pat->data.chr;
          else
            stl->strs[i] = (void*)(long)
                         (ptr->pat->data.cse.lower |
                        (ptr->pat->data.cse.upper << 8));
      }
      free(ptr->pat);
      if (i)
          free(ptr);
      ptr = next;
    }
    stl->tlen = tlen;
    stl->nstrs = count;

    alt->pat = pat;
    alt->next = NULL;

    {
      int li, lj;
      unsigned char ci, cj, *str;

      /*  Don't need a stable sort, there shouldn't be duplicated strings,
       * but don't check for it either. Only need to make sure that all
       * strings that start with the same byte are together */
      for (i = 0; i < count; i++) {
          li = stl->lens[i];
          ci = li > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff;
          for (j = i + 1; j < count; j++) {
            lj = stl->lens[j];
            cj = lj > 2 ? stl->strs[j][0] : (long)stl->strs[j] & 0xff;
            if ((count >= LARGE_STL_COUNT && cj < ci) ||
                (cj == ci && lj > li)) {
                /* If both strings start with the same byte,
                 * put the longer first */
                str = stl->strs[j];
                stl->strs[j] = stl->strs[i];
                stl->strs[i] = str;
                stl->lens[j] = li;
                stl->lens[i] = lj;
                li ^= lj; lj ^= li; li ^= lj;
                ci ^= cj; cj ^= ci; ci ^= cj;
            }
          }
      }
    }

    return (inf->ecode);
}

Generated by  Doxygen 1.6.0   Back to index