Logo Search packages:      
Sourcecode: pan version File versions  Download package

filter-phrase.c

/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
 * Pan - A Newsreader for Gtk+
 * Copyright (C) 2002  Charles Kerr <charles@rebelbase.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <config.h>

#include <ctype.h>
#include <string.h>

#include <pcre.h>

#include <glib.h>

#include <pan/base/debug.h>
#include <pan/base/log.h>
#include <pan/base/pan-glib-extensions.h>
#include <pan/base/pan-i18n.h>

#include <pan/filters/filter-phrase.h>

/* Class identifier string handed to filter_constructor() when a FilterPhrase is built. */
const char * FILTER_PHRASE_CLASS_ID = "PanObject::Filter::FilterPhrase";

/************
*************  
************/

/*
 * is_metacharacter(A) is nonzero when byte A is one of the regular-expression
 * metacharacters  . ^ $ * + ? { [ | ( ) \
 *
 * The lookup table needs one entry for every possible unsigned char value,
 * i.e. UCHAR_MAX + 1 entries, so that byte 0xFF is a valid index too.
 */
#define is_metacharacter(A) (_metacharacters[(guchar)(A)])
#if 0
/* Generator for the precomputed table below: enable this branch and call
 * build_table() to print a fresh initializer. */
static char _metacharacters[UCHAR_MAX + 1];
#define PRINT_TABLE(A) \
      printf ("static const char " #A "[UCHAR_MAX + 1] = {"); \
      for (i=0; i<=UCHAR_MAX; ++i) { \
            if (!(i%40)) \
                  printf ("\n\t"); \
            printf ("%d,", A[i]); \
      } \
      printf ("};\n\n");
static void
build_table (void)
{
      int i;

      /* iterate with an int: an unsigned char counter could never exceed UCHAR_MAX */
      for (i=0; i<=UCHAR_MAX; ++i) {
            const int ch = i;
            _metacharacters[i] = ch=='.' || ch=='^' || ch=='$' || ch=='*' ||
                                 ch=='+' || ch=='?' || ch=='{' || ch=='[' ||
                                 ch=='|' || ch=='(' || ch==')' || ch=='\\';
      }

      PRINT_TABLE(_metacharacters)
}
#else
static const char _metacharacters[UCHAR_MAX + 1] = {
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
      1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
#endif


/************
*************  PROTECTED
************/

/****
*****  PcreInfo loosely based on Anjuta's search-replace.c
****/

/* Bundle of everything needed to run one compiled PCRE pattern.
 * Instances are created by pcre_info_new() and released by pcre_info_free(). */
struct _PcreInfo
{
        int ovector_bufsize;   /* number of ints allocated in ovector */
        int * ovector;         /* output vector handed to pcre_exec() */
        pcre * re;             /* pattern returned by pcre_compile() */
        pcre_extra * extra;    /* result of pcre_study(); may be NULL */
};

/* Release a PcreInfo and everything it owns.  A NULL argument is a no-op. */
static void
pcre_info_free (PcreInfo *re)
{
      if (re == NULL)
            return;

      if (re->re != NULL)
            pcre_free (re->re);
      if (re->extra != NULL)
            pcre_free (re->extra);

      g_free (re->ovector);   /* g_free() ignores NULL */
      g_free (re);
}

/*
 * Compile and study `pattern`, returning a newly allocated PcreInfo that the
 * caller releases with pcre_info_free().
 *
 * On failure the problem is logged, NULL is returned, and -- when `ret_err`
 * is non-NULL -- *ret_err receives a newly allocated copy of the message.
 * On success *ret_err (when non-NULL) is set to NULL.
 */
static PcreInfo *
pcre_info_new (const char * pattern, gboolean case_sensitive, char ** ret_err)
{
      static gboolean utf8_probe_done = FALSE;
      static int base_options = 0;
      PcreInfo * info;
      const char * errmsg;
      char * report = NULL;
      int errpos;
      int options;
      int n_captures = 0;

      g_return_val_if_fail (pattern!=NULL, NULL);

      info = g_new0 (PcreInfo, 1);

      /* ask libpcre once whether it was built with UTF-8 support */
      if (!utf8_probe_done)
      {
            int has_utf8 = 0;
            pcre_config (PCRE_CONFIG_UTF8, &has_utf8);
            if (has_utf8)
                  base_options |= PCRE_UTF8;
            else
                  log_add (LOG_ERROR, _("Your system's copy of libpcre doesn't support UTF-8.  UTF-8 regular expressions may fail."));

            utf8_probe_done = TRUE;
      }

      options = case_sensitive ? base_options : (base_options | PCRE_CASELESS);

      /* compile, then study; either step may produce an error report */
      info->re = pcre_compile (pattern, options, &errmsg, &errpos, NULL);
      if (info->re == NULL) {
            report = g_strdup_printf (_("Can't use regular expression \"%s\": %s at position %d"), pattern, errmsg, errpos);
      } else {
            info->extra = pcre_study (info->re, 0, &errmsg);
            if (errmsg != NULL)
                  report = g_strdup_printf (_("Can't use regular expression \"%s\": %s"), pattern, errmsg);
      }

      if (report != NULL) {
            if (ret_err != NULL)
                  *ret_err = g_strdup (report);
            log_add (LOG_ERROR, report);
            pcre_info_free (info);
            g_free (report);
            return NULL;
      }

      /* size the output vector: one slot per capture plus one, times three for pcre_exec's tuples */
      pcre_fullinfo (info->re, info->extra, PCRE_INFO_CAPTURECOUNT, &n_captures);
      info->ovector_bufsize = 3 * (n_captures + 1);
      info->ovector = g_new (int, info->ovector_bufsize);

      if (ret_err != NULL)
            *ret_err = NULL;

      return info;
}

/*
 * Run the filter's regular expression against text_str.
 * The pattern is compiled on first use; if that compile fails the filter is
 * marked REGEX_ERR and every call reports PCRE_ERROR_NOMATCH.
 * Otherwise returns whatever pcre_exec() returns.
 */
static int
my_regexec (FilterPhrase * filter, const char * text_str, int text_len)
{
      PcreInfo * info;

      /* lazy compile */
      if (filter->pcre_state == REGEX_NEED_COMPILE)
      {
            filter->pcre_info = pcre_info_new (filter->private_key, filter->case_sensitive, NULL);
            filter->pcre_state = filter->pcre_info != NULL ? REGEX_COMPILED : REGEX_ERR;
      }

      if (filter->pcre_state != REGEX_COMPILED)
            return PCRE_ERROR_NOMATCH;

      info = filter->pcre_info;
      return pcre_exec (info->re,
                        info->extra,
                        text_str,
                        text_len,
                        0,
                        PCRE_NOTEMPTY,
                        info->ovector,
                        info->ovector_bufsize);
}

/****
*****
****/

/* Create a new FilterPhrase carrying the same base-class state and phrase
 * settings (public match type, key type, public key, case sensitivity) as f_old. */
static Filter *
filter_phrase_dup (const Filter * f_old)
{
      Filter * copy = filter_phrase_new ();
      const FilterPhrase * src = FILTER_PHRASE(f_old);

      filter_class_dup (f_old, copy);
      filter_phrase_set (FILTER_PHRASE(copy),
                         src->public_match_type,
                         src->key_type,
                         src->public_key,
                         src->case_sensitive);

      return copy;
}

static char*
filter_phrase_to_string (const Filter * filter)
{
      const char * key_str = "";
      const char * match_str = "";
      const gboolean negate = filter->negate;
      const FilterPhrase * phrase = FILTER_PHRASE(filter);

      switch (phrase->key_type) {
            case PHRASE_KEY_SUBJECT:     key_str = _("Subject"); break;
            case PHRASE_KEY_AUTHOR:      key_str = _("Author"); break;
            case PHRASE_KEY_MESSAGE_ID:  key_str = _("Message-ID"); break;
            case PHRASE_KEY_REFERENCES:  key_str = _("References"); break;
            case PHRASE_KEY_XREF:        key_str = _("Xref"); break;
            default:                     g_warning ("Unrecognized key_type %d", phrase->key_type);
      }

      if (!negate) {
            switch (phrase->public_match_type) {
                  case PHRASE_MATCH_CONTAINS:     match_str = _("contains"); break;
                  case PHRASE_MATCH_IS:           match_str = _("is"); break;
                  case PHRASE_MATCH_STARTS_WITH:  match_str = _("starts with"); break;
                  case PHRASE_MATCH_ENDS_WITH:    match_str = _("ends with"); break;
                  case PHRASE_MATCH_REGEX:        match_str = _("matches regular expression"); break;
                    default:                        g_warning ("Unrecognized match_type %d", phrase->public_match_type);
            }
      } else {
            switch (phrase->public_match_type) {
                  case PHRASE_MATCH_CONTAINS:     match_str = _("does not contain"); break;
                  case PHRASE_MATCH_IS:           match_str = _("is not"); break;
                  case PHRASE_MATCH_STARTS_WITH:  match_str = _("does not start with"); break;
                  case PHRASE_MATCH_ENDS_WITH:    match_str = _("does not end with"); break;
                  case PHRASE_MATCH_REGEX:        match_str = _("does not match regular expression"); break;
                    default:                        g_warning ("Unrecognized match_type %d", phrase->public_match_type);
            }
      }

      return g_strdup_printf ("%s %s \"%s\"", key_str, match_str, phrase->public_key);
}

/**
 * Case-insensitive Boyer-Moore-Horspool-Sunday search.
 *
 * text / text_len: the haystack.  The byte at text[text_len] is read to pick
 *                  the next shift, so it must be addressable (e.g. a NUL
 *                  terminator).
 * pat / pat_len:   the needle.  Bytes after the first are compared against
 *                  tolower(text byte), so the needle must already be lowercase.
 * skip:            shift table indexed by lowercased byte value.
 *
 * Returns position of match, or -1 if no match (including an empty needle
 * or a haystack shorter than the needle).
 */
static int
bmhs_isearch (const unsigned char * text, int text_len,
              const unsigned char * pat, int pat_len,
              const char * skip)

{
      const unsigned char * t = text;
      const unsigned char * text_end;
      const unsigned char * pat_end;
      unsigned char first_uc;
      unsigned char first_lc;

      /* nothing can match; also avoids forming a pointer before `text' */
      if (pat_len <= 0 || text_len < pat_len)
            return -1;

      first_uc = toupper(*pat);
      first_lc = tolower(*pat);
      text_end = text + (text_len - pat_len) + 1;
      pat_end = pat + pat_len;

      while (t < text_end)
      {
            int shift;

            if (*t == first_uc || *t == first_lc)
            {
                  /* first character matches, so execute match loop in fwd direction */
                  const unsigned char * p = pat;
                  const unsigned char * q = t;
                  while (++p < pat_end && *p == tolower(*++q))
                        ;

                  if (p == pat_end)
                        return (int)(t - text);
            }

            /* The table is keyed by lowercase bytes, so fold the lookahead
             * byte on every lookup.  It is stored as plain char, so read it as
             * unsigned, and always advance by at least one position: a shift
             * smaller than the ideal one is slower but never skips a match. */
            shift = (unsigned char) skip[tolower(t[pat_len])];
            t += shift > 0 ? shift : 1;
      }

      return -1;
}

/**
 * Case-sensitive Boyer-Moore-Horspool-Sunday search.
 *
 * text / text_len: the haystack.  The byte at text[text_len] is read to pick
 *                  the next shift, so it must be addressable (e.g. a NUL
 *                  terminator).
 * pat / pat_len:   the needle.
 * skip:            shift table indexed by byte value.
 *
 * Returns position of match, or -1 if no match (including an empty needle
 * or a haystack shorter than the needle).
 */
static int
bmhs_search (const unsigned char * text, int text_len,
             const unsigned char * pat, int pat_len,
             const char * skip)

{
      const unsigned char * t = text;
      const unsigned char * text_end;
      const unsigned char * pat_end;
      unsigned char first;

      /* nothing can match; also avoids forming a pointer before `text' */
      if (pat_len <= 0 || text_len < pat_len)
            return -1;

      first = *pat;
      text_end = text + (text_len - pat_len) + 1;
      pat_end = pat + pat_len;

      while (t < text_end)
      {
            int shift;

            if (*t == first)
            {
                  /* first character matches, so execute match loop in fwd direction */
                  const unsigned char * p = pat;
                  const unsigned char * q = t;
                  while (++p < pat_end && *p == *++q)
                        ;

                  if (p == pat_end)
                        return (int)(t - text);
            }

            /* The table is stored as plain char, so read it as unsigned, and
             * always advance by at least one position: a shift smaller than
             * the ideal one is slower but never skips a match. */
            shift = (unsigned char) skip[t[pat_len]];
            t += shift > 0 ? shift : 1;
      }

      return -1;
}

/**
 * Test one piece of text against the phrase.
 *
 * phrase:   the filter whose private key and (possibly downgraded) match type are used
 * text:     the string to test
 * text_len: length of text in bytes
 *
 * Returns TRUE on a match.  Empty or NULL text, a phrase with no key set,
 * and an unrecognized match type all report FALSE.
 */
gboolean
filter_phrase_does_match   (FilterPhrase    * phrase,
                            const char      * text,
                            int               text_len)
{
      gboolean retval = FALSE;
      const char * pat;
      int pat_len;

      g_return_val_if_fail (phrase!=NULL, FALSE);

      if (!is_nonempty_string (text))
            return FALSE;

      /* no key has been set yet, so there is nothing to compare against */
      pat = phrase->private_key;
      pat_len = phrase->private_key_len;
      if (pat == NULL)
            return FALSE;

      switch (phrase->match_type)
      {
            case PHRASE_MATCH_REGEX:
                  retval = my_regexec (phrase, text, text_len)>0;
                  break;

            case PHRASE_MATCH_ENDS_WITH:
                  if (text_len < pat_len)
                        break;
                  /* narrow the text to its last pat_len bytes, then compare as "is" */
                  text += text_len - pat_len;
                  text_len = pat_len;
                  /* fall through to "is" */

            case PHRASE_MATCH_IS:
                  if (phrase->case_sensitive)
                        retval = !pan_strcmp (pat, text);
                  else
                        retval = !g_ascii_strcasecmp (pat, text);
                  break;

            case PHRASE_MATCH_STARTS_WITH:
                  if (phrase->case_sensitive)
                        retval = !strncmp (text, pat, pat_len);
                  else
                        retval = !g_ascii_strncasecmp (text, pat, pat_len);
                  break;

            case PHRASE_MATCH_CONTAINS:
                  if (phrase->case_sensitive)
                        retval = bmhs_search ((const guchar*)text, text_len, (const guchar*)pat, pat_len, phrase->bmhs_skip) != -1;
                  else
                        retval = bmhs_isearch ((const guchar*)text, text_len, (const guchar*)pat, pat_len, phrase->bmhs_skip) != -1;
                  break;

            default:
                  /* unknown match type: report no match instead of returning garbage */
                  retval = FALSE;
                  break;
      }

      return retval;
}

/*
 * Filter callback: test each of the article_qty articles against the phrase
 * and store the result for articles[i] in does_match[i].
 */
static void
filter_phrase_test_articles (Filter          * filter,
                             const Article  ** articles,
                             int               article_qty,
                             gboolean        * does_match)
{
      int i;
      FilterPhrase * phrase = FILTER_PHRASE(filter);

      for (i=0; i<article_qty; ++i)
      {
            char buf[512];   /* scratch space for article_get_author_str() */
            const Article * a = articles[i];
            const char * text = NULL;
            int text_len = 0;

            /* tweak: some newsreaders trim out the leading message-ids from long references headers,
             * which breaks threading and References filtering.  The common use for looking at References
             * is for identifying threads, i.e., watch/ignore thread, so for the special case of
             * "References STARTS_WITH" we'll walk up Pan's own threading before applying the test. */
            if (phrase->key_type==PHRASE_KEY_REFERENCES && phrase->match_type == PHRASE_MATCH_STARTS_WITH)
                  while (a->parent!=NULL && pstring_is_set(&a->parent->references))
                        a = a->parent;

            switch (phrase->key_type)
            {
                  case PHRASE_KEY_SUBJECT:
                        text = a->subject.str;
                        text_len = a->subject.len;
                        break;

                  case PHRASE_KEY_AUTHOR:
                        text = article_get_author_str (a, buf, sizeof(buf));
                        text_len = text!=NULL ? (int) strlen (text) : 0;
                        break;

                  case PHRASE_KEY_MESSAGE_ID:
                        text = a->message_id.str;
                        text_len = a->message_id.len;
                        break;

                  case PHRASE_KEY_REFERENCES:
                        text = a->references.str;
                        text_len = a->references.len;
                        break;

                  case PHRASE_KEY_XREF:
                        text = a->xref.str;
                        text_len = a->xref.len;
                        break;

                  default:
                        /* unknown key type: text stays NULL and text_len stays 0 */
                        break;
            }

            does_match[i] = filter_phrase_does_match (phrase, text, text_len);
      }
}

/* PanObject destructor: clear the phrase by setting a NULL key, which frees
 * the key strings and any compiled regex, then chain to the Filter destructor. */
static void
filter_phrase_destructor (PanObject * o)
{
      FilterPhrase * phrase = FILTER_PHRASE(o);

      filter_phrase_set (phrase, PHRASE_MATCH_IS, PHRASE_KEY_SUBJECT, NULL, FALSE);

      filter_destructor (o);
}

/*
 * Initialize a FilterPhrase: register the class callbacks with the Filter
 * base class, then reset every phrase field to its default
 * (case-insensitive "Subject is" with no key, regex not yet compiled).
 */
static void
filter_phrase_constructor (FilterPhrase * f)
{
      debug_enter ("filter_phrase_constructor");

      filter_constructor ((Filter*)f,
                          filter_phrase_destructor,
                          filter_phrase_test_articles,
                          filter_phrase_to_string,
                          filter_phrase_dup,
                          FILTER_PHRASE_CLASS_ID);

      f->public_match_type = PHRASE_MATCH_IS;
      f->match_type = PHRASE_MATCH_IS;
      f->key_type = PHRASE_KEY_SUBJECT;
      f->private_key_len = 0;
      f->private_key = NULL;
      f->public_key = NULL;
      f->case_sensitive = FALSE;
      f->pcre_info = NULL;   /* do not depend on the caller having zeroed the struct */
      f->pcre_state = REGEX_NEED_COMPILE;

      debug_exit ("filter_phrase_constructor");
}

static char*
regexp_unescape (const char * in)
{
      char * retval = g_new (char, strlen(in)+1);
      char * out = retval;
      if (*in == '^')
            ++in;
      while (*in) {
            if (in[0]=='\\' && is_metacharacter(in[1]))
                  ++in;
            *out++ = *in++;
      }
      if ((out-retval>1) && in[-1]=='$' && in[-2]!='\\')
            --out;
      *out = '\0';
      return retval;
}

/**
 * Try to downgrade the more-expensive regexes to a cheaper type.
 *
 * Non-regex types are returned unchanged.  A regex whose only unescaped
 * metacharacters are a leading '^' and/or a trailing '$' becomes
 * IS, STARTS_WITH, ENDS_WITH or CONTAINS; anything else stays a regex.
 */
static PhraseMatchType
get_real_match_type (const char * key, PhraseMatchType type)
{
      gboolean anchored_head = FALSE;
      gboolean anchored_tail = FALSE;
      const char * cur;

      /* sanity clause */
      g_return_val_if_fail (is_nonempty_string(key), type);

      /* only regexes are candidates for downgrading */
      if (type != PHRASE_MATCH_REGEX)
            return type;

      /* walk the key looking for anything that really needs a regex engine */
      for (cur=key; *cur!='\0'; ++cur)
      {
            if (*cur=='\\' && is_metacharacter(cur[1])) {
                  ++cur;   /* escaped metacharacter: skip both bytes */
                  continue;
            }
            if (*cur=='^' && cur==key) {
                  anchored_head = TRUE;
                  continue;
            }
            if (*cur=='$' && cur[1]=='\0') {
                  anchored_tail = TRUE;
                  continue;
            }
            if (is_metacharacter(*cur))
                  return PHRASE_MATCH_REGEX;
      }

      if (anchored_head)
            return anchored_tail ? PHRASE_MATCH_IS : PHRASE_MATCH_STARTS_WITH;

      return anchored_tail ? PHRASE_MATCH_ENDS_WITH : PHRASE_MATCH_CONTAINS;
}

/************
*************  PUBLIC
************/

/* Allocate a zero-filled FilterPhrase, run its constructor, and return it as a Filter. */
Filter*
filter_phrase_new (void)
{
      FilterPhrase * phrase;

      debug_enter ("filter_phrase_new");
      phrase = g_new0 (FilterPhrase, 1);
      filter_phrase_constructor (phrase);
      debug_exit ("filter_phrase_new");

      return FILTER(phrase);
}

/*
 * Check whether `pattern` compiles as a case-insensitive regular expression.
 * Returns NULL when it does, otherwise a newly allocated error message.
 */
char*
filter_phrase_validate_pattern   (const char * pattern)
{
      char * errmsg = NULL;
      PcreInfo * compiled = pcre_info_new (pattern, FALSE, &errmsg);

      /* the compiled pattern was only needed for the check */
      if (compiled != NULL)
            pcre_info_free (compiled);

      return errmsg;
}

/*
 * Replace the filter's phrase.
 *
 * filter:         the filter to change
 * match_type:     the match type as the user sees it; a regex may be
 *                 downgraded internally by get_real_match_type()
 * key_type:       which article header the phrase is tested against
 * key:            the phrase, or NULL to clear the filter
 * case_sensitive: whether matching distinguishes case
 *
 * Any previous key strings and compiled regex are released first.
 */
void
filter_phrase_set (FilterPhrase        * filter,
                   PhraseMatchType       match_type,
                   PhraseKeyType         key_type,
                   const char          * key,
                   gboolean              case_sensitive)
{
      /* sanity clause -- checked before debug_enter so enter/exit stay paired */
      g_return_if_fail (filter!=NULL);

      debug_enter ("filter_phrase_set");

      /* free the previous phrase */
      replace_gstr (&filter->public_key, NULL);
      replace_gstr (&filter->private_key, NULL);
      if (filter->pcre_state == REGEX_COMPILED)
            pcre_info_free (filter->pcre_info);
      /* Always return to "needs compile": otherwise a filter whose previous
       * pattern failed to compile (REGEX_ERR) would never try the new key. */
      filter->pcre_info = NULL;
      filter->pcre_state = REGEX_NEED_COMPILE;

      /* repopulate the filter */
      filter->public_match_type = match_type;
      filter->match_type = key==NULL ? match_type : get_real_match_type (key, match_type);
      filter->key_type = key_type;
      filter->case_sensitive = case_sensitive;
      filter->private_key_len = 0;
      if (key != NULL)
      {
            /* build the key strings */
            filter->public_key = g_strdup (key);
            filter->private_key = filter->match_type == filter->public_match_type
                  ? g_strdup (key)
                  : regexp_unescape (key);

            /* Lowercase plain-text keys for the case-insensitive search.  A
             * real regex must be left alone: lowercasing would turn escapes
             * such as \S or \D into \s or \d, and pcre_info_new() already
             * compiles it with PCRE_CASELESS. */
            if (!filter->case_sensitive && filter->match_type != PHRASE_MATCH_REGEX) {
                  char * pch = g_utf8_strdown (filter->private_key, -1);
                  if (pch) {
                        replace_gstr (&filter->private_key, pch);
                  }
            }
            filter->private_key_len = strlen (filter->private_key);

            /* Boyer-Moore-Horspool-Sunday */
            {
                  int i;
                  char * skip = filter->bmhs_skip;
                  const char * pat = filter->private_key;
                  const int len = filter->private_key_len;

                  /* The table holds plain chars, so clamp each shift to
                   * CHAR_MAX.  A shift smaller than the ideal one only slows
                   * the search down; an overflowed (negative) one would move
                   * the search backwards.
                   * NOTE(review): only entries 0..UCHAR_MAX-1 are filled here,
                   * while the search functions index the table with arbitrary
                   * byte values -- confirm bmhs_skip's declared size. */
                  for (i=0; i<UCHAR_MAX; ++i)
                        skip[i] = MIN (len + 1, CHAR_MAX);
                  for (i=0; i<len; i++)
                        skip[(guchar)(pat[i])] = MIN (len - i, CHAR_MAX);
            }
      }

      debug_exit ("filter_phrase_set");
}

/**
***
**/

/* Append `in` to `out`, putting a backslash in front of every regex
 * metacharacter.  A NULL `in` appends nothing. */
static void
quote_regexp (GString * out, const char * in)
{
      if (in == NULL)
            return;

      while (*in != '\0') {
            const char ch = *in++;
            if (is_metacharacter (ch))
                  g_string_append_c (out, '\\');
            g_string_append_c (out, ch);
      }
}

/*
 * Express "in matched as match_type" as a regular expression.
 * A regex is copied as-is; other text has its metacharacters quoted.  The
 * result is stripped with pan_g_string_strstrip() and, if non-empty, gets a
 * '^' and/or '$' anchor for IS / STARTS_WITH / ENDS_WITH.
 *
 * Returns a newly allocated string.
 */
char*
filter_phrase_create_regex   (const char           * in,
                              PhraseMatchType        match_type)
{
      const gboolean anchor_head = match_type == PHRASE_MATCH_IS || match_type == PHRASE_MATCH_STARTS_WITH;
      const gboolean anchor_tail = match_type == PHRASE_MATCH_IS || match_type == PHRASE_MATCH_ENDS_WITH;
      GString * regex = g_string_new (NULL);

      if (match_type != PHRASE_MATCH_REGEX)
            quote_regexp (regex, in);
      else
            g_string_assign (regex, in);

      pan_g_string_strstrip (regex);

      /* an empty pattern gets no anchors */
      if (regex->len)
      {
            if (anchor_head)
                  g_string_insert_c (regex, 0, '^');

            if (anchor_tail)
                  g_string_append_c (regex, '$');
      }

      return g_string_free (regex, FALSE);
}

Generated by  Doxygen 1.6.0   Back to index