Logo Search packages:      
Sourcecode: pan version File versions  Download package

article-thread.c

/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
 * Pan - A Newsreader for Gtk+
 * Copyright (C) 2002  Charles Kerr <charles@rebelbase.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <config.h>

#include <glib/gmem.h>

#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

#include <pan/base/acache.h>
#include <pan/base/article.h>
#include <pan/base/article-thread.h>
#include <pan/base/base-prefs.h>
#include <pan/base/debug.h>
#include <pan/base/pan-i18n.h>
#include <pan/base/pan-glib-extensions.h>

/**
 * Skip the "Re: " part of a subject header, if any
 * @param subject
 * @return the non-"Re:" portion of the subject header
 */
#define skip_reply_leader(a) \
      (((a!=NULL) && \
        (a[0]=='R' || a[0]=='r') && \
        (a[1]=='E' || a[1]=='e') && \
        (a[2]==':') && \
        (a[3]==' ')) ? a+4 : a)

/**
 * Normalized Article
 */
00052 typedef struct
{
      gboolean is_reply;
      int subject_len;
      char * subject;
      Article * a;
}
Norm;

/**
 * Normalizing a subject header involves:
 *
 * 1. tearing out the numerator from multipart indicators
 *    (i.e., remove "21" from (21/42))
 *    for threading.
 * 2. convert it to lowercase so that, when sorting, we can
 *    have case-insensitive sorting without having to use
 *    a slower case-insensitive compare function.
 * 3. strip out all the leading noise that breaks sorting.
 *
 * When we're threading articles, it's much faster to
 * normalize the * subjects at the outset instead of
 * normalizing them for each comparison.
 */
size_t
normalize_subject (char            * buf,
                   const Article   * a,
                   StripFlags        strip)
{
      static gboolean _inited = FALSE;
      static gboolean _keep_chars[UCHAR_MAX+1];
      register const guchar * in = (guchar*) skip_reply_leader (a->subject.str);
      register char * out = buf;
      const gboolean multipart = a->parts != 0;
      const gboolean strip_case = strip & STRIP_CASE;
      const gboolean strip_numerator = strip & STRIP_MULTIPART_NUMERATOR;
      const gboolean strip_multipart = strip & STRIP_MULTIPART;

      /* populate the _keep_chars array */
      if (!_inited) {
            int i;
            for (i=0; i<=UCHAR_MAX; ++i) {
                  const guchar uch = (guchar)i;
                  _keep_chars[i] = isalnum(uch) || isdigit(uch) || isspace(uch);
            }
            _inited = TRUE;
      }

      /* skip the leading noise */
      while (*in && isspace(*in))
            ++in;

      while (*in)
      {
            /* strip numerator out of multipart information */
            if ((strip_multipart||strip_numerator) && multipart && (*in=='('||*in=='[') && isdigit(in[1]))
            {
                  const guchar * start = ++in;

                  if (strip_multipart || strip_numerator)
                  {
                        while (*in && isdigit(*in))
                              ++in;

                        if (*in!='/' && *in!='|') /* oops, not multipart information */
                              in = start;

                        else if (strip_multipart)
                        {
                              for (++in; *in && *in!=']' && *in!=')'; ++in)
                              {
                                    if (isalpha(*in)) { /* oops, not multipart information */
                                          in = ++start;
                                          break;
                                    }
                              }
                        }
                  }
                  continue;
            }

            /* strip out junk that breaks sorting */
            if (_keep_chars[*in])
                  *out++ = (char) (strip_case ? tolower(*in) : *in);

            ++in;
      }

      *out = '\0';
      return out - buf;
}


/**
 * This Normalizes a group of articles in just two memory blocks.
 * These blocks will need to be g_free()d when the client is done with them.
 */
static void
normalize_articles (Article    ** articles,
                    gint          qty,
                    StripFlags    strip_flags,
                    Norm       ** alloc_and_setme_norm,
                    gchar      ** alloc_and_setme_str)
{
      gint i;
      glong str_buf_idx;
      glong len;
      gchar * str_buf;
      Norm * norm_buf;

      /* sanity clause */
      g_return_if_fail (articles!=NULL);
      g_return_if_fail (qty>0);
      g_return_if_fail (articles_are_valid ((const Article **)articles, qty));
      g_return_if_fail (alloc_and_setme_norm!=NULL);
      g_return_if_fail (alloc_and_setme_str!=NULL);

      /* alloc a buf for the norms */
      *alloc_and_setme_norm = norm_buf = g_new (Norm, qty);

      /* alloc a buf for the subject */
      len = 0;
      for (i=0; i<qty; ++i)
            len += articles[i]->subject.len + 2;
      *alloc_and_setme_str = str_buf = g_new (char, len);
      
      /* normalize the articles */
      str_buf_idx = 0;
      for (i=0; i<qty; ++i) {
            Article * a = articles[i];
            norm_buf[i].a = a;
            norm_buf[i].is_reply = skip_reply_leader (a->subject.str) != a->subject.str ? 1 : 0;
            norm_buf[i].subject = str_buf + str_buf_idx;
            norm_buf[i].subject_len = normalize_subject (norm_buf[i].subject, a, strip_flags);
            str_buf_idx += norm_buf[i].subject_len + 1;
      }
}


static int
compare_pA_to_pA_by_part (const void * va, const void * vb)
{
      int             value;
      const Article * a = (const Article *)va;
      const Article * b = (const Article *)vb;
      gboolean        a_is_reply;
      gboolean        b_is_reply;

      /* already normalized, so check multipart first */
      if (a->parts!=0 && (a->parts == b->parts))
            return a->part - b->part;

      /* order by subject */
      if ((value = pstring_compare (&a->subject, &b->subject)))
            return value;

      /* the rest is probably unnecessary. just carry-over from
       * compare_pA_to_pA_by_subject */

      /* if one but not both is a reply, the reply goes second */
      a_is_reply = skip_reply_leader (a->subject.str) != a->subject.str ? 1 : 0;
      b_is_reply = skip_reply_leader (b->subject.str) != b->subject.str ? 1 : 0;
      if (a_is_reply != b_is_reply)
            return a_is_reply ? 1 : -1;

      /* oldest goes first... */
      if (a->date < b->date)
            value = -1;
      else if (a->date > b->date)
            value = 1;
      else
            value = 0;
      return value;
}

static int
compare_pN_to_pN_by_subject (const void * va, const void * vb)
{
      register int value;
      register const Norm * a = (const Norm *)va;
      register const Norm * b = (const Norm *)vb;

      /* subject is the primary key, of course... */
      if ((value = *a->subject - *b->subject))
            return value;
      if ((value = pan_strcmp_len (a->subject, a->subject_len, b->subject, b->subject_len)))
            return value;

      /* if one but not both is a reply, the reply goes second */
      if (a->is_reply != b->is_reply)
            return a->is_reply ? 1 : -1;

      /* check multipart */
      if ((value = a->a->part - b->a->part))
            return value;

      /* oldest goes first... */
      if (a->a->date < b->a->date)
            value = -1;
      else if (a->a->date > b->a->date)
            value = 1;
      else
            value = 0;
      return value;
}

static int
compare_ppA_to_ppA_by_score (const void* va, const void* vb)
{
        register const Article * a = *(const Article**)va;
        register const Article * b = *(const Article**)vb;
      return a->score - b->score;
}

static gulong
article_get_combined_linecount (const Article * a)
{
      gulong retval = 0;

      g_return_val_if_fail (article_is_valid(a), FALSE);

      retval = a->linecount;
      if (a->part!=0 && a->parts!=0 && a->threads!=NULL) {
            GSList * l;
            int part = a->part + 1;
            for (l=a->threads; l; l=l->next) {
                  Article * child = ARTICLE(l->data);
                  if (child->part == part) {
                        retval += child->linecount;
                        ++part;
                  }
            }
      }

      return retval;
}

static int
compare_ppA_to_ppA_by_linecount (const void* va, const void* vb)
{
        register const Article * a = *(const Article**)va;
        register const Article * b = *(const Article**)vb;
      return article_get_combined_linecount(a) - article_get_combined_linecount(b);
}

static int
compare_ppA_to_ppA_by_action (const void * va, const void * vb)
{
      register const Article * a = *(const Article **)va;
      register const Article * b = *(const Article **)vb;
      int ia, ib;

      ia = article_get_decode_state (a);
      ib = article_get_decode_state (b);
      if (ia != ib)
            return ib - ia;

      ia = acache_has_message (group_get_acache_key(a->group), &a->message_id) ? 1 : 0;
      ib = acache_has_message (group_get_acache_key(b->group), &b->message_id) ? 1 : 0;
      if (ia != ib)
            return ib - ia;

      return 0;
}

static int
compare_ppA_to_ppA_by_read (const void * va, const void * vb)
{
      register const Article * a = *(const Article **)va;
      register const Article * b = *(const Article **)vb;
      const gboolean a_is_read = article_is_read (a);
      const gboolean b_is_read = article_is_read (b);
      const gboolean a_is_new = article_is_new (a);
      const gboolean b_is_new = article_is_new (b);
      gint ia, ib;

      ia = (a_is_new?1:0) + a->new_children;
      ib = (b_is_new?1:0) + b->new_children;
      if (ia != ib)
            return ib - ia;

      ia = (a_is_read?0:1) + a->unread_children;
      ib = (b_is_read?0:1) + b->unread_children;
      if (ia != ib)
            return ib - ia;

      ia = a_is_read ? 0 : 1;
      ib = b_is_read ? 0 : 1;
      if (ia != ib)
            return ib - ia;

      ia = article_get_multipart_state (a);
      ib = article_get_multipart_state (b);
      if (ia != ib)
            return ib - ia;

      return 0;
}


static int
compare_ppA_to_ppA_by_date (const void* va, const void* vb)
{
      gint value;
      time_t date_a = (*(const Article**)va)->date;
      time_t date_b = (*(const Article**)vb)->date;

      if (date_a < date_b)
            value = -1;
      else if (date_a > date_b)
            value = 1;
      else
            value = 0;
      return value;
}

static int
compare_ppA_to_ppA_by_message_id (const void* va, const void* vb)
{
      register const Article * a = *(const Article **)va;
      register const Article * b = *(const Article **)vb;
      return pstring_compare (&a->message_id, &b->message_id);
}

typedef struct
{
      char name[128];
      int name_len;
      Article * article;
}
ArticleStruct;

static int
compare_pAS_to_pAS_by_data (const void * va, const void * vb)
{
      const ArticleStruct * a = (const ArticleStruct*)va;
      const ArticleStruct * b = (const ArticleStruct*)vb;
      return pan_strcmp_len (a->name, a->name_len, b->name, b->name_len);
}

void
sort_articles (Article      ** buf,
               size_t          article_qty,
               int             sort_type,
               gboolean        ascending)
{
      g_return_if_fail (articles_are_valid ((const Article **)buf, article_qty));

      if (!article_qty)
            return;

      switch (sort_type)
      {
            case ARTICLE_SORT_AUTHOR:
            {
                  size_t i;
                  ArticleStruct * as = g_new (ArticleStruct, article_qty);
                  for (i=0; i<article_qty; ++i)
                  {
                        as[i].article = buf[i];
                        as[i].name_len = article_get_short_author_str (buf[i], as[i].name, sizeof(as[i].name));
                        g_strdown (as[i].name);
                  }
                  msort (as,
                         article_qty,
                         sizeof(ArticleStruct),
                         compare_pAS_to_pAS_by_data);
                  for (i=0; i<article_qty; ++i)
                        buf[i] = as[i].article;
                  g_free (as);
                  break;
            }
            case ARTICLE_SORT_LINES:
            {
                  msort (buf, article_qty, sizeof(Article*), compare_ppA_to_ppA_by_linecount);
                  break;
            }
            case ARTICLE_SORT_DATE:
            {
                  msort (buf, article_qty, sizeof(Article*), compare_ppA_to_ppA_by_date);
                  break;
            }
            case ARTICLE_SORT_MSG_ID:
            {
                  msort (buf, article_qty, sizeof(Article*), compare_ppA_to_ppA_by_message_id);
                  break;
            }
            case ARTICLE_SORT_ACTION_STATE:
            {
                  msort (buf, article_qty, sizeof(Article*), compare_ppA_to_ppA_by_action);
                  break;
            }
            case ARTICLE_SORT_READ_STATE:
            {
                  msort (buf, article_qty, sizeof(Article*), compare_ppA_to_ppA_by_read);
                  break;
            }
            case ARTICLE_SORT_SCORE:
            {
                  msort (buf, article_qty, sizeof(Article*), compare_ppA_to_ppA_by_score);
                  break;
            }
            case ARTICLE_SORT_SUBJECT:
            default:
            {
                  gint i;
                  Norm * norm_buf = NULL;
                  char * str_buf = NULL;
                  normalize_articles (buf, article_qty, STRIP_CASE, &norm_buf, &str_buf);
                  msort (norm_buf, article_qty, sizeof(Norm), compare_pN_to_pN_by_subject);
                  for (i=0; i<article_qty; ++i)
                        buf[i] = ARTICLE(norm_buf[i].a);
                  g_free (norm_buf);
                  g_free (str_buf);
            }
      }

      /* if not ascending, reverse the order */
      if (!ascending) {
            const size_t mid = article_qty/2;
            size_t i;
            for (i=0; i!=mid; ++i) { /* swap */
                  Article * tmp = buf[i];
                  buf[i] = buf[article_qty-1-i];
                  buf[article_qty-1-i] = tmp;
            }
      }
}


static gboolean
is_child_of (const Article * child,
             const Article * parent)
{
      g_return_val_if_fail (child!=NULL, FALSE);
      g_return_val_if_fail (parent!=NULL, FALSE);

      for (;;)
      {
            if (!child)
                  return FALSE;
            if (child == parent)
                  return TRUE;
            child = child->parent;
      }
}

static int
article_root_get_part_state (const Article * a)
{
      int retval;

      /* sanity clause */
      g_return_val_if_fail (a!=NULL, 0u);
      g_return_val_if_fail (a->parent==NULL, 0u);

      /* not a multipart */
      if (a->parts<1)
            retval = MULTIPART_STATE_NONE;

      /* someone's posted a "00/124" nfo message */
      else if (a->part==0)
            retval = MULTIPART_STATE_NONE;

      /* incomplete multipart */
      else if (a->part>1)
            retval = MULTIPART_STATE_SOME;

      /* someone's posted a followup to a multipart */
      else if (a->linecount<250 && !g_strncasecmp(a->subject.str,"Re:", 3))
            retval = MULTIPART_STATE_NONE;

      /* a multipart */
      else {
            GSList * l;
            gint part = a->part + 1;
            for (l=a->threads; l!=NULL; l=l->next)
                  if (ARTICLE(l->data)->parts==a->parts && ARTICLE(l->data)->part==part)
                        ++part;
            retval = part==a->parts+1 ? MULTIPART_STATE_ALL : MULTIPART_STATE_SOME;
      }

      return retval;
}

static void
set_children_part_state (Article * a, const int state)
{
      gint part;
      GSList * l;

      a->multipart_state = state;

      part = a->part + 1;
      for (l=a->threads; l!=NULL; l=l->next) {
            Article * child = ARTICLE(l->data);
            if (child->parts==a->parts && child->part==part) {
                  child->multipart_state = state;
                  ++part;
            }
      }
}

/**
 * Thread the articles specified in list
 */
void
thread_articles (Article   ** articles,
                 guint        article_qty)
{
      guint i;
      Article search_a;
      gchar * norm_str_buf;
      Norm * norm;
      Norm * sorted_norm;
      GArray * buf = NULL;
      GHashTable * mid_to_article;

      /* sanity clause */
      g_return_if_fail (articles!=NULL);
      g_return_if_fail (articles_are_valid ((const Article**)articles, article_qty));

      if (article_qty<1 || !articles)
            return;

      if (break_thread_when_subject_changes)
            buf = g_array_new (FALSE, FALSE, 1);

      /* make a plausiably-legal article */
      search_a.number = 1;
      search_a.subject = pstring_shallow ("dummy subject", -1);
      search_a.references = PSTRING_INIT;

      /* make a message-id-sorted array of the articles */
      mid_to_article = g_hash_table_new (pstring_hash, pstring_equal);
      for (i=0; i<article_qty; ++i)
            g_hash_table_insert (mid_to_article, &articles[i]->message_id, articles[i]);

      /* normalize the articles */
      norm = NULL;
      norm_str_buf = NULL;
      normalize_articles (articles, article_qty, STRIP_MULTIPART_NUMERATOR, &norm, &norm_str_buf);

      /* sort the normalized articles */
      sorted_norm = g_memdup (norm, sizeof(Norm)*article_qty);
      qsort (sorted_norm, article_qty, sizeof(Norm), compare_pN_to_pN_by_subject);

      /* unthread the articles, just in case they were threaded before */
      for (i=0; i!=article_qty; ++i)
      {
            Article * a = articles[i];
            a->parent = NULL;
            g_slist_free (a->threads);
            a->threads = NULL;
      }


      /* thread the articles */
      for (i=0; i!=article_qty; ++i)
      {
            PString references;
            Article * parent = NULL;
            Article * a = articles[i];
            gint index = -1;

            /* thread by reference.
               (except for parts 2...n of multiparts, which need to be threaded by multipart).
               We take the references header and work our way from the last message-id backwards
               to the first.  For each message-id in this series, the first one we find in our
               set of articles is flagged as the parent. */
            references = a->references;
            if (a->parts<2 && pstring_is_set(&a->references))
            {
                  /* pull the last message-id from the references string. */
                  const char * cpch = pan_strrchr_len (references.str, references.len, '<');
                  PString message_id = pstring_substr_shallow (&references, cpch, NULL);

                  while (parent==NULL && pstring_is_set (&message_id))
                  {
                        /* look for a match in our message-id hashtable */
                        Article * match = (Article*) g_hash_table_lookup (mid_to_article, &message_id);

                        /* if we found the ancestor & it's worthy, thread it */
                        if (match!=NULL && !is_child_of(match,a))
                        {
                              gboolean subject_changed = FALSE;

                              if (break_thread_when_subject_changes)
                              {
                                    int buf_len;
                                    g_array_set_size (buf, match->subject.len + 1);
                                    buf_len = normalize_subject (buf->data, match, STRIP_MULTIPART_NUMERATOR);
                                    subject_changed = pan_strcmp_len (buf->data, buf_len, norm[i].subject, norm[i].subject_len);
                              }

                              if (!subject_changed)
                                    parent = match;
                        }

                        /* if we couldn't find the ancestor, march up the References string */
                        references.len -= message_id.len;
                        references = pstring_strstrip_shallow (&references);
                        cpch = pan_strrchr_len (references.str, references.len, '<');
                        message_id = pstring_substr_shallow (&references, cpch, NULL);
                  }
            }

            /* thread by multipart */
            if (!parent && a->parts>1 && a->part>1)
            {
                  Norm n = norm[i];
                  search_a.part = 1;
                  search_a.date = 0; /* unlikely to get an exact match.. :) */
                  n.a = &search_a;

                  index = lower_bound (&n,
                                       sorted_norm,
                                       article_qty,
                                       sizeof(Norm),
                                       compare_pN_to_pN_by_subject,
                                       NULL);

                  if (0<=index && index<article_qty)
                  {
                        Norm * match = &sorted_norm[index];
                        if ((match->a != a)
                              && (match->a->parts == a->parts)
                              && (!pan_strcmp_len(match->subject,match->subject_len,n.subject,n.subject_len))
                              && (!is_child_of(match->a,a)))
                        {
                              parent = match->a;
                        }
                  }
            }

            /* thread by subject */
            if (!parent && skip_reply_leader(a->subject.str)!=a->subject.str)
            {
                  Norm n = norm[i];
                  search_a.part = 0;
                  search_a.date = 0; /* unlikely to get an exact match.. :) */
                  n.a = &search_a;

                  index = lower_bound (
                        &n,
                        sorted_norm,
                        article_qty,
                        sizeof(Norm),
                        compare_pN_to_pN_by_subject,
                        NULL);

                  if (0<=index && index<article_qty && !is_child_of(sorted_norm[index].a,a))
                  {
                        Norm * match = &sorted_norm[index];

                        if (!pan_strcmp_len (match->subject, match->subject_len, n.subject, n.subject_len))
                        {
                              /* 1 original, 1 reply */
                              parent = match->a;
                        }
                        else if (!pan_strcmp_len (match->subject, match->subject_len, a->subject.str, a->subject.len) && (match->a->date<a->date))
                        {
                              /* 2 replies, no top --  oldest on top */
                              parent = match->a;
                        }
                  }
            }

            if (parent != NULL) /* this article has a parent */
            {
                  g_assert (!is_child_of(parent,a));

                  /* link the two articles */
                  a->parent = parent;
                  parent->threads = g_slist_prepend (parent->threads, norm[i].a);
            }
      }

      /* right now all the children are normalized; point to articles */
      for (i=0; i!=article_qty; ++i) {
            Article * a = articles[i];
            a->threads = g_slist_sort (a->threads, compare_pA_to_pA_by_part);
      }

      /**
      ***  Calculate multipart states
      **/

      for (i=0; i!=article_qty; ++i) {
            Article * a = articles[i];
            a->multipart_state = MULTIPART_STATE_NONE;
      }
      for (i=0; i!=article_qty; ++i) {
            Article * a = articles[i];
            if (a->parent == NULL)
                  set_children_part_state (a, article_root_get_part_state(a));
      }

      /* cleanup */
      g_hash_table_destroy (mid_to_article);
      g_free (norm);
      g_free (norm_str_buf);
      g_free (sorted_norm);
      if (buf != NULL)
            g_array_free (buf, TRUE);
}

Generated by  Doxygen 1.6.0   Back to index