C# Generic Inverted Index

Inverted Index

Introduction

This is just a quick inverted index I wrote for to generate an index over an XML product feed I was working on. It should work for nearly any type providing that you create a corresponding classifier to extract the indices from the entry.

Background information

An inverted index is a data structure designed to aid the location of records containing a particular item of data. Wikipedia as a good explaination of inverted indexes, as normal.

IClassifier

In order to extract the indices from the from each entry in the index I created something I’ve termed a classifier, these implement the IClassifier interface and optionally the AbstractClassifier abstract class (in case you need any of the functionality I intend to add to the abstract class).

WordClassifier

This is just an example implementation; that extracts all the different words used in the entry to use as indices. This just removes any non-alphanumeric characters, then splits the string into an array using any whitespace as a delimiter, finally we de-duplicate the words in the array.

	   1:  /// <summary>
	   2:  /// An IClassifier extracts the values to use as indices in the index from each entry. 
	   3:  /// </summary>
	   4:  /// <typeparam name="T"></typeparam>
	   5:  /// <typeparam name="X"></typeparam>
	   6:  public interface IClassifier<T, X> {
	   7:  List<X> Classify( T Subject );
	   8:  }
	   9:   
	  10:  /// <summary>
	  11:  /// The abstract classifier will contain any common methods for use in the classifiers.
	  12:  /// </summary>
	  13:  public abstract class AbstractClassifier {
	  14:   
	  15:  }

WordClassifier

This is just an example implementation; that extracts all the different words used in the entry to use as indices. This just removes any non-alphanumeric characters, then splits the string into an array using any whitespace as a delimiter. Finishing with a final de-duplicating of words in the array.

	  17:      /// <summary>
	  18:      /// This classifier builds a list of indices for the index using each distinct word in the entry.
	  19:      /// </summary>
	  20:      public class WordClassifier : AbstractClassifier, IClassifier<string, string> {
	  21:   
	  22:          public List<string> Classify( string Subject ) {
	  23:              List<string> ListOfWords = new List<string>();
	  24:   
	  25:              Subject = RemoveNonAlphaChars( Subject );
	  26:   
	  27:              string[] words = Subject.Split( ' ', '    ', '\n' );
	  28:              foreach ( string word in words ) {
	  29:                  if ( !ListOfWords.Contains( word ) ) {
	  30:                      ListOfWords.Add( word );
	  31:                  }
	  32:              }
	  33:              return ListOfWords;
	  34:          }
	  35:   
	  36:          private string RemoveNonAlphaChars( string rawstring ) {
	  37:              StringBuilder sb = new StringBuilder();
	  38:   
	  39:              foreach ( char c in rawstring.ToCharArray() ) {
	  40:                  if ( char.IsLetter( c ) || char.IsSeparator( c ) ) {
	  41:                      sb.Append( c );
	  42:                  }
	  43:              }
	  44:   
	  45:              return sb.ToString();
	  46:          }
	  47:      }

All the code together

	   1:      /// <summary>
	   2:      /// An IClassifier extracts the values to use as indices in the index from each entry. 
	   3:      /// </summary>
	   4:      /// <typeparam name="T"></typeparam>
	   5:      /// <typeparam name="X"></typeparam>
	   6:      public interface IClassifier<T, X> {
	   7:          List<X> Classify( T Subject );
	   8:      }
	   9:   
	  10:      /// <summary>
	  11:      /// The abstract classifier will contain any common methods for use in the classifiers.
	  12:      /// </summary>
	  13:      public abstract class AbstractClassifier {
	  14:   
	  15:      }
	  16:   
	  17:      /// <summary>
	  18:      /// This classifier builds a list of indices for the index using each distinct word in the entry.
	  19:      /// </summary>
	  20:      public class WordClassifier : AbstractClassifier, IClassifier<string, string> {
	  21:   
	  22:          public List<string> Classify( string Subject ) {
	  23:              List<string> ListOfWords = new List<string>();
	  24:   
	  25:              Subject = RemoveNonAlphaChars( Subject );
	  26:   
	  27:              string[] words = Subject.Split( ' ', '    ', '\n' );
	  28:              foreach ( string word in words ) {
	  29:                  if ( !ListOfWords.Contains( word ) ) {
	  30:                      ListOfWords.Add( word );
	  31:                  }
	  32:              }
	  33:              return ListOfWords;
	  34:          }
	  35:   
	  36:          private string RemoveNonAlphaChars( string rawstring ) {
	  37:              StringBuilder sb = new StringBuilder();
	  38:   
	  39:              foreach ( char c in rawstring.ToCharArray() ) {
	  40:                  if ( char.IsLetter( c ) || char.IsSeparator( c ) ) {
	  41:                      sb.Append( c );
	  42:                  }
	  43:              }
	  44:   
	  45:              return sb.ToString();
	  46:          }
	  47:      }
	  48:   
	  49:      /// <summary>
	  50:      /// An inverted index of entries, indexed by X
	  51:      /// </summary>
	  52:      /// <typeparam name="T">Entry Stored in Index</typeparam>
	  53:      /// <typeparam name="X">Type Entries indexed by</typeparam>
	  54:      public class InvertedIndex<T, X> {
	  55:          Dictionary<X, List<T>> _Index = new Dictionary<X, List<T>>();
	  56:          IClassifier<T, X> _Classifier;
	  57:   
	  58:          public InvertedIndex( IClassifier<T, X> Classifier ) {
	  59:              _Classifier = Classifier;
	  60:          }
	  61:   
	  62:          public virtual void AddEntry( T Item ) {
	  63:              List<X> things = _Classifier.Classify( Item );
	  64:              InsertIntoIndex( things, Item, _Index );
	  65:          }
	  66:   
	  67:          public virtual List<T> GetEntriesForIndex( X Indexer ) {
	  68:              if ( _Index.ContainsKey( Indexer ) ) {
	  69:                  return _Index[Indexer];
	  70:              } else {
	  71:                  return null;
	  72:              }
	  73:          }
	  74:          
	  75:   
	  76:          private void InsertIntoIndex( List<X> things, T Item, Dictionary<X, List<T>> _Index ) {
	  77:              foreach ( X thing in things ) {
	  78:                  if ( !_Index.ContainsKey( thing ) ) {
	  79:                      List<T> newList = new List<T>();
	  80:                      newList.Add( Item );
	  81:                      _Index.Add( thing, newList );
	  82:                  } else {
	  83:                      _Index[thing].Add( Item );
	  84:                  }
	  85:              }
	  86:          }
	  87:      }

code formatted by http://manoli.net/csharpformat/


Site Map | Printable View | © 2008 - 2014 Benjamin King | Powered by mojoPortal | HTML 5 | CSS | Design by styleshout