SearchFieldAnalyzer.java

  1. /*
  2.  * This file is part of dependency-check-core.
  3.  *
  4.  * Licensed under the Apache License, Version 2.0 (the "License");
  5.  * you may not use this file except in compliance with the License.
  6.  * You may obtain a copy of the License at
  7.  *
  8.  *     http://www.apache.org/licenses/LICENSE-2.0
  9.  *
  10.  * Unless required by applicable law or agreed to in writing, software
  11.  * distributed under the License is distributed on an "AS IS" BASIS,
  12.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13.  * See the License for the specific language governing permissions and
  14.  * limitations under the License.
  15.  *
  16.  * Copyright (c) 2012 Jeremy Long. All Rights Reserved.
  17.  */
  18. package org.owasp.dependencycheck.data.lucene;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;

/**
 * A Lucene field analyzer used to analyze queries against the CPE data.
 *
 * @author Jeremy Long
 */
  34. public class SearchFieldAnalyzer extends Analyzer {

  35.     /**
  36.      * The list of additional stop words to use.
  37.      */
  38.     private static final String[] ADDITIONAL_STOP_WORDS = {"software", "framework", "inc",
  39.         "com", "org", "net", "www", "consulting", "ltd", "foundation", "project"};
  40.     /**
  41.      * The set of stop words to use in the analyzer.
  42.      */
  43.     private final CharArraySet stopWords;
  44.     /**
  45.      * A reference to the concatenating filter so that it can be reset/cleared.
  46.      */
  47.     private TokenPairConcatenatingFilter concatenatingFilter;

  48.     /**
  49.      * Returns the set of stop words being used.
  50.      *
  51.      * @return the set of stop words being used
  52.      */
  53.     public static CharArraySet getStopWords() {
  54.         final CharArraySet words = StopFilter.makeStopSet(ADDITIONAL_STOP_WORDS, true);
  55.         words.addAll(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
  56.         return words;
  57.     }

  58.     /**
  59.      * Constructs a new SearchFieldAnalyzer.
  60.      *
  61.      */
  62.     public SearchFieldAnalyzer() {
  63.         stopWords = getStopWords();
  64.     }

  65.     /**
  66.      * Creates a the TokenStreamComponents used to analyze the stream.
  67.      *
  68.      * @param fieldName the field that this lucene analyzer will process
  69.      * @return the token stream filter chain
  70.      */
  71.     @Override
  72.     protected TokenStreamComponents createComponents(String fieldName) {
  73.         //final Tokenizer source = new AlphaNumericTokenizer();
  74.         final Tokenizer source = new WhitespaceTokenizer();
  75.         TokenStream stream = source;

  76.         stream = new UrlTokenizingFilter(stream);
  77.         stream = new AlphaNumericFilter(stream);
  78.         stream = new WordDelimiterGraphFilter(stream,
  79.                 WordDelimiterGraphFilter.GENERATE_WORD_PARTS
  80.                 //| WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
  81.                 | WordDelimiterGraphFilter.PRESERVE_ORIGINAL
  82.                 | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
  83.                 | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
  84.                 | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null);

  85.         stream = new LowerCaseFilter(stream);

  86.         stream = new StopFilter(stream, stopWords);
  87.         concatenatingFilter = new TokenPairConcatenatingFilter(stream);

  88.         return new TokenStreamComponents(source, concatenatingFilter);
  89.     }

  90.     /**
  91.      * Resets the analyzer. This must be manually called between searching and
  92.      * indexing.
  93.      *
  94.      * @throws IOException thrown if there is an error resetting the tokenizer
  95.      */
  96.     public void reset() throws IOException {
  97.         if (concatenatingFilter != null) {
  98.             concatenatingFilter.clear();
  99.         }
  100.     }
  101. }